# Data Engineering
This notebook collects short, runnable examples related to data engineering: file handling, error handling, core Python features, pandas, NumPy, and PySpark.

# File Handling Examples
## Text File Handling

In [None]:
# Write a short message to a text file, then read it back and print it.
# The encoding is given explicitly on both sides so the round trip does not
# depend on the platform's default locale encoding.
with open('example.txt', 'w', encoding='utf-8') as file:
    file.write('Hello, world!')

with open('example.txt', 'r', encoding='utf-8') as file:
    content = file.read()
    print(content)

## JSON File Handling

In [None]:
import json

# Sample record that is serialized to disk and then loaded back.
example_data = {
    'name': 'John',
    'age': 30,
    'city': 'New York',
}

# Serialize the record to a JSON document and store it.
with open('example.json', 'w') as json_file:
    json_file.write(json.dumps(example_data))

# Parse the stored document back into a Python object and show it.
with open('example.json', 'r') as json_file:
    data = json.loads(json_file.read())
    print(data)

## CSV File Handling

In [None]:
import csv

# Column names and rows written to example.csv.
header = ['Name', 'Age', 'City']
data = [['John', 30, 'New York'], ['Anna', 22, 'London'], ['Mike', 32, 'San Francisco']]

# newline='' hands line-ending control to the csv module; the explicit
# encoding keeps the file independent of the platform's default encoding.
with open('example.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    writer.writerows(data)

# The csv module asks for newline='' on the reading side as well, so that
# newlines embedded in quoted fields are interpreted correctly.
with open('example.csv', 'r', newline='', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        # Every field comes back as a string, including the ages.
        print(row)

# Error and Exception Handling

In [None]:
# Dividing by zero raises ZeroDivisionError; the handler reports it and the
# finally clause runs afterwards regardless of the outcome.
try:
    result = 10 / 0
except ZeroDivisionError as exc:
    print(f"Error: {exc}")
finally:
    print("This will always execute.")

# Practical Application, OOP, Decorators, and Generators
## Object-Oriented Programming

In [None]:
class Dog:
    """A dog described by a name and an age."""

    def __init__(self, name, age):
        """Store the given name and age on the instance."""
        self.name, self.age = name, age

    def bark(self):
        """Return the sentence announcing this dog's bark."""
        sound = f"{self.name} says woof!"
        return sound


# Build one dog and print what it says.
my_dog = Dog(name='Buddy', age=3)
print(my_dog.bark())

## Decorators

In [None]:
def my_decorator(func):
    """Wrap ``func`` so a message is printed before and after every call.

    Args:
        func: The callable to wrap. It may take any positional and keyword
            arguments.

    Returns:
        A wrapper that forwards its arguments to ``func``, returns whatever
        ``func`` returns, and keeps ``func``'s name and docstring.
    """
    import functools  # imported here so this cell does not rely on another cell

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print("Something is happening before the function is called.")
        result = func(*args, **kwargs)
        print("Something is happening after the function is called.")
        # Hand the wrapped function's result back instead of discarding it.
        return result
    return wrapper


@my_decorator
def say_hello():
    """Print a greeting."""
    print("Hello!")


say_hello()

## Generators

In [None]:
def my_generator():
    """Yield the integers 1, 2 and 3, in that order."""
    yield from (1, 2, 3)


# Consume the generator, printing each value as it is produced.
numbers = my_generator()
for value in numbers:
    print(value)

# Introduction to Data Engineering

In [None]:
# Print a fixed message for the introduction section.
print("This is a data engineering example.")

# Pandas for Data Analysis

In [None]:
import pandas as pd

# Column-oriented input: each key becomes a column, each list its values.
data = {
    'Name': ['John', 'Anna', 'Mike'],
    'Age': [30, 22, 32],
    'City': ['New York', 'London', 'San Francisco'],
}
df = pd.DataFrame(data)
print(df)

# Additional Pandas Examples
## Creating DataFrame from a Dictionary

In [None]:
import pandas as pd

# One dictionary entry per column; all lists have the same length.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['Paris', 'Berlin', 'London'],
}
df = pd.DataFrame(data)
print(df)

## Reading a CSV File

In [None]:
# Load the CSV file into a DataFrame and print it.
csv_path = 'example.csv'
csv_df = pd.read_csv(csv_path)
print(csv_df)

## DataFrame Operations

In [None]:
# Printed in turn: the first 5 rows, the summary statistics, and the
# 'Name' column selected on its own.
for view in (df.head(), df.describe(), df['Name']):
    print(view)

## Grouping and Aggregation

In [None]:
import pandas as pd

# 'John' appears twice so the grouping has something to combine.
data = {'Name': ['John', 'Anna', 'Mike', 'John'], 'Age': [30, 22, 32, 30], 'City': ['New York', 'London', 'San Francisco', 'New York']}
df = pd.DataFrame(data)

# 'City' holds strings, so averaging every column raises TypeError on
# pandas >= 2.0. numeric_only=True restricts the mean to numeric columns
# (here only 'Age') and still returns a DataFrame indexed by 'Name'.
age_by_name = df.groupby('Name').mean(numeric_only=True)
print(age_by_name)

## Merging DataFrames

In [None]:
# Two frames that share the 'key' column; only keys 'B' and 'D' occur in both.
left = pd.DataFrame({
    'key': ['A', 'B', 'C', 'D'],
    'value': [1, 2, 3, 4],
})
right = pd.DataFrame({
    'key': ['B', 'D', 'E', 'F'],
    'value': [5, 6, 7, 8],
})

# Inner join on 'key': rows whose key is missing from either side are dropped.
merged_df = left.merge(right, on='key', how='inner')
print(merged_df)

# Additional Pandas Functions
## Handling Missing Data

In [None]:
import pandas as pd

# One age and one city are deliberately missing (None).
data = {
    'Name': ['John', 'Anna', 'Mike', 'Sara'],
    'Age': [30, 22, None, 28],
    'City': ['New York', 'London', 'San Francisco', None],
}
df = pd.DataFrame(data)
print(df)

# Fill missing values: the column mean for 'Age', a placeholder for 'City'
mean_age = df['Age'].mean()
df_filled = df.fillna(value={'Age': mean_age, 'City': 'Unknown'})
print(df_filled)

# Drop rows with missing values (any missing cell removes the whole row)
df_dropped = df.dropna(axis=0, how='any')
print(df_dropped)

## Sorting Data

In [None]:
import pandas as pd

data = {
    'Name': ['John', 'Anna', 'Mike', 'Sara'],
    'Age': [30, 22, 32, 28],
    'City': ['New York', 'London', 'San Francisco', 'Paris'],
}
df = pd.DataFrame(data)
print(df)

# Sort by Age (ascending is the default)
df_sorted = df.sort_values('Age')
print(df_sorted)

# Sort by Name in descending order
df_sorted_desc = df.sort_values('Name', ascending=False)
print(df_sorted_desc)

# NumPy for Numerical Computing

In [None]:
import numpy as np

# Build a one-dimensional array from a Python list and print it.
values = [1, 2, 3, 4, 5]
array = np.array(values)
print(array)

# Additional NumPy Examples
## Creating Arrays

In [None]:
import numpy as np

# A flat list gives a 1-D array; a list of equal-length lists gives a 2-D array.
array_1d = np.array([1, 2, 3, 4, 5])
array_2d = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

for arr in (array_1d, array_2d):
    print(arr)

## Array Operations

In [None]:
# Reductions over all elements, written with the array methods.
print(array_1d.mean())  # Mean
print(array_2d.sum())  # Sum

# PySpark Basics

In [None]:
from pyspark.sql import SparkSession

# Obtain a SparkSession named 'example' (getOrCreate returns an existing one if present).
spark = (
    SparkSession.builder
    .appName('example')
    .getOrCreate()
)

# Rows are (Name, Age) tuples; the column names are supplied separately.
data = [('John', 30), ('Anna', 22), ('Mike', 32)]
columns = ['Name', 'Age']
df = spark.createDataFrame(data, schema=columns)
df.show()

# Additional PySpark Examples
## Creating a Spark DataFrame

In [None]:
from pyspark.sql import SparkSession

# Obtain a SparkSession named 'example' (getOrCreate returns an existing one if present).
builder = SparkSession.builder.appName('example')
spark = builder.getOrCreate()

# Each tuple is one row; `columns` names the tuple positions.
data = [
    ('Alice', 25),
    ('Bob', 30),
    ('Charlie', 35),
]
columns = ['Name', 'Age']
df = spark.createDataFrame(data, schema=columns)
df.show()

## DataFrame Operations

In [None]:
# Report the size and the schema names of the current Spark DataFrame.
row_count = df.count()
print(row_count)  # Number of rows
print(df.columns)  # Column names

## Filtering Data

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('example').getOrCreate()

data = [('John', 30), ('Anna', 22), ('Mike', 32)]
columns = ['Name', 'Age']
df = spark.createDataFrame(data, schema=columns)

# Keep only the rows whose Age is strictly greater than 25.
is_over_25 = df['Age'] > 25
df_filtered = df.filter(is_over_25)
df_filtered.show()

## Joining DataFrames

In [None]:
# Two Spark DataFrames that share the 'key' column; only key 'B' is in both.
join_columns = ['key', 'value']
left = spark.createDataFrame([('A', 1), ('B', 2), ('C', 3)], join_columns)
right = spark.createDataFrame([('B', 5), ('D', 6), ('E', 7)], join_columns)

# Inner join on 'key'. Both inputs carry a 'value' column, so the result
# contains two columns named 'value' (one from each side).
joined_df = left.join(right, on='key', how='inner')
joined_df.show()

# Additional PySpark Functions
## Grouping and Aggregation

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

spark = SparkSession.builder.appName('example').getOrCreate()

# 'John' appears twice so the grouping has something to combine.
data = [('John', 30), ('Anna', 22), ('Mike', 32), ('John', 30)]
columns = ['Name', 'Age']
df = spark.createDataFrame(data, schema=columns)

# Average Age per Name, exposed under the column name 'Average_Age'.
average_age = avg('Age').alias('Average_Age')
df_grouped = df.groupBy('Name').agg(average_age)
df_grouped.show()

# DataFrames and SQL Queries

In [None]:
from pyspark.sql import SQLContext

# Register the current DataFrame as a temporary view so SQL can refer to it by name.
df.createOrReplaceTempView('people')

# Run the query through the SparkSession itself. SQLContext is deprecated
# since Spark 3.0, and its first argument is a SparkContext rather than the
# SparkSession that was being passed to it.
sql_df = spark.sql('SELECT * FROM people')
sql_df.show()

# Additional DataFrames and SQL Queries Examples
## SQL Queries

In [None]:
from pyspark.sql import SQLContext

# Register the current DataFrame as a temporary view so SQL can refer to it by name.
df.createOrReplaceTempView('people')

# Run the query through the SparkSession itself. SQLContext is deprecated
# since Spark 3.0, and its first argument is a SparkContext rather than the
# SparkSession that was being passed to it.
sql_df = spark.sql('SELECT Name, Age FROM people WHERE Age > 25')
sql_df.show()