# This file documents the cleaning processes we will employ for our datasets

## Adapted from [DataCamp](https://www.datacamp.com/tutorial/guide-to-data-cleaning-in-python)

## 1. Import necessary libraries & modules

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## 2. Load the data

In [6]:
# Load the raw dataset into `df_for_cleaning`.
# The loaders below are ALTERNATIVES: each one assigns to the same name, so if
# the whole cell is run only the last assignment (the SQL query) survives.
# Keep the loader that matches your data source and delete the others.

# For CSV files:
df_for_cleaning = pd.read_csv('your_dataset.csv')

# For Excel files (reads the default sheet):
df_for_cleaning = pd.read_excel('your_dataset.xlsx')

# Excel: select a specific sheet by name
df_for_cleaning = pd.read_excel('your_dataset.xlsx', sheet_name='Sheet1')

# Excel: select a specific sheet by position (0 = first sheet)
df_for_cleaning = pd.read_excel('your_dataset.xlsx', sheet_name=0)

# For SQL, add the following to the list of imports:
from sqlalchemy import create_engine
# Create an engine for the MySQL database
# Replace 'user', 'password', 'host', 'database' with your actual database credentials
# NOTE(review): do not commit real credentials; consider reading them from
# environment variables or a config file kept out of version control.
engine = create_engine('mysql+pymysql://user:password@host/database')

# Use pandas to execute a SQL query and load the result into a DataFrame
df_for_cleaning = pd.read_sql('SELECT * FROM your_table', con=engine)

## 3. Exploring and Understanding the Data

In [19]:
# Explore the raw data before changing anything.
# Results are wrapped in print() because a notebook only displays the value of
# the LAST expression in a cell (and a plain script displays none), so bare
# expressions such as `df.head()` in the middle of a cell are silently lost.

# Column names, non-null counts, dtypes and memory usage
# (info() prints its report itself, so it is not wrapped in print())
df_for_cleaning.info()

# First rows of the dataframe
print(df_for_cleaning.head())

# Last rows of the dataframe
print(df_for_cleaning.tail())

# One randomly sampled row
print(df_for_cleaning.sample())

# Shape as (number of rows, number of columns)
print(df_for_cleaning.shape)

# Summary statistics of numerical columns
print(df_for_cleaning.describe())

# Number of missing values per column
print(df_for_cleaning.isnull().sum())

# Number of fully duplicated rows
print(df_for_cleaning.duplicated().sum())

# Visualize where the missing values are (one highlighted cell per missing value)
sns.heatmap(df_for_cleaning.isnull(), cbar=False, cmap='plasma')
plt.show()

## 4. Handling Missing Data
### Depending on the data we may need to: 
#### Remove missing data

In [None]:
# Strategies for dropping missing data.
# These are ALTERNATIVES, not a pipeline: pick the one(s) that suit the dataset.
# dropna() returns a new DataFrame and leaves the original untouched, so every
# result is assigned back to `df_for_cleaning` for the removal to take effect.

# Remove rows with any missing values
df_for_cleaning = df_for_cleaning.dropna()

# Remove rows in which every value is missing
df_for_cleaning = df_for_cleaning.dropna(how='all')

# Keep only rows that have at least 10 non-missing values
# (thresh counts NON-missing values; it must not exceed the number of columns,
# otherwise every row is dropped)
df_for_cleaning = df_for_cleaning.dropna(thresh=10)

# Remove rows that are missing a value in a specific column
df_for_cleaning = df_for_cleaning.dropna(subset=['column_name'])

# Remove columns in which every value is missing
df_for_cleaning = df_for_cleaning.dropna(axis=1, how='all')

# Keep only rows where less than 20% of the values are missing
threshold = 0.2
df_for_cleaning = df_for_cleaning[df_for_cleaning.isnull().mean(axis=1) < threshold]

#### Impute missing data, i.e. replace missing values with a calculated value (mean or median) or a specific constant.

In [20]:
# Strategies for imputing missing values in a single column.
# These are ALTERNATIVES: once one of them has filled the column, the
# following ones have nothing left to fill.

# Fill missing values with the mean of the column
df_for_cleaning['column_name'] = df_for_cleaning['column_name'].fillna(df_for_cleaning['column_name'].mean())

# Fill missing values with the median of the column
df_for_cleaning['column_name'] = df_for_cleaning['column_name'].fillna(df_for_cleaning['column_name'].median())

# Or compute the statistic first and reuse it.
# The result is written back to the SAME column with bracket syntax:
# attribute-style assignment (df.some_name = ...) does not reliably create or
# update a column.
col_mean = df_for_cleaning['column_name'].mean()
df_for_cleaning['column_name'] = df_for_cleaning['column_name'].fillna(col_mean)

# Fill missing values with a constant value
df_for_cleaning['column_name'] = df_for_cleaning['column_name'].fillna(0)

# Use forward fill (carry the previous valid value forward) or backward fill
# (use the next valid value). ffill()/bfill() replace the deprecated
# fillna(method='ffill') / fillna(method='bfill') spelling.
df_for_cleaning['column_name'] = df_for_cleaning['column_name'].ffill()
df_for_cleaning['column_name'] = df_for_cleaning['column_name'].bfill()

NameError: name 'df_for_cleaning' is not defined  (stale output: the cell above was run before the data was loaded; run section 2 first)

## 5. Handling Duplicates

In [None]:
# Report how many rows are exact duplicates of an earlier row.
# print() is needed because the value of a bare expression that is not the
# last line of a cell is discarded.
print(df_for_cleaning.duplicated().sum())

# Remove duplicate rows (the first occurrence of each row is kept)
df_for_cleaning = df_for_cleaning.drop_duplicates()

# Remove rows that are duplicates with respect to specific columns only
df_for_cleaning = df_for_cleaning.drop_duplicates(subset=['column1', 'column2'])

## 6. Data Type Conversion
### Ensure that columns are of the correct data type (e.g., converting numerical columns that are stored as strings).
#### Other formatting inconsistency corrections you may need to conduct include:

#### - Unit conversion
#### - Email, phone, and address standardization
#### - Removing punctuation from strings
#### - Using value mapping to address common abbreviations


In [None]:
# Convert columns to their proper dtypes. Each conversion overwrites the
# column in place with the converted values.

# Convert a column to numeric.
# errors='coerce' turns values that cannot be parsed into NaN instead of
# raising, so re-check missing values after this step.
df_for_cleaning['column_name'] = pd.to_numeric(df_for_cleaning['column_name'], errors='coerce')

# Convert a column to datetime.
# errors='coerce' turns unparseable values into NaT instead of raising.
df_for_cleaning['date_column'] = pd.to_datetime(df_for_cleaning['date_column'], errors='coerce')

# Convert a column to the pandas 'category' dtype
df_for_cleaning['category_column'] = df_for_cleaning['category_column'].astype('category')

## 7. Handling Outliers
### Outliers can either be removed or capped.
#### Identifying outliers:

In [None]:
# Use box plots or z-scores to detect outliers.

# Box plot to visualize outliers
sns.boxplot(x=df_for_cleaning['numerical_column'])

# Using Z-score to identify outliers (|z| > 3).
# nan_policy='omit' ignores missing values when computing the mean and the
# standard deviation. With the default ('propagate') a single NaN in the
# column makes EVERY z-score NaN, so no outlier would ever be flagged.
# NOTE(review): 'z_score' is a helper column added to the dataframe; drop it
# before saving if it should not appear in the cleaned output.
df_for_cleaning['z_score'] = zscore(df_for_cleaning['numerical_column'], nan_policy='omit')
outliers = df_for_cleaning[df_for_cleaning['z_score'].abs() > 3]

#### Removing Outliers:

In [None]:
# Remove rows where values are beyond 3 standard deviations from the mean.
# The filtered result is stored in a NEW dataframe, `df_cleaned`;
# `df_for_cleaning` itself is left unchanged.
# NOTE(review): the later cells shown here keep working on `df_for_cleaning`,
# so this removal does not reach the saved output unless `df_cleaned` is used
# downstream — confirm which is intended.
mean = df_for_cleaning['numerical_column'].mean()
std_dev = df_for_cleaning['numerical_column'].std()

# Keep rows whose absolute distance from the mean is at most 3 standard
# deviations. Rows with a missing value in this column fail the comparison
# and are therefore excluded as well.
df_cleaned = df_for_cleaning[(df_for_cleaning['numerical_column'] - mean).abs() <= 3 * std_dev]

#### Capping outliers to a specific threshold (e.g., 99th percentile)


In [None]:
# Cap the column at its 99th percentile: values above `upper_limit` are
# replaced by `upper_limit`. Only the upper tail is capped; low values are
# left as they are.
upper_limit = df_for_cleaning['numerical_column'].quantile(0.99)
df_for_cleaning['numerical_column'] = df_for_cleaning['numerical_column'].clip(upper=upper_limit)

## 8. Standardizing or Normalizing Data

In [None]:
# Z-score standardization: learn the column's mean and standard deviation,
# then rescale it so the result has mean 0 and unit variance.
# The scalers expect 2-D input, hence the double brackets.
scaler = StandardScaler()
scaler.fit(df_for_cleaning[['numerical_column']])
df_for_cleaning['scaled_column'] = scaler.transform(df_for_cleaning[['numerical_column']])

# Min-max normalization: learn the column's minimum and maximum, then rescale
# it linearly into the range 0-1.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(df_for_cleaning[['numerical_column']])
df_for_cleaning['normalized_column'] = min_max_scaler.transform(df_for_cleaning[['numerical_column']])

## 9. Checking for Inconsistencies


In [None]:
# Normalize the text of a categorical column in two steps:
# first lowercase every value ...
df_for_cleaning['category_column'] = df_for_cleaning['category_column'].str.lower()
# ... then trim leading and trailing whitespace.
df_for_cleaning['category_column'] = df_for_cleaning['category_column'].str.strip()

# Map known inconsistent spellings onto their canonical value
df_for_cleaning['category_column'] = df_for_cleaning['category_column'].replace(
    to_replace={'old_value': 'new_value'}
)

## 10. Final Review and Summary

In [17]:
# Final sanity checks on the cleaned data.

# Missing values per column after cleaning.
# print() is needed because the value of a bare expression that is not the
# last line of a cell is discarded.
print(df_for_cleaning.isnull().sum())

# Verify the data types again (info() prints its report itself)
df_for_cleaning.info()

# Visualize distributions of the cleaned data (one histogram per numeric column)
df_for_cleaning.hist(figsize=(12, 8))
plt.show()

## 11. Saving Cleaned Data

In [None]:
# Save the cleaned data to a new CSV file.
# The output directory is created first: to_csv() does not create missing
# parent directories and would fail if '2_cleaned_files' does not exist yet.
from pathlib import Path

output_path = Path('2_cleaned_files/cleaned_dataset.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the dataframe's row index out of the file
df_for_cleaning.to_csv(output_path, index=False)