# Data Analysis & Preprocessing

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## Load and Describe Data

In [3]:

def load_and_describe_data(file_name):
    data = pd.read_csv(file_name)
    print(data.describe())
    return data


## Handle Missing Data

In [14]:

def handle_missing_data(data):
    for col in data.columns:
        if data[col].dtype in [np.float64, np.int64]:
            data[col].fillna(data[col].mean(), inplace=True)
        else:
            data[col].fillna(data[col].mode()[0], inplace=True)
    return data



## Standardize Strings

In [15]:
def standardize_strings(data):
    if 'species' in data.columns:
        data['species'] = data['species'].str.capitalize()
    return data



## Handle Outliers

In [6]:
def handle_outliers(data):
    numeric_cols = data.select_dtypes(include=[np.float64, np.int64]).columns
    Q1 = data[numeric_cols].quantile(0.25)
    Q3 = data[numeric_cols].quantile(0.75)
    IQR = Q3 - Q1
    data = data[~((data[numeric_cols] < (Q1 - 1.5 * IQR)) | (data[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
    return data



## Data Visualization

In [16]:
def visualize_data(data, dataset_name):
    # 1. Pairplot for numeric features only
    numeric_data = data.select_dtypes(include=[np.float64, np.int64])
    sns.pairplot(numeric_data)
    plt.savefig(f"{dataset_name}_pairplot.png")
    plt.close()

    # 2. Correlation matrix heatmap
    correlation_matrix = numeric_data.corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True)
    plt.savefig(f"{dataset_name}_correlation_matrix.png")
    plt.close()

    # 3. Boxplots for numeric features
    for col in numeric_data.columns:
        plt.figure(figsize=(6, 4))
        sns.boxplot(data[col])
        plt.title(f"Boxplot of {col}")
        plt.savefig(f"{dataset_name}_{col}_boxplot.png")
        plt.close()

    # 4. Count plots for primary categorical feature
    categorical_data = data.select_dtypes(exclude=[np.float64, np.int64])
    if not categorical_data.empty:
        primary_categorical = categorical_data.columns[0]
        plt.figure(figsize=(6, 4))
        sns.countplot(x=primary_categorical, data=data)
        plt.title(f"Count plot of {primary_categorical}")
        plt.savefig(f"{dataset_name}_{primary_categorical}_countplot.png")
        plt.close()
    
    # 5. Distribution plots for two major numeric features (if available)
    for col in numeric_data.columns[:2]:
        plt.figure(figsize=(6, 4))
        sns.histplot(data[col], kde=True)
        plt.title(f"Distribution of {col}")
        plt.savefig(f"{dataset_name}_{col}_distribution.png")
        plt.close()
        
    # 6. Bar plot for a categorical vs numeric feature (if available)
    if not categorical_data.empty and not numeric_data.empty:
        plt.figure(figsize=(8, 6))
        sns.barplot(x=categorical_data.columns[0], y=numeric_data.columns[0], data=data)
        plt.title(f"{categorical_data.columns[0]} vs. {numeric_data.columns[0]}")
        plt.savefig(f"{dataset_name}_barplot.png")
        plt.close()
    



## Handle Uncorrelated Features

In [17]:
def drop_uncorrelated_features(data, threshold=0.1):
    # Extract only numeric columns for correlation calculation
    numeric_data = data.select_dtypes(include=[np.float64, np.int64])
    correlation_matrix = numeric_data.corr()

    # Identify columns with low correlation
    cols_to_drop = []
    for column in correlation_matrix.columns:
        if all(correlation_matrix[column].between(-threshold, threshold)):
            cols_to_drop.append(column)

    # Drop identified columns from the original data
    data = data.drop(columns=cols_to_drop)
    
    print(f"Dropped columns: {cols_to_drop}")
    return data


## Convert to Categorical

In [18]:

def convert_to_categorical(data):
    data = pd.get_dummies(data, drop_first=True)
    return data



## Normalize Data

In [19]:

def normalize_data(data):
    for col in data.columns:
        if data[col].dtype in [np.float64, np.int64]:
            min_val = data[col].min()
            max_val = data[col].max()
            data[col] = (data[col] - min_val) / (max_val - min_val)
    return data



## Main Execution

In [20]:
def process_penguins_data():
    print("Processing penguins.csv...\n")
    penguins_data = load_and_describe_data('penguins.csv')
    penguins_data = handle_missing_data(penguins_data)
    penguins_data = standardize_strings(penguins_data)
    penguins_data = handle_outliers(penguins_data)
    visualize_data(penguins_data, "penguins")
    penguins_data = drop_uncorrelated_features(penguins_data)
    penguins_data = normalize_data(penguins_data)
    return penguins_data

def process_diamond_data():
    global diamond_data  # <-- Add this line to make diamond_data a global variable
    print("\nProcessing diamond.csv...\n")
    diamond_data = load_and_describe_data('diamond.csv')
    diamond_data = handle_missing_data(diamond_data)
    diamond_data = handle_outliers(diamond_data)
    visualize_data(diamond_data, "diamond")
    diamond_data = drop_uncorrelated_features(diamond_data)
    diamond_data = normalize_data(diamond_data)
    return diamond_data


def main():
    penguins_data = process_penguins_data()
    diamond_data = process_diamond_data()

if __name__ == "__main__":
    main()


Processing penguins.csv...

       calorie requirement  average sleep duration  bill_length_mm  \
count           344.000000              344.000000      337.000000   
mean           5270.002907               10.447674       45.494214   
std            1067.959116                2.265895       10.815787   
min            3504.000000                7.000000       32.100000   
25%            4403.000000                9.000000       39.500000   
50%            5106.500000               10.000000       45.100000   
75%            6212.750000               12.000000       49.000000   
max            7197.000000               14.000000      124.300000   

       bill_depth_mm  flipper_length_mm  body_mass_g         year  
count     333.000000         336.000000   339.000000   342.000000  
mean       18.018318         197.764881  4175.463127  2008.035088  
std         9.241384          27.764491   858.713267     0.816938  
min        13.100000          10.000000   882.000000  2007.000000  
2

  self._figure.tight_layout(*args, **kwargs)


Dropped columns: []

Processing diamond.csv...

       average us salary  number of diamonds mined (millions)
count       53940.000000                         53940.000000
mean        39521.990100                             2.902669
std          5486.892971                             1.325985
min         30000.000000                             0.600000
25%         34780.000000                             1.750000
50%         39547.500000                             2.910000
75%         44252.000000                             4.050000
max         48999.000000                             5.200000


  self._figure.tight_layout(*args, **kwargs)


Dropped columns: []
