In [1]:
pip install pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def data_preprocessing_pipeline(data):
    """Clean a DataFrame and return the result as a new frame.

    Steps, in order:
      1. Numeric columns (float/int dtypes): fill missing values with the
         column mean.
      2. Numeric columns: replace values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
         with the column mean.
      3. Numeric columns: standardize with ``StandardScaler``.
      4. Object columns: fill missing values with the column's first mode.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``data`` with the same columns and index.
    """
    # Work on a copy so the caller's frame stays intact and re-running the
    # calling cell does not re-process already-scaled values.
    data = data.copy()

    # Identify numeric and categorical features
    numeric_features = data.select_dtypes(include=['float', 'int']).columns
    categorical_features = data.select_dtypes(include=['object']).columns

    # Skip the whole numeric stage when there is nothing numeric to process.
    if len(numeric_features) > 0:
        # Handle missing values in numeric features
        data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())

        # Detect and handle outliers in numeric features using IQR
        for feature in numeric_features:
            Q1 = data[feature].quantile(0.25)
            Q3 = data[feature].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - (1.5 * IQR)
            upper_bound = Q3 + (1.5 * IQR)
            is_outlier = (data[feature] < lower_bound) | (data[feature] > upper_bound)
            # The replacement mean is taken over the imputed column,
            # outlying values included.
            data[feature] = np.where(is_outlier, data[feature].mean(), data[feature])

        # Normalize numeric features (single fit + transform pass)
        data[numeric_features] = StandardScaler().fit_transform(data[numeric_features])

    # Handle missing values in categorical features
    if len(categorical_features) > 0:
        modes = data[categorical_features].mode()
        # mode() has no rows when every object column is entirely missing;
        # in that case there is nothing to fill with.
        if not modes.empty:
            data[categorical_features] = data[categorical_features].fillna(modes.iloc[0])

    return data

In [3]:
# Load the mock dataset from the notebook's working directory
data = pd.read_csv("MOCK_DATA.csv")

# Show the frame exactly as loaded, under a heading line
print("Original Data:", data, sep="\n")

Original Data:
      id first_name   last_name                             email      gender  \
0    1.0   Wolfgang         NaN            wsteddall0@comsenz.com    Bigender   
1    2.0    Ardelle         NaN             afader1@wikipedia.org      Female   
2    3.0    Shaughn   Prozescky              sprozescky2@blog.com        Male   
3    4.0     Frasco    Korpolak                               NaN        Male   
4    5.0      Penny       Klaes                  pklaes4@about.me  Non-binary   
5    6.0    Gusella         NaN          gbarnsdale5@facebook.com      Female   
6    NaN   Langsdon   MacAlpine         lmacalpine6@google.com.au        Male   
7    8.0     Monroe         NaN           mdelacey7@google.com.hk        Male   
8    9.0      Fidel     Manueli             fmanueli8@tinyurl.com        Male   
9   10.0     Kesley         NaN             kcivitillo9@google.fr      Female   
10  11.0     Corbin      Sidden                csiddena@jigsy.com        Male   
11  12.0  Bat

In [4]:
# Run the cleaning pipeline on the loaded frame
cleaned_data = data_preprocessing_pipeline(data)

# Show the pipeline's result under a heading line
print("Preprocessed Data:", cleaned_data, sep="\n")

Preprocessed Data:
          id first_name   last_name                             email  \
0  -1.731564   Wolfgang  Avramovich            wsteddall0@comsenz.com   
1  -1.542666    Ardelle  Avramovich             afader1@wikipedia.org   
2  -1.353768    Shaughn   Prozescky              sprozescky2@blog.com   
3  -1.164871     Frasco    Korpolak             afader1@wikipedia.org   
4  -0.975973      Penny       Klaes                  pklaes4@about.me   
5  -0.787075    Gusella  Avramovich          gbarnsdale5@facebook.com   
6   0.000000   Langsdon   MacAlpine         lmacalpine6@google.com.au   
7  -0.409279     Monroe  Avramovich           mdelacey7@google.com.hk   
8  -0.220381      Fidel     Manueli             fmanueli8@tinyurl.com   
9  -0.031483     Kesley  Avramovich             kcivitillo9@google.fr   
10  0.157415     Corbin      Sidden                csiddena@jigsy.com   
11  0.346313  Bathsheba    Robilart             brobilartb@joomla.org   
12  0.535211       Vyky       Lo

In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

def transform_data(data, n_components=0.95):
    """One-hot encode the object columns, join them to the numeric columns, and apply PCA.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. Float/int columns are passed through as-is; object
        columns are one-hot encoded.
    n_components : optional
        Forwarded unchanged to ``PCA(n_components=...)``. Defaults to 0.95.

    Returns
    -------
    pandas.DataFrame
        The PCA output, one row per input row, with default integer column
        labels and a fresh default index.
    """
    # Split the columns by dtype
    num_cols = data.select_dtypes(include=['float', 'int']).columns
    cat_cols = data.select_dtypes(include=['object']).columns

    # Dense one-hot matrix, labelled with the encoder's generated feature names
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    dummies = pd.DataFrame(
        one_hot.fit_transform(data[cat_cols]),
        columns=one_hot.get_feature_names_out(cat_cols),
    )

    # The encoded frame has a default index, so the numeric part is re-indexed
    # the same way before the two are placed side by side.
    numeric_part = data[num_cols].reset_index(drop=True)
    features = pd.concat([numeric_part, dummies], axis=1)

    # Project the combined feature matrix with PCA
    reducer = PCA(n_components=n_components)
    return pd.DataFrame(reducer.fit_transform(features))

# Encode and reduce the cleaned frame
transformed_data = transform_data(cleaned_data)

# Preview the first rows of the result under a heading line
print("Transformed Data:", transformed_data.head(), sep="\n")


Transformed Data:
         0         1         2         3         4         5             6   \
0 -2.011251 -0.396914  0.632020  0.240970  0.663867  1.337801 -4.734861e-17   
1 -1.437522 -1.156223  0.102734  0.812778 -0.309986 -0.305712  2.906354e-17   
2 -1.553773  0.629607 -0.611821 -0.423469 -0.568107  0.281464 -2.627342e-02   
3 -1.234912  0.370920 -1.016359  0.775915 -0.489178 -0.286420  1.105773e-15   
4 -1.089705 -0.190323 -0.742556 -0.757944  1.733860 -0.898008  4.480530e-16   

             7             8             9             10        11        12  \
0 -1.183715e-17  6.013231e-16  1.420458e-16 -4.734861e-17 -0.474776 -0.541715   
1  2.375785e-16  1.348539e-15  7.035126e-16  2.377998e-16 -0.094611  0.150588   
2 -2.458145e-01  8.187286e-01  3.938354e-01  1.189616e-01  1.200541  0.174652   
3 -1.351814e-14 -1.566385e-14  1.063508e-15 -1.163650e-14 -0.214116 -0.679228   
4  3.572715e-16  2.748347e-15  5.640752e-16  3.257370e-16  0.046861 -0.070293   

         13         