# data_preprocess

> This module contains several Python functions for simple data preprocessing for ML, such as handling missing values, min-max scaling, and one-hot encoding

In [None]:
#| default_exp data_preprocess

In [None]:
#| hide
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| export
from that_ml_library.utils import *
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler,StandardScaler

## process_missing_values

In [None]:
#| export
def process_missing_values(X_train:pd.DataFrame, # Training dataframe
                           X_test:pd.DataFrame=None, # Testing dataframe
                           missing_cols:list|str=[], # A column name having missing values, or a list of such columns
                           missing_vals:list|int|float|str=np.NaN, # A placeholder for missing values, or a list of placeholders for all columns in miss_cols
                           strategies:list|str='median', # The imputation strategy from sklearn, or a list of such values. Currently support 'median','mean','most_frequent'
                           **kwargs):
    "Process columns with missing values using Sklearn SimpleInputer"
    if missing_cols==[]:
        return X_train,X_test
    missing_cols = val2list(missing_cols)
    missing_vals = val2list(missing_vals,len(missing_cols))
    strategies = val2list(strategies,len(missing_cols))
    X_train = X_train.copy()
    if X_test is not None: X_test = X_test.copy()
    for i,c in enumerate(missing_cols):
        imp = SimpleImputer(missing_values=missing_vals[i], strategy=strategies[i])
        X_train[c] = imp.fit_transform(X_train[c].values.reshape(-1,1)).flatten()
        if X_test is not None: X_test[c] = imp.transform(X_test[c].values.reshape(-1,1)).flatten()
    return X_train,X_test

In [None]:
show_doc(process_missing_values)

---

[source](https://github.com/anhquan0412/that-ml-library/blob/main/that_ml_library/data_preprocess.py#L18){target="_blank" style="float:right; font-size:smaller"}

### process_missing_values

>      process_missing_values (X_train:pandas.core.frame.DataFrame,
>                              X_test:pandas.core.frame.DataFrame=None,
>                              missing_cols:list|str=[],
>                              missing_vals:list|int|float|str=nan,
>                              strategies:list|str='median', **kwargs)

Process columns with missing values using Sklearn SimpleImputer

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X_train | DataFrame |  | Training dataframe |
| X_test | DataFrame | None | Testing dataframe |
| missing_cols | list \| str | [] | A column name having missing values, or a list of such columns |
| missing_vals | list \| int \| float \| str | nan | A placeholder for missing values, or a list of placeholders for all columns in missing_cols |
| strategies | list \| str | median | The imputation strategy from sklearn, or a list of such values. Currently support 'median','mean','most_frequent' |
| kwargs |  |  |  |

In [None]:
df = pd.DataFrame([[7, 2, 3], [4, np.nan, 6], [10, 5, -1]],columns=['col1','col2','col3'])
display(df)
print()
# The keyword is `strategies` (plural). A misspelt `strategy=` is swallowed by
# **kwargs and the default 'median' is applied silently.
# col2 uses NaN as its missing-value placeholder, col3 uses -1.
df_processed,_ = process_missing_values(df,missing_cols=['col2','col3'],missing_vals=[np.nan,-1],strategies='mean')
display(df_processed)

Unnamed: 0,col1,col2,col3
0,7,2.0,3
1,4,,6
2,10,5.0,-1





Unnamed: 0,col1,col2,col3
0,7,2.0,3.0
1,4,3.5,6.0
2,10,5.0,4.5


In [None]:
df_trn = pd.DataFrame([[7, 2, 3], [4, np.nan, 6], [10, 5, -1]],columns=['col1','col2','col3'])
df_test = pd.DataFrame([[2, np.nan, 3], [3, 1, -1]],columns=['col1','col2','col3'])
display(df_trn,df_test)
print()
# The keyword is `strategies` (plural). A misspelt `strategy=` is swallowed by
# **kwargs and the default 'median' is applied silently.
# The imputer is fitted on df_trn only; df_test is filled with the training statistic.
df_processed_trn,df_processed_test = process_missing_values(df_trn,
                                                            df_test,
                                                            missing_cols=['col2','col3'],
                                                            missing_vals=[np.nan,-1],
                                                            strategies='mean')
display(df_processed_trn,df_processed_test)

Unnamed: 0,col1,col2,col3
0,7,2.0,3
1,4,,6
2,10,5.0,-1


Unnamed: 0,col1,col2,col3
0,2,,3
1,3,1.0,-1





Unnamed: 0,col1,col2,col3
0,7,2.0,3.0
1,4,3.5,6.0
2,10,5.0,4.5


Unnamed: 0,col1,col2,col3
0,2,3.5,3.0
1,3,1.0,4.5


## scale_num_cols

In [None]:
#| export
def scale_num_cols(X_train:pd.DataFrame, # Training dataframe
                   X_test:pd.DataFrame=None, # Testing dataframe
                   num_cols:list|str=[], # Name of the numerical column, or a list of such columns
                   scale_methods:list|str='minmax', # Sklearn scaling method ('minmax' or 'standard'), or a list of such methods        
                    **kwargs):
    "Scale numerical columns using Sklearn"
    if num_cols==[]:
        return X_train,X_test
    num_cols = val2list(num_cols)
    scale_methods = val2list(scale_methods,len(num_cols))
    X_train = X_train.copy()
    if X_test is not None: X_test = X_test.copy()
    for i,c in enumerate(num_cols):
        if scale_methods[i]=='minmax':
            imp = MinMaxScaler()
        elif scale_methods[i]=='standard':
            imp = StandardScaler()
        else:
            raise ValueError('Unrecognized scaling method. Accept methods: minmax and standard')
        X_train[c] = imp.fit_transform(X_train[c].values.reshape(-1,1)).flatten()
        if X_test is not None: X_test[c] = imp.transform(X_test[c].values.reshape(-1,1)).flatten()
    return X_train,X_test

In [None]:
show_doc(scale_num_cols)

---

[source](https://github.com/anhquan0412/that-ml-library/blob/main/that_ml_library/data_preprocess.py#L39){target="_blank" style="float:right; font-size:smaller"}

### scale_num_cols

>      scale_num_cols (X_train:pandas.core.frame.DataFrame,
>                      X_test:pandas.core.frame.DataFrame=None,
>                      num_cols:list|str=[], scale_methods:list|str='minmax',
>                      **kwargs)

Scale numerical columns using Sklearn

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X_train | DataFrame |  | Training dataframe |
| X_test | DataFrame | None | Testing dataframe |
| num_cols | list \| str | [] | Name of the numerical column, or a list of such columns |
| scale_methods | list \| str | minmax | Sklearn scaling method ('minmax' or 'standard'), or a list of such methods |
| kwargs |  |  |  |

In [None]:
# Standardise col1 and col3; col2 is not listed, so it is passed through unchanged.
df = pd.DataFrame({'col1': [7, 4, 10],
                   'col2': [2, 2, 5],
                   'col3': [3, 6, 1]})
display(df)
print()
df_processed,_ = scale_num_cols(df, scale_methods='standard', num_cols=['col1','col3'])
display(df_processed)


Unnamed: 0,col1,col2,col3
0,7,2,3
1,4,2,6
2,10,5,1





Unnamed: 0,col1,col2,col3
0,0.0,2,-0.162221
1,-1.224745,2,1.297771
2,1.224745,5,-1.13555


## one_hot_cat

In [None]:
#| export
def one_hot_cat(X_train:pd.DataFrame, # Training dataframe
                X_test:pd.DataFrame=None, # Testing dataframe
                cat_cols:list|str=[], # Name of the categorical columns (non-binary), or a list of such columns
                bi_cols:list|str=[], # Name of the binary column, or a list of such columns
                **kwargs):
    "Perform 'get_dummies' on categorical columns"
    if cat_cols==[] and bi_cols==[]:
        return X_train,X_test
    cat_cols = val2list(cat_cols)
    bi_cols = val2list(bi_cols)
    n_train = X_train.shape[0]
    if X_test is not None:
        X_total = pd.concat([X_train,X_test],axis=0)
    else:
        X_total = X_train.copy()
    if len(cat_cols):
        X_total = pd.get_dummies(X_total,columns=cat_cols,drop_first=False)
    if len(bi_cols):
        X_total = pd.get_dummies(X_total,columns=bi_cols,drop_first=True)
    return X_total.iloc[:n_train].copy(), X_total.iloc[n_train:].copy() if X_test is not None else None

In [None]:
show_doc(one_hot_cat)

---

[source](https://github.com/anhquan0412/that-ml-library/blob/main/that_ml_library/data_preprocess.py#L63){target="_blank" style="float:right; font-size:smaller"}

### one_hot_cat

>      one_hot_cat (X_train:pandas.core.frame.DataFrame,
>                   X_test:pandas.core.frame.DataFrame=None,
>                   cat_cols:list|str=[], bi_cols:list|str=[], **kwargs)

Perform 'get_dummies' on categorical columns

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X_train | DataFrame |  | Training dataframe |
| X_test | DataFrame | None | Testing dataframe |
| cat_cols | list \| str | [] | Name of the categorical columns (non-binary), or a list of such columns |
| bi_cols | list \| str | [] | Name of the binary column, or a list of such columns |
| kwargs |  |  |  |

In [None]:
# 'B' has three levels and keeps all of its dummies; 'A' is binary and keeps just one.
df = pd.DataFrame(dict(A=list('aba'),
                       B=list('bac'),
                       C=[1, 2, 3]))
display(df)
print()
df_processed,_ = one_hot_cat(df, bi_cols='A', cat_cols='B')
display(df_processed)

Unnamed: 0,A,B,C
0,a,b,1
1,b,a,2
2,a,c,3





Unnamed: 0,C,B_a,B_b,B_c,A_b
0,1,False,True,False,False
1,2,True,False,False,True
2,3,False,False,True,False


## preprocessing_general

In [None]:
#| export
def preprocessing_general(X_train:pd.DataFrame, # Training dataframe
                          X_test:pd.DataFrame=None, # Testing dataframe
                          **kwargs, # Keyword arguments for processing missing values, scaling numerical columns and one-hot-encoding categorical columns
                         ):
    """
    The main preprocessing function. It runs, in this order:
    - Fill missing values
    - Scale numerical columns
    - One-hot encode categorical columns

    The same keyword arguments are forwarded to every step, so remember to
    pass the appropriate ones for each of the preprocessings mentioned above.
    Returns the tuple `(X_train, X_test)` produced by the last step.
    """
    pipeline = (process_missing_values, scale_num_cols, one_hot_cat)
    for step in pipeline:
        # Each step takes the output of the previous one plus the shared kwargs.
        X_train,X_test = step(X_train,X_test,**kwargs)
    return X_train,X_test

In [None]:
show_doc(preprocessing_general)

---

[source](https://github.com/anhquan0412/that-ml-library/blob/main/that_ml_library/data_preprocess.py#L85){target="_blank" style="float:right; font-size:smaller"}

### preprocessing_general

>      preprocessing_general (X_train:pandas.core.frame.DataFrame,
>                             X_test:pandas.core.frame.DataFrame=None, **kwargs)

The main preprocessing functions, will perform:
- Fill missing values
- Scale numerical columns
- One-hot encode categorical columns

Remember to put in the appropriate keyword arguments for each of the preprocessings mentioned above

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X_train | DataFrame |  | Training dataframe |
| X_test | DataFrame | None | Testing dataframe |
| kwargs |  |  |  |

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/titanic.csv')

In [None]:
# Select some useful features, for now
df = df[['Survived','Pclass','Sex','Age','SibSp','Parch','Embarked']].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  156 non-null    int64  
 1   Pclass    156 non-null    int64  
 2   Sex       156 non-null    object 
 3   Age       126 non-null    float64
 4   SibSp     156 non-null    int64  
 5   Parch     156 non-null    int64  
 6   Embarked  155 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 8.7+ KB


In [None]:
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
86,0,3,male,16.0,1,3,S
31,1,1,female,,1,0,C
105,0,3,male,28.0,0,0,S
96,0,1,male,71.0,0,0,C
112,0,3,male,22.0,0,0,S


Let's perform a simple train/test split

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop('Survived',axis=1), df['Survived'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=df['Survived'])

In [None]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
142,3,female,24.0,1,0,S
134,2,male,25.0,0,0,S
120,2,male,21.0,2,0,S
50,3,male,7.0,4,1,S
133,2,female,29.0,1,0,S


In [None]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
91,3,male,20.0,0,0,S
145,2,male,19.0,1,1,S
115,3,male,21.0,0,0,S
106,3,female,21.0,0,0,S
9,2,female,14.0,1,0,C


In [None]:
X_train.Parch.value_counts()

Parch
0    95
2    14
1    12
5     2
3     1
Name: count, dtype: int64

In [None]:
# Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
X_train_processed,X_test_processed = preprocessing_general(X_train,X_test,
                                                           missing_cols=['Age','Embarked'],
                                                           missing_vals=np.nan,
                                                           strategies=['median','most_frequent'],
                                                           num_cols=['Age','SibSp','Parch'],
                                                           scale_methods=['standard','minmax','minmax'],
                                                           cat_cols='Embarked',
                                                           bi_cols='Sex'
                                                          )

Notice that `Pclass` is not passed to the preprocessing function, so this column is left untouched.

In [None]:
X_train_processed.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Embarked_C,Embarked_Q,Embarked_S,Sex_male
142,3,-0.325526,0.2,0.0,False,False,True,False
134,2,-0.252796,0.0,0.0,False,False,True,True
120,2,-0.543716,0.4,0.0,False,False,True,True
50,3,-1.561938,0.8,0.2,False,False,True,True
133,2,0.038125,0.2,0.0,False,False,True,False


In [None]:
X_test_processed.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Embarked_C,Embarked_Q,Embarked_S,Sex_male
91,3,-0.616446,0.0,0.0,False,False,True,True
145,2,-0.689176,0.2,0.2,False,False,True,True
115,3,-0.543716,0.0,0.0,False,False,True,True
106,3,-0.543716,0.0,0.0,False,False,True,False
9,2,-1.052827,0.2,0.0,True,False,False,False


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()