# Model Performance Transformations

Let's practice some basic data transformations to improve ML model performance.

In [None]:
# Imports

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Categorical data analyser

def cat_var(df, cols):
    '''
    Summarise the unique values of a set of categorical columns.

    Input parameters:
        - df -> Pandas dataframe object: a dataframe with categorical variables.
        - cols -> list object: a list with the name (string) of every categorical variable to analyse.

    Return: a Pandas dataframe object with one row per analysed column, sorted by
    "number_of_possible_values" in descending order (ties keep the order of `cols`),
    with the following columns:
        - "categorical_variable" => name of the analysed column (string).
        - "number_of_possible_values" => number of unique values found in that column (integer).
        - "values" => the unique values themselves, exactly as returned by Series.unique()
          (an array, not a list).

    An empty `cols` returns an empty dataframe that still has the three columns above.
    '''
    summary_columns = ["categorical_variable", "number_of_possible_values", "values"]
    rows = []
    for col in cols:
        unique_values = df[col].unique()
        rows.append({"categorical_variable": col,
                     "number_of_possible_values": len(unique_values),
                     "values": unique_values})
    # Passing `columns` keeps the expected schema when `cols` is empty; without it the
    # frame has no columns and sort_values raises KeyError.
    summary = pd.DataFrame(rows, columns=summary_columns)
    # A stable sort makes the order of equally-sized variables deterministic.
    summary = summary.sort_values(by="number_of_possible_values", ascending=False, kind="stable")
    return summary.reset_index(drop=True)

## Scaling

Some ML algorithms perform poorly when the scale of the data differs greatly between features. In those cases, scaling the data is your best option.

- [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler)

- [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler)

Try both options and see what happens with performance (i.e.: AUC).

<img src="../images/scaling.png" alt="Drawing" style="width: 500px;"/>

In [None]:
# Weather dataset (https://www.kaggle.com/jsphyg/weather-dataset-rattle-package)
# The CSV path is relative to the notebook's working directory.

weather = pd.read_csv('../data/weatherAUS.csv')
print(weather.shape)
weather.head()

In [None]:
# Uluru weather (numerical features)

# Numerical predictors plus the RainTomorrow label, in the order they are kept.
col_weather = ['MinTemp', 'MaxTemp', 'Rainfall',
               'WindSpeed9am', 'WindSpeed3pm',
               'Humidity9am', 'Humidity3pm',
               'Pressure9am', 'Pressure3pm',
               'Temp9am', 'Temp3pm',
               'RainTomorrow']
# Keep the Uluru rows whose RainToday and RainTomorrow are explicitly 'No'/'Yes',
# restrict to the columns above, then drop every row that still has a missing value.
weather = (
    weather.loc[
        weather['Location'].isin(['Uluru'])
        & weather['RainToday'].isin(['No', 'Yes'])
        & weather['RainTomorrow'].isin(['No', 'Yes']),
        col_weather,
    ]
    .dropna()
    .reset_index(drop=True)
)
print(col_weather)
print(weather.shape)
print(weather.describe())
weather.head()

In [None]:
# Features + target

# Every numerical measurement is a feature; RainTomorrow is the label.
X = weather.loc[:, [
    'MinTemp', 'MaxTemp', 'Rainfall',
    'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Temp9am', 'Temp3pm',
]]
# drop_first removes the first dummy column, leaving the single 'Yes' indicator as target.
y = pd.get_dummies(weather['RainTomorrow'], drop_first=True).loc[:, 'Yes']
print(X.shape,y.shape)

In [None]:
# Train + test

# Hold out 20% of the rows; the fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the size and the container type of each piece of the split.
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

# Scaling - RobustScaler

In [None]:
# Fit the RobustScaler on the training split and transform that split in the same step.
scaler = RobustScaler()
scaled_data_rs = scaler.fit_transform(X_train)
scaled_data_rs

In [None]:
# NOTE(review): the assignment below is commented out, so X_train still holds the
# unscaled features; uncomment it to train on the RobustScaler output instead.
#X_train = scaled_data_rs
X_train

In [None]:
# Transform the test split with the scaler already fitted on X_train (no re-fitting here).
scaled_data_test = scaler.transform(X_test)
scaled_data_test

In [None]:
# NOTE(review): the assignment below is commented out, so X_test still holds the
# unscaled features; uncomment it together with the X_train assignment.
#X_test = scaled_data_test
X_test

In [None]:
# Linear model - unscaled baseline (X_train / X_test are used as they come from the split)

linear_model = LogisticRegression(max_iter=1000)
linear_param = linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)
# ROC AUC measures how well the model ranks samples, so it must be computed from the
# predicted probability of the positive class; hard 0/1 labels reduce the ROC curve
# to a single threshold and understate the score.
linear_proba = linear_model.predict_proba(X_test)[:, 1]
linear_auc = roc_auc_score(y_test, linear_proba)
print(f"Linear model AUC is: {linear_auc}")

In [None]:
# Ensemble model - unscaled baseline

# A fixed seed makes the forest (and therefore the reported AUC) reproducible.
ensemble_model = RandomForestClassifier(random_state=42)
ensemble_param = ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
# Score the AUC with the positive-class probability rather than hard 0/1 labels,
# which would reduce the ROC curve to a single threshold.
ensemble_proba = ensemble_model.predict_proba(X_test)[:, 1]
ensemble_auc = roc_auc_score(y_test, ensemble_proba)
print(f"Ensemble model AUC is: {ensemble_auc}")

---

# Scaling - StandardScaler

In [None]:
# Fit the StandardScaler on the training split and transform that split in the same step.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X_train)
# NOTE(review): X_train is overwritten with the scaled values, so the unscaled training
# features are no longer reachable under this name, and re-running the cell fits the
# scaler on already-scaled data.
X_train = scaled_data
X_train

In [None]:
# Transform the test split with the scaler fitted on the training split (no re-fitting).
# Both names end up bound to the scaled array; X_test no longer holds the raw features.
X_test = x_test = scaler.transform(X_test)

In [None]:
# Linear model - StandardScaler

linear_model = LogisticRegression(max_iter=1000)
linear_param = linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)
# ROC AUC measures how well the model ranks samples, so it must be computed from the
# predicted probability of the positive class; hard 0/1 labels reduce the ROC curve
# to a single threshold and understate the score.
linear_proba = linear_model.predict_proba(X_test)[:, 1]
linear_auc = roc_auc_score(y_test, linear_proba)
print(f"Linear model AUC is: {linear_auc}")

In [None]:
# Ensemble model - StandardScaler

# A fixed seed makes the forest (and therefore the reported AUC) reproducible.
ensemble_model = RandomForestClassifier(random_state=42)
ensemble_param = ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
# Score the AUC with the positive-class probability rather than hard 0/1 labels,
# which would reduce the ROC curve to a single threshold.
ensemble_proba = ensemble_model.predict_proba(X_test)[:, 1]
ensemble_auc = roc_auc_score(y_test, ensemble_proba)
print(f"Ensemble model AUC is: {ensemble_auc}")

## Encoding

Most ML algorithms do not support categorical data directly. Therefore, you need to transform categorical data into numerical data. Compare the results using both techniques: __One Hot Encoding__ and __Label Encoding__.

- [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)

- [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder)

<img src="../images/encoding.png" alt="Drawing" style="width: 500px;"/>

In [None]:
# Mushrooms dataset (https://www.kaggle.com/uciml/mushroom-classification)
# The CSV path is relative to the notebook's working directory.

mushrooms = pd.read_csv('../data/mushrooms.csv')
# Keep the full list of column names for the categorical analysis below.
col_mushrooms = list(mushrooms.columns)
print(mushrooms.shape)
mushrooms.head()

In [None]:
# Features analysis
# Summarise every column: how many distinct values it takes and which ones.

cat_mushrooms = cat_var(mushrooms, col_mushrooms)
cat_mushrooms

In [None]:
# Categorical columns used as model features.
cat_cols = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]
# Restrict the dataset to the selected columns and display the result.
mushrooms_encoded = mushrooms.loc[:, cat_cols]
mushrooms_encoded

In [None]:
def ordinal_encoding(x):
    """Map a single-character category code to an integer.

    '?' maps to 27; any other character maps to its code point minus 96,
    so 'a' -> 1, 'b' -> 2, ..., 'z' -> 26.
    """
    return 27 if x == '?' else ord(x) - 96

In [None]:
# Build the encoded table as a brand-new frame (never assigning into a slice of
# `mushrooms`), which avoids the pandas error/warning the explicit copy was guarding against.
mushrooms_encoded = mushrooms[cat_cols].apply(
    lambda column: column.apply(ordinal_encoding)
)

mushrooms_encoded

In [None]:
# Inspect column dtypes and non-null counts of the encoded frame.
mushrooms_encoded.info()

In [None]:
# Re-run the categorical summary on the encoded frame to inspect the resulting codes.
mushrooms_check = cat_var(mushrooms_encoded, cat_cols)
mushrooms_check

In [None]:
# Features: every label-encoded categorical column (the same list stored in cat_cols).
X = mushrooms_encoded[cat_cols]
# Target: get_dummies(..., drop_first=True) leaves a single indicator column for 'class';
# take it as a 1-D Series (as the weather section does) so the estimators receive a
# proper 1-D target instead of a column-vector DataFrame.
y = pd.get_dummies(mushrooms['class'],
                   drop_first=True).iloc[:, 0]
print(X.shape,y.shape)

In [None]:
# Train + test

# Hold out 20% of the rows; the fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the size and the container type of each piece of the split.
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

In [None]:
# Scaling
# NOTE(review): the scaled array is only displayed here; the model cells that follow
# still fit on the unscaled X_train.

scaler = RobustScaler()
scaled_data_mushrooms_lab = scaler.fit_transform(X_train)
scaled_data_mushrooms_lab

In [None]:
# Linear model

linear_model = LogisticRegression(max_iter=1000)
# np.ravel flattens the target to 1-D, whether it arrives as a Series or as a
# single-column DataFrame (which scikit-learn would otherwise warn about).
linear_param = linear_model.fit(X_train, np.ravel(y_train))
linear_pred = linear_model.predict(X_test)
# ROC AUC measures how well the model ranks samples, so it must be computed from the
# predicted probability of the positive class; hard 0/1 labels reduce the ROC curve
# to a single threshold and understate the score.
linear_proba = linear_model.predict_proba(X_test)[:, 1]
linear_auc = roc_auc_score(np.ravel(y_test), linear_proba)
print(f"Linear model AUC is: {linear_auc}")

In [None]:
# Ensemble model

# A fixed seed makes the forest (and therefore the reported AUC) reproducible.
ensemble_model = RandomForestClassifier(random_state=42)
# np.ravel flattens the target to 1-D, whether it arrives as a Series or as a
# single-column DataFrame (which scikit-learn would otherwise warn about).
ensemble_param = ensemble_model.fit(X_train, np.ravel(y_train))
ensemble_pred = ensemble_model.predict(X_test)
# Score the AUC with the positive-class probability rather than hard 0/1 labels,
# which would reduce the ROC curve to a single threshold.
ensemble_proba = ensemble_model.predict_proba(X_test)[:, 1]
ensemble_auc = roc_auc_score(np.ravel(y_test), ensemble_proba)
print(f"Ensemble model AUC is: {ensemble_auc}")

---

In [None]:
# One-hot encoding

# Same columns as the label-encoding section; list() gives an independent copy of
# cat_cols so the two lists can be modified separately.
categorical_cols = list(cat_cols)

In [None]:
# One-hot encode every selected categorical column; drop_first=True omits the first
# level of each column from the output.
mushrooms_one_hot_encoding = pd.get_dummies(
    mushrooms[categorical_cols],
    columns=categorical_cols,
    drop_first=True,
)
mushrooms_one_hot_encoding

In [None]:
# Inspect column dtypes and non-null counts of the one-hot encoded frame.
mushrooms_one_hot_encoding.info()

In [None]:
# Features + target

# Explicit list of the one-hot indicator columns used as features.
X = mushrooms_one_hot_encoding[[
    'cap-shape_c',
    'cap-shape_f',
    'cap-shape_k',
    'cap-shape_s',
    'cap-shape_x',
    'cap-surface_g',
    'cap-surface_s',
    'cap-surface_y',
    'cap-color_c',
    'cap-color_e',
    'cap-color_g',
    'cap-color_n',
    'cap-color_p',
    'cap-color_r',
    'cap-color_u',
    'cap-color_w',
    'cap-color_y',
    'bruises_t',
    'odor_c',
    'odor_f',
    'odor_l',
    'odor_m',
    'odor_n',
    'odor_p',
    'odor_s',
    'odor_y',
    'gill-attachment_f',
    'gill-spacing_w',
    'gill-size_n',
    'gill-color_e',
    'gill-color_g',
    'gill-color_h',
    'gill-color_k',
    'gill-color_n',
    'gill-color_o',
    'gill-color_p',
    'gill-color_r',
    'gill-color_u',
    'gill-color_w',
    'gill-color_y',
    'stalk-surface-below-ring_k',
    'stalk-surface-below-ring_s',
    'stalk-surface-below-ring_y',
    'stalk-color-above-ring_c',
    'stalk-color-above-ring_e',
    'stalk-color-above-ring_g',
    'stalk-color-above-ring_n',
    'stalk-color-above-ring_o',
    'stalk-color-above-ring_p',
    'stalk-color-above-ring_w',
    'stalk-color-above-ring_y',
    'stalk-color-below-ring_c',
    'stalk-color-below-ring_e',
    'stalk-color-below-ring_g',
    'stalk-color-below-ring_n',
    'stalk-color-below-ring_o',
    'stalk-color-below-ring_p',
    'stalk-color-below-ring_w',
    'stalk-color-below-ring_y',
    'veil-color_o',
    'veil-color_w',
    'veil-color_y',
    'ring-number_o',
    'ring-number_t',
    'ring-type_f',
    'ring-type_l',
    'ring-type_n',
    'ring-type_p',
    'spore-print-color_h',
    'spore-print-color_k',
    'spore-print-color_n',
    'spore-print-color_o',
    'spore-print-color_r',
    'spore-print-color_u',
    'spore-print-color_w',
    'spore-print-color_y',
    'population_c',
    'population_n',
    'population_s',
    'population_v',
    'population_y',
    'habitat_g',
    'habitat_l',
    'habitat_m',
    'habitat_p',
    'habitat_u',
    'habitat_w',
]]
# Target: get_dummies(..., drop_first=True) leaves a single indicator column for 'class';
# take it as a 1-D Series so the estimators receive a proper 1-D target.
y = pd.get_dummies(mushrooms['class'], drop_first=True).iloc[:, 0]
print(X.shape,y.shape)

# Train + test
# The split must be redone here: without it, X_train / X_test still hold the
# label-encoded features from the previous section and the models that follow would
# never see the one-hot columns. The same test_size and seed are used as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

In [None]:
# Linear model

linear_model = LogisticRegression(max_iter=1000)
# np.ravel flattens the target to 1-D, whether it arrives as a Series or as a
# single-column DataFrame (which scikit-learn would otherwise warn about).
linear_param = linear_model.fit(X_train, np.ravel(y_train))
linear_pred = linear_model.predict(X_test)
# ROC AUC measures how well the model ranks samples, so it must be computed from the
# predicted probability of the positive class; hard 0/1 labels reduce the ROC curve
# to a single threshold and understate the score.
linear_proba = linear_model.predict_proba(X_test)[:, 1]
linear_auc = roc_auc_score(np.ravel(y_test), linear_proba)
print(f"Linear model AUC is: {linear_auc}")

In [None]:
# Ensemble model

# A fixed seed makes the forest (and therefore the reported AUC) reproducible.
ensemble_model = RandomForestClassifier(random_state=42)
# np.ravel flattens the target to 1-D, whether it arrives as a Series or as a
# single-column DataFrame (which scikit-learn would otherwise warn about).
ensemble_param = ensemble_model.fit(X_train, np.ravel(y_train))
ensemble_pred = ensemble_model.predict(X_test)
# Score the AUC with the positive-class probability rather than hard 0/1 labels,
# which would reduce the ROC curve to a single threshold.
ensemble_proba = ensemble_model.predict_proba(X_test)[:, 1]
ensemble_auc = roc_auc_score(np.ravel(y_test), ensemble_proba)
print(f"Ensemble model AUC is: {ensemble_auc}")

## Bonus

Now that you can grasp the potential of pre-processing your data...what would you do about the following dataset?

<img src="../images/bonus.jpg" alt="Drawing" style="width: 500px;"/>

In [None]:
# Netflix dataset (https://www.kaggle.com/shivamb/netflix-shows)
# The CSV path is relative to the notebook's working directory.

netflix = pd.read_csv('../data/netflix_titles.csv')
# Keep the full list of column names for later inspection.
col_netflix = list(netflix.columns)
print(netflix.shape)
netflix.head()

In [None]:
# ML workflow -> ¿what would you do?










---