# Proyecto # 1 - 2a Parte
## Product Development - Ing. Preng Biba
### Hugo Brian Bay Rojas - Carnet 20002544

## 5. Machine Learning Pipeline

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import scipy.stats as stats

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.special import inv_boxcox
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.transformation import BoxCoxTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [2]:
import my_preprocessors as mypp

In [3]:
data = pd.read_csv("Train.csv")
data.head()

Unnamed: 0,ID,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,...,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary
0,288761,m,1988-09-15,92.0,kerala state board,2007,80.0,kerala state board,1275,2,...,366,-1,-1,-1,0.8192,-0.2793,-0.1988,-0.6428,0.0284,95000
1,1201536,m,1991-07-14,89.9,state board,2009,84.5,state board,8524,2,...,-1,-1,-1,-1,1.2772,0.8784,0.4711,-0.7415,0.4805,360000
2,496538,f,1990-08-16,71.0,cbse,2007,70.0,cbse,58,2,...,-1,-1,-1,-1,-1.6924,1.1248,-1.2148,1.0611,-0.1295,265000
3,353442,m,1990-09-14,91.0,karnataka state board,2008,84.33,"pre university board, karnataka",855,2,...,-1,-1,-1,-1,-0.6491,0.3448,0.0914,-0.349,-1.5513,565000
4,473825,m,1988-02-05,78.0,0,2005,67.0,0,8946,2,...,-1,-1,-1,-1,0.1623,1.4368,1.688,-0.6134,0.6603,660000


In [4]:
#Cast de Variables categoricas con diferentes tipos de dato
data['CollegeID'] = data['CollegeID'].astype('O')

In [5]:
#separamos data para entrenamiento y prueba,
X_train, X_test, y_train, y_test = train_test_split(
                        data.drop(['ID', 'CollegeCityID', 'Salary'], axis=1),
                        data['Salary'],
                        test_size=0.3,
                        random_state=2021)

X_train.shape, X_test.shape

((1993, 31), (855, 31))

In [6]:
# Transformación del Target
y_train, param_y = stats.boxcox(y_train)
y_train = pd.Series(y_train).rename('Salary')
y_test = stats.boxcox(y_test, lmbda = param_y)
y_test = pd.Series(y_test).rename('Salary')

In [19]:
joblib.dump(param_y, 'boxcox_y.joblib')

['boxcox_y.joblib']

### 5.1. Configuración del ML Pipeline

In [7]:
#Variables categóricas
CATEGORICAL_VARS = ['10board', '12board', 'Specialization', 'CollegeState', 'CollegeID', 'Gender', 'Degree']

#Variables numéricas
NUMERICAL_VARS = ['10percentage',
 '12percentage',
 'CollegeTier',
 'collegeGPA',
 'CollegeCityTier',
 'English',
 'Logical',
 'Quant',
 'Domain',
 'ComputerProgramming',
 'ElectronicsAndSemicon',
 'ComputerScience',
 'MechanicalEngg',
 'ElectricalEngg',
 'TelecomEngg',
 'CivilEngg',
 'conscientiousness',
 'agreeableness',
 'extraversion',
 'nueroticism',
 'openess_to_experience']

#Variables con etiquetas raras
RARE_LABEL_VARS = ['10board', '12board', 'Specialization', 'CollegeState', 'CollegeID']

#Variables para binarizar por sesgo fuerte
BINARIZE_VARS = ['ElectronicsAndSemicon', 'MechanicalEngg', 'ElectricalEngg', 'TelecomEngg']

#Variables para aplicar transformación Yeo Johnson
YEOJOHNSON_VARS = ['10percentage', '12percentage', 'collegeGPA', 'English', 'Logical', 'Quant', 'Domain', 'conscientiousness', 'agreeableness', 'extraversion', 'nueroticism', 'openess_to_experience']

#Variables de temporalidad
TEMPORAL_VARS = ['12graduation', 'GraduationYear']

REF_VAR = 'DOB'

#Variable a eliminar
DROP_FEATURES = ['DOB']

#Variables a utilizar en el modelo
FEATURES = ['Gender',
 'DOB',
 '10percentage',
 '10board',
 '12graduation',
 '12percentage',
 '12board',
 'CollegeID',
 'CollegeTier',
 'Degree',
 'Specialization',
 'collegeGPA',
 'CollegeCityTier',
 'CollegeState',
 'GraduationYear',
 'English',
 'Logical',
 'Quant',
 'Domain',
 'ComputerProgramming',
 'ElectronicsAndSemicon',
 'ComputerScience',
 'MechanicalEngg',
 'ElectricalEngg',
 'TelecomEngg',
 'CivilEngg',
 'conscientiousness',
 'agreeableness',
 'extraversion',
 'nueroticism',
 'openess_to_experience']

In [8]:
#Selección de variables para entrenamiento
X_train = X_train[FEATURES]
X_train.shape

(1993, 31)

In [13]:
EngineeringSalary_pipeline = Pipeline([
    
    #============= IMPUTACIONES ===================#
    
    #2. Indicamos Faltante en variables numéricas para imputar
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS)),
    
    #3. Imputación de mediana para variables categoricas
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS)
    ),
    
    #============= VARIABLES TEMPORALES ==================
    
    #4. Tratamiento de variables temporales
    ('eslapsed_time', mypp.TemporalVariableTransformer(
        variables=TEMPORAL_VARS, reference_variable=REF_VAR)
    ),
    
    #5. Drop de variables
    ('drop_features', DropFeatures(features_to_drop=DROP_FEATURES)),
    
    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    #6. Transformación Yeo Johnson
    ('YJ_transformation', YeoJohnsonTransformer(variables=YEOJOHNSON_VARS)),
    
    #7. Binarización de Variables con Sesgo Fuerte
    ('binarizer', SklearnTransformerWrapper(
        transformer=Binarizer(threshold=-1), variables=BINARIZE_VARS)
    ),
    
    #============ CODIFICACION DE VARIABLES CATEGORICAS NOMINALES ============
    
    #8. Codificación de etiquetas raras
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.01, n_categories=1, variables=RARE_LABEL_VARS)),
    
    #9. Codificación de variables categóricas por ordinal encoding
    ('categorical_encoder', OrdinalEncoder(
        encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('LM_Model', LinearRegression()),
])

In [14]:
EngineeringSalary_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_indicator',
                 AddMissingIndicator(variables=['10percentage', '12percentage',
                                                'CollegeTier', 'collegeGPA',
                                                'CollegeCityTier', 'English',
                                                'Logical', 'Quant', 'Domain',
                                                'ComputerProgramming',
                                                'ElectronicsAndSemicon',
                                                'ComputerScience',
                                                'MechanicalEngg',
                                                'ElectricalEngg', 'TelecomEngg',
                                                'CivilEngg',
                                                'conscientiousness',
                                                'agreeableness', 'extravers...
                                                      'TelecomEngg'])),
          

In [15]:
preds = EngineeringSalary_pipeline.predict(X_test)
print("MSE de Entrenamiento: {}".format(mean_squared_error(inv_boxcox(y_test, param_y), inv_boxcox(preds, param_y))))
print("RMSE de Entrenamiento: {}".format(mean_squared_error(inv_boxcox(y_test, param_y), inv_boxcox(preds, param_y), squared=False)))
print("R2 de Entrenamiento: {}".format(r2_score(inv_boxcox(y_test, param_y), inv_boxcox(preds, param_y))))

MSE de Entrenamiento: 43889287766.342575
RMSE de Entrenamiento: 209497.70348703724
R2 de Entrenamiento: 0.1183675824258772


In [16]:
#Cargamos dataset test.csv para prueba.
Test = pd.read_csv("Test.csv")
Test = Test[FEATURES]
Test['CollegeID'] = Test['CollegeID'].astype('O')

In [18]:
#Realizamos predicciones sobre el dataset de prueba
preds_test = EngineeringSalary_pipeline.predict(Test)
inv_boxcox(preds_test, param_y)

array([182620.78250684, 353820.17510012, 179800.26652588, 258345.09233134,
       237543.81628912, 312686.69173246, 194718.31524693, 292750.79162201,
       187607.59090612, 328106.91364715, 230079.77074409, 180871.48201819,
       299387.24883521, 264719.34154315, 246375.86512559, 250203.86396676,
       273384.674657  , 228832.04342434, 205105.62684689, 276244.7204982 ,
       230318.38914277, 485619.32634569, 186748.47885818, 180060.37272321,
       198476.7865599 , 172337.78875646, 273105.58594738, 432406.45146703,
       173240.18450559, 178311.99593328, 306373.13639565, 240271.34146113,
       236610.77563613, 173103.03999336, 244081.60616915, 211480.57332247,
       324195.56774409, 413778.46578752, 176925.60837437, 337668.48151239,
       242739.81370715, 259438.00765277, 183542.34373435, 234817.78708427,
       208033.44720339, 373571.07960903, 306344.87233108, 299587.91249524,
       228815.70584542, 413170.35235226, 256082.73956863, 312625.65576897,
       219059.23839914, 1

In [21]:
#Guardamos pipeline
joblib.dump(EngineeringSalary_pipeline, 'EngineeringSalary_pipeline.pkl')

['EngineeringSalary_pipeline.pkl']