# Laboratorio 4 - Model Selection
## Product Development - Ing. Preng Biba
### Alumno: Hugo Brian Bay Rojas - Carnet 20002544

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

#Metricas
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import scipy.stats as stats

import joblib

In [2]:
X_train = pd.read_csv("preprocessed_data/prep_Xtrain.csv")
X_test = pd.read_csv("preprocessed_data/prep_Xtest.csv")

In [3]:
y_train = pd.read_csv("preprocessed_data/prep_ytrain.csv")
y_test = pd.read_csv("preprocessed_data/prep_ytest.csv")

In [38]:
y_train = y_train.iloc[:, 0]
y_test = y_test.iloc[:,0]

## Entrenamiento de modelos

### 1. Regresión logística

In [39]:
logit = LogisticRegression().fit(X_train, y_train)
predicts_log = logit.predict(X_test)
auc = roc_auc_score(y_test, predicts_log)
acc = round(accuracy_score(y_test, predicts_log), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 1.0 ACC: 1.0


### 2. Naive Bayes

In [40]:
nb = GaussianNB().fit(X_train, y_train)
predicts_nb = nb.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_nb), 4)
acc = round(accuracy_score(y_test, predicts_nb), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc)) 

ROC_AUC: 1.0 ACC: 1.0


### 3. KNN

In [41]:
clf_knn = GridSearchCV(KNeighborsClassifier(), {'n_neighbors':[2,5,7,11,13]}, cv=10, scoring='roc_auc').fit(X_train, y_train)
KNNResults = pd.DataFrame(clf_knn.cv_results_)
KNNResults.sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score']]

Unnamed: 0,rank_test_score,params,mean_test_score
0,1,{'n_neighbors': 2},1.0
1,1,{'n_neighbors': 5},1.0
2,1,{'n_neighbors': 7},1.0
3,1,{'n_neighbors': 11},1.0
4,1,{'n_neighbors': 13},1.0


In [46]:
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
predicts_knn = knn.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_knn), 4)
acc = round(accuracy_score(y_test, predicts_knn), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 1.0 ACC: 1.0


### 4. SVM

In [43]:
clf_svm = GridSearchCV(SVC(kernel='linear'), {'C':[0.5, 1.0, 1.5, 3.0]}, cv=10, scoring='roc_auc').fit(X_train, y_train)
SVMResults = pd.DataFrame(clf_svm.cv_results_)
SVMResults.sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score']]

Unnamed: 0,rank_test_score,params,mean_test_score
0,1,{'C': 0.5},1.0
1,1,{'C': 1.0},1.0
2,1,{'C': 1.5},1.0
3,1,{'C': 3.0},1.0


In [48]:
svm = SVC(kernel='linear', C=0.5).fit(X_train, y_train)
predicts_svm = svm.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_svm), 4)
acc = round(accuracy_score(y_test, predicts_svm), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 1.0 ACC: 1.0


### 5. LDA

In [49]:
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True).fit(X_train, y_train)
predicts_lda = lda.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_lda), 4)
acc = round(accuracy_score(y_test, predicts_lda), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 0.7127 ACC: 0.7388


### Random Forest

In [51]:
clf_rf = GridSearchCV(RandomForestClassifier(), {'n_estimators':[10, 50, 100, 500, 800, 1000]}, cv=10, scoring='roc_auc').fit(X_train, y_train)
RFResults = pd.DataFrame(clf_rf.cv_results_)
RFResults.sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score']]

Unnamed: 0,rank_test_score,params,mean_test_score
0,1,{'n_estimators': 10},1.0
1,1,{'n_estimators': 50},1.0
2,1,{'n_estimators': 100},1.0
3,1,{'n_estimators': 500},1.0
4,1,{'n_estimators': 800},1.0
5,1,{'n_estimators': 1000},1.0


In [52]:
rf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
predicts_rf = rf.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_rf), 4)
acc = round(accuracy_score(y_test, predicts_rf), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 1.0 ACC: 1.0


## Modelo Final

Por ser un modelo menos complejo y tener un resultado exclente, se selecciona el modelo de Regresión Logística.

In [53]:
joblib.dump(logit, 'final_model/logit_titanic.joblib')

['final_model/logit_titanic.joblib']