# Laboratorio 4 - Model Selection
## Product Development - Ing. Preng Biba
### Alumno: Hugo Brian Bay Rojas - Carnet 20002544

In [54]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

#Metricas
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import scipy.stats as stats

import joblib

In [55]:
X_train = pd.read_csv("preprocessed_data/prep_Xtrain.csv")
X_test = pd.read_csv("preprocessed_data/prep_Xtest.csv")

In [56]:
y_train = pd.read_csv("preprocessed_data/prep_ytrain.csv")
y_test = pd.read_csv("preprocessed_data/prep_ytest.csv")

In [57]:
y_train = y_train.iloc[:, 0]
y_test = y_test.iloc[:,0]

## Entrenamiento de modelos

### 1. Regresión logística

In [58]:
logit = LogisticRegression().fit(X_train, y_train)
predicts_log = logit.predict(X_test)
auc = roc_auc_score(y_test, predicts_log)
acc = round(accuracy_score(y_test, predicts_log), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 0.7127110694183865 ACC: 0.7388


### 2. Naive Bayes

In [59]:
nb = GaussianNB().fit(X_train, y_train)
predicts_nb = nb.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_nb), 4)
acc = round(accuracy_score(y_test, predicts_nb), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc)) 

ROC_AUC: 0.6841 ACC: 0.6866


### 3. KNN

In [60]:
clf_knn = GridSearchCV(KNeighborsClassifier(), {'n_neighbors':[2,5,7,11,13]}, cv=10, scoring='roc_auc').fit(X_train, y_train)
KNNResults = pd.DataFrame(clf_knn.cv_results_)
KNNResults.sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score']]

Unnamed: 0,rank_test_score,params,mean_test_score
4,1,{'n_neighbors': 13},0.87158
3,2,{'n_neighbors': 11},0.869343
2,3,{'n_neighbors': 7},0.864149
1,4,{'n_neighbors': 5},0.855953
0,5,{'n_neighbors': 2},0.814469


In [61]:
knn = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train)
predicts_knn = knn.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_knn), 4)
acc = round(accuracy_score(y_test, predicts_knn), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 0.69 ACC: 0.7239


### 4. SVM

In [62]:
clf_svm = GridSearchCV(SVC(kernel='linear'), {'C':[0.5, 1.0, 1.5, 3.0]}, cv=10, scoring='roc_auc').fit(X_train, y_train)
SVMResults = pd.DataFrame(clf_svm.cv_results_)
SVMResults.sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score']]

Unnamed: 0,rank_test_score,params,mean_test_score
3,1,{'C': 3.0},0.85708
2,2,{'C': 1.5},0.851703
0,3,{'C': 0.5},0.85014
1,4,{'C': 1.0},0.844237


In [63]:
svm = SVC(kernel='linear', C=3.0).fit(X_train, y_train)
predicts_svm = svm.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_svm), 4)
acc = round(accuracy_score(y_test, predicts_svm), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 0.6848 ACC: 0.709


### 5. LDA

In [64]:
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True).fit(X_train, y_train)
predicts_lda = lda.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_lda), 4)
acc = round(accuracy_score(y_test, predicts_lda), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 0.7127 ACC: 0.7388


### Random Forest

In [65]:
clf_rf = GridSearchCV(RandomForestClassifier(), {'n_estimators':[10, 50, 100, 500, 800, 1000]}, cv=10, scoring='roc_auc').fit(X_train, y_train)
RFResults = pd.DataFrame(clf_rf.cv_results_)
RFResults.sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score']]

Unnamed: 0,rank_test_score,params,mean_test_score
2,1,{'n_estimators': 100},0.881074
1,2,{'n_estimators': 50},0.880886
4,3,{'n_estimators': 800},0.880123
5,4,{'n_estimators': 1000},0.880012
3,5,{'n_estimators': 500},0.87842
0,6,{'n_estimators': 10},0.859494


In [66]:
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
predicts_rf = rf.predict(X_test)
auc = round(roc_auc_score(y_test, predicts_rf), 4)
acc = round(accuracy_score(y_test, predicts_rf), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 0.7329 ACC: 0.7463


## Modelo Final

El modelo con mejor desempeño es el random forest

In [67]:
joblib.dump(rf, 'final_model/logit_titanic.joblib')

['final_model/logit_titanic.joblib']