El notebook a continuación investiga la precisión de multiples modelos de clasificación para ver cuales se ajustan mejor al dataset dado, utilizando matrices de confusión y puntajes generados mediante el modulo **sklearn** para este proposito.

In [34]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, RandomTreesEmbedding
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline

In [35]:

hsptl_train_df = pd.read_parquet("../data/processed/hsptl_train.parquet")
X = hsptl_train_df.drop(['StayLength'], axis=1)
y = hsptl_train_df['StayLength']
rus = RandomOverSampler(random_state=42)
X, y = rus.fit_resample(X, y)

In [36]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Logistic Regression

In [37]:

logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred = logreg.predict(X_test)
pprint(logreg.score(X_test,y_test))
confusion_matrix(y_test,y_pred)


0.5843961357385458


array([[35897, 15307],
       [27154, 23809]], dtype=int64)

# Decision Tree Classifier
Antes de evaluar, se debe averiguar cual es la profundidad optima del arbol, tomando el puntaje dado por cross_val_score en mente:

In [38]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
depth = []
for i in range(3,20):
    dtc = DecisionTreeClassifier(max_depth=i)
    # Perform 7-fold cross validation 
    scores = cross_val_score(estimator=dtc, X=X_train, y=y_train, cv=7, n_jobs=4)
    depth.append((i,scores.mean()))
pprint(depth)


[(3, 0.6277692935045833),
 (4, 0.6538835836487082),
 (5, 0.6538835836487082),
 (6, 0.6610707435296829),
 (7, 0.6659591288640047),
 (8, 0.6720164837788277),
 (9, 0.6765441199567785),
 (10, 0.6790302674844657),
 (11, 0.685960143658832),
 (12, 0.6942659319738073),
 (13, 0.6985726328917916),
 (14, 0.7028485813481868),
 (15, 0.7058213201411981),
 (16, 0.7083494110604345),
 (17, 0.7110425008197626),
 (18, 0.7140460077312961),
 (19, 0.7178129793945641)]


In [40]:
dtc = DecisionTreeClassifier(max_depth=20)
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
pprint(accuracy_score(y_test,y_pred))
confusion_matrix(y_test,y_pred)

0.7250766721044045


array([[62241, 14483],
       [27649, 48877]], dtype=int64)

# RandomForestClassifier
Igual que con los arboles de decisión, evaluamos cual es la profundidad apropiada:

In [41]:

depth = []
for i in range(14,21):
    rfc = RandomForestClassifier(max_depth=i)
    # Perform 7-fold cross validation 
    scores = cross_val_score(estimator=rfc, X=X_train, y=y_train, cv=7, n_jobs=4)
    depth.append((i,scores.mean()))
pprint(depth)

In [25]:
rfc = RandomForestClassifier(max_depth=20)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[25764,  5038],
       [13930, 17102]], dtype=int64)

# AdaBoostClassifier

In [61]:

abc = AdaBoostClassifier(n_estimators=100, base_estimator=logreg, learning_rate=1)
model = abc.fit(X_train, y_train)
y_pred = model.predict(X_test)
pprint(f'Accuracy: {accuracy_score(y_test,y_pred)}')
confusion_matrix(y_test,y_pred)



'Accuracy: 0.5506247911073735'


# AdaBoostClassifier con DecisionTree

In [None]:
depth = []
for i in range(5,9):
    dtc = DecisionTreeClassifier(max_depth=i)
    abc = AdaBoostClassifier(n_estimators=50, estimator=dtc, learning_rate=1)
    model = abc.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Perform 7-fold cross validation 
    scores = cross_val_score(estimator=abc, X=X_train, y=y_train, cv=5, n_jobs=4)
    depth.append((i,scores.mean()))
pprint(depth)


In [12]:
dtc = DecisionTreeClassifier(max_depth=6)
abc = AdaBoostClassifier(n_estimators=50, estimator=dtc, learning_rate=1)
abc.fit(X_train, y_train)
y_pred = abc.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[24977,  5825],
       [12786, 18246]], dtype=int64)

# AdaBoost con RandomTree

In [9]:
depth = []
for i in range(5,9):
    rfc = RandomForestClassifier(max_depth=i)
    abc = AdaBoostClassifier(n_estimators=50, estimator=dtc, learning_rate=1)
    model = abc.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Perform 7-fold cross validation 
    scores = cross_val_score(estimator=abc, X=X_train, y=y_train, cv=5, n_jobs=4)
    depth.append((i,scores.mean()))
pprint(depth)

[(5, 0.6974172617677727),
 (6, 0.6973889535328992),
 (7, 0.69770027864684),
 (8, 0.6977083648457201)]


In [10]:
rfc = RandomForestClassifier(max_depth=8)
abc = AdaBoostClassifier(n_estimators=50, estimator=dtc, learning_rate=1)
abc.fit(X_train, y_train)
y_pred = abc.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[24981,  5821],
       [12788, 18244]], dtype=int64)

# Multinomial Naive Bayes

In [73]:
mnb = MultinomialNB().fit(X_train, y_train)
pprint(f'score on test: {str(mnb.score(X_test, y_test))}')
pprint(f'score on train: {str(mnb.score(X_train, y_train))}')
y_pred = mnb.predict(X_test)
confusion_matrix(y_test,y_pred)

'score on test: 0.5955299673318886'
'score on train: 0.5963797941245441'


array([[23022,  7780],
       [17230, 13802]], dtype=int64)

In [72]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
pprint(f'Accuracy: {accuracy_score(y_test,y_pred)}')
confusion_matrix(y_test,y_pred)

'Accuracy: 0.5850988129508038'


array([[21808,  8994],
       [16661, 14371]], dtype=int64)

# Linear Support Vector Classifier
Para este clasificador, se debe antes usar el StandardScaler en nuestros datos:

In [5]:
ss = StandardScaler()
scaX = pd.DataFrame(ss.fit_transform(X), columns = X.columns)
scaX_train, scaX_test, y_train, y_test = train_test_split(scaX, y, test_size=0.2, random_state=42)

In [67]:

lsvc = LinearSVC(C=1, random_state=42)
lsvc.fit(scaX_train, y_train)
pprint(lsvc.score(scaX_test, y_test))

0.5103017757220947




In [68]:
y_pred = lsvc.predict(scaX_test)
confusion_matrix(y_test,y_pred)

array([[21805,  8997],
       [16658, 14374]], dtype=int64)

# Stacking utilizando arboles de decisión
Este StackingClassifier utiliza Decision Trees y Random Trees.

In [20]:
dtc = DecisionTreeClassifier(max_depth=15)
rfc = RandomForestClassifier(max_depth=20)


In [21]:
estimators = [
    ('dtc', dtc),
    ('rfc', rfc),
]
clf = StackingClassifier(estimators=estimators)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf.fit(X_train, y_train)
pprint(clf.score(X_test, y_test))
y_pred = clf.predict(X_test)
confusion_matrix(y_test,y_pred)

0.7002619917844551


array([[24709,  6093],
       [12441, 18591]], dtype=int64)

# Bagging utilizando Random Forests

In [32]:
bclf = BaggingClassifier(estimator=rfc, n_estimators=25, random_state=42)
bclf.fit(X_train,y_train)
y_pred = bclf.predict(X_test)
pprint(accuracy_score(y_pred,y_test))
confusion_matrix(y_test,y_pred)

0.6945369861241388


array([[26270,  4532],
       [14356, 16676]], dtype=int64)

# Bagging utilizando Decision Trees

In [22]:
bclf = BaggingClassifier(estimator=dtc, n_estimators=25, random_state=42)
bclf.fit(X_train,y_train)
y_pred = bclf.predict(X_test)
pprint(accuracy_score(y_pred,y_test))
confusion_matrix(y_test,y_pred)

0.7097874955526086


array([[25924,  4878],
       [13067, 17965]], dtype=int64)

# Gradient Boosting

In [27]:
gclf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)
gclf.fit(X_train,y_train)
pprint(gclf.score(X_test,y_test))
y_pred = gclf.predict(X_test)
confusion_matrix(y_test,y_pred)

0.651195135362422


array([[27559,  3243],
       [18325, 12707]], dtype=int64)

# Gradient Boosting con Logistic Regression (usando un pipeline)

In [30]:
X_train_ensemble, X_train_linear, y_train_ensemble, y_train_linear = train_test_split(
    X_train, y_train, test_size=0.5, random_state=10
)
def gbdt_apply(X, model):
    return model.apply(X)[:, :, 0]
    
gbdt_leaves_yielder = FunctionTransformer(
    gbdt_apply, kw_args={"model": gclf}
)
gbdt_model = make_pipeline(
    gbdt_leaves_yielder,
    OneHotEncoder(handle_unknown="ignore"),
    LogisticRegression(max_iter=1000),
)




In [31]:
gbdt_model.fit(X_train_linear, y_train_linear)
y_pred = gbdt_model.predict(X_test)
pprint(accuracy_score(y_test,y_pred))
confusion_matrix(y_test,y_pred)



0.652990264255911


array([[26441,  4361],
       [17096, 13936]], dtype=int64)