In [75]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.under_sampling import RandomUnderSampler

In [70]:

hsptl_train_df = pd.read_parquet("../data/processed/hsptl_train.parquet")
X = hsptl_train_df.drop(['StayLength'], axis=1)
y = hsptl_train_df['StayLength']
rus = RandomUnderSampler(random_state=42)
X, y = rus.fit_resample(X, y)

In [71]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [56]:

logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred = logreg.predict(X_test)


In [57]:
pprint(logreg.score(X_test,y_test))
confusion_matrix(y_test,y_pred)

0.5839182326875182


array([[21643,  9159],
       [16569, 14463]], dtype=int64)

In [58]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
depth = []
for i in range(3,20):
    dtc = DecisionTreeClassifier(max_depth=i)
    # Perform 7-fold cross validation 
    scores = cross_val_score(estimator=dtc, X=X_train, y=y_train, cv=7, n_jobs=4)
    depth.append((i,scores.mean()))
pprint(depth)


[(3, 0.6289755375327061),
 (4, 0.6550640610617482),
 (5, 0.6550640610617482),
 (6, 0.6619812535026889),
 (7, 0.6665511350111605),
 (8, 0.6696978427515916),
 (9, 0.6792026527615818),
 (10, 0.6824648593762358),
 (11, 0.6861845446952118),
 (12, 0.6886196609931314),
 (13, 0.6928337341510835),
 (14, 0.6940951895275248),
 (15, 0.6941876052467146),
 (16, 0.6937532661928286),
 (17, 0.6911564309054749),
 (18, 0.6895622758909917),
 (19, 0.6868221917252394)]


In [60]:
dtc = DecisionTreeClassifier(max_depth=15)
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[37825,  8441],
       [19777, 26708]], dtype=int64)

In [80]:

depth = []
for i in range(10,15):
    rfc = RandomForestClassifier(max_depth=i)
    # Perform 7-fold cross validation 
    scores = cross_val_score(estimator=rfc, X=X_train, y=y_train, cv=7, n_jobs=4)
    depth.append((i,scores.mean()))


In [79]:
pprint(depth)


[(2, 0.6126897201572153),
 (3, 0.6142584411055829),
 (4, 0.6320036914014777),
 (5, 0.6462313951628952),
 (6, 0.6553486412231487),
 (7, 0.6582879900133174),
 (8, 0.6625777275924606),
 (9, 0.6689820094811921)]


In [61]:

abc = AdaBoostClassifier(n_estimators=100, base_estimator=logreg, learning_rate=1)
model = abc.fit(X_train, y_train)
y_pred = model.predict(X_test)
pprint(f'Accuracy: {accuracy_score(y_test,y_pred)}')




'Accuracy: 0.5506247911073735'


In [62]:
confusion_matrix(y_test,y_pred)

array([[28486, 17780],
       [23900, 22585]], dtype=int64)

In [None]:
depth = []
for i in range(5,9):
    dtc = DecisionTreeClassifier(max_depth=i)
    abc = AdaBoostClassifier(n_estimators=50, estimator=dtc, learning_rate=1)
    model = abc.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Perform 7-fold cross validation 
    scores = cross_val_score(estimator=abc, X=X_train, y=y_train, cv=5, n_jobs=4)
    depth.append((i,scores.mean()))
pprint(depth)


In [64]:
dtc = DecisionTreeClassifier(max_depth=6)
abc = AdaBoostClassifier(n_estimators=50, estimator=dtc, learning_rate=1)
abc.fit(X_train, y_train)
y_pred = abc.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[36940,  9326],
       [18623, 27862]], dtype=int64)

In [73]:
mnb = MultinomialNB().fit(X_train, y_train)
pprint(f'score on test: {str(mnb.score(X_test, y_test))}')
pprint(f'score on train: {str(mnb.score(X_train, y_train))}')
y_pred = mnb.predict(X_test)
confusion_matrix(y_test,y_pred)

'score on test: 0.5955299673318886'
'score on train: 0.5963797941245441'


array([[23022,  7780],
       [17230, 13802]], dtype=int64)

In [72]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
pprint(f'Accuracy: {accuracy_score(y_test,y_pred)}')
confusion_matrix(y_test,y_pred)

'Accuracy: 0.5850988129508038'


array([[21808,  8994],
       [16661, 14371]], dtype=int64)

In [65]:
ss = StandardScaler()
scaX = pd.DataFrame(ss.fit_transform(X), columns = X.columns)
scaX_train, scaX_test, y_train, y_test = train_test_split(scaX, y, test_size=0.2, random_state=42)

In [67]:

svc = LinearSVC(C=1, random_state=42)
svc.fit(scaX_train, y_train)
pprint(svc.score(scaX_test, y_test))

0.5103017757220947




In [68]:
y_pred = svc.predict(scaX_test)
confusion_matrix(y_test,y_pred)

array([[21805,  8997],
       [16658, 14374]], dtype=int64)

In [None]:
clf = StackingClassifier()
dtc = DecisionTreeClassifier(max_depth=6)
estimators = [
    'dtc', abc = AdaBoostClassifier(n_estimators=50, estimator=dtc, learning_rate=1),
    'mnb', mnb
    ''
]