In [1]:
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score as cvs
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix, precision_score, recall_score 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC 
import pandas as pd
from pandas import ExcelFile
from mpl_toolkits.mplot3d import Axes3D
import tensorflow as tf
import scipy.io as io
from sklearn.model_selection import GridSearchCV as gs
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier 

In [2]:
def show_class(output):
    if output == 1:
        print("Mild asthma")
    elif output == 2:
        print("Moderate asthma")
    else:
        print("Severe asthma")

In [3]:
xl = pd.ExcelFile("sarp.xlsx")
ds = xl.parse('sarp')
X_uncleaned = ds.loc[:, "Baseline_preDrug_FEV1pp":"Baseline_preDrug_FEV1_FVC"]
imputer = Imputer(strategy="median")
imputer.fit(X_uncleaned)
X_cleaned = imputer.transform(X_uncleaned)
X = pd.DataFrame(X_cleaned, columns=X_uncleaned.columns).astype(np.float64)
y = ds.loc[:,"ageasthonset"].astype(np.float64)
stds = ds['status_factorized'].astype(np.int64)
X2 = X
X2 = X["Baseline_preDrug_FEV1_FVC"]
X2 = pd.DataFrame(X2)
X2 = X2.join(y)
X2 = X2.join(ds["gender"])
y2 = stds
y2 = y2.values.ravel()
y2pd = pd.DataFrame(y2)
X2pd = X2
X2 = pd.DataFrame.as_matrix(X2)
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.33)

In [4]:
rfc_clf = RandomForestClassifier( n_estimators = 500, max_leaf_nodes = 16, n_jobs =-1) 

In [11]:
rfc_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
res = rfc_clf.predict([[.65, 45, 1]])
show_class(res[0])

Severe asthma


In [13]:
y_pred = rfc_clf.predict(X_test)
y_pred.shape

(449,)

In [16]:
print(confusion_matrix(y_test, y_pred))
print(f1(y_test,y_pred,average='micro'))
print(precision_score(y_test, y_pred, average='micro'))
print(recall_score(y_test, y_pred, average='micro'))

[[131   5  24]
 [ 25   7  41]
 [ 65  12 139]]
0.616926503341
0.616926503341
0.616926503341


In [17]:
param_grid = [ {'n_estimators': [50, 100, 150, 200, 250], 'max_leaf_nodes': [3, 5, 8, 12, 16]} ]
grid_search = gs(rfc_clf, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

{'max_leaf_nodes': 16, 'n_estimators': 200}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [18]:
rfc_clf = RandomForestClassifier( n_estimators = 200, max_leaf_nodes = 16, n_jobs =-1) 
rfc_clf.fit(X_train, y_train)
y_pred = rfc_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(f1(y_test,y_pred,average='micro'))
print(precision_score(y_test, y_pred, average='micro'))
print(recall_score(y_test, y_pred, average='micro'))

[[131   5  24]
 [ 25   6  42]
 [ 66  11 139]]
0.614699331849
0.614699331849
0.614699331849


In [19]:
ada_clf = AdaBoostClassifier(rfc_clf)

In [20]:
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [21]:
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(f1(y_test,y_pred,average='micro'))
print(precision_score(y_test, y_pred, average='micro'))
print(recall_score(y_test, y_pred, average='micro'))

[[112  14  34]
 [ 20  22  31]
 [ 55  52 109]]
0.541202672606
0.541202672606
0.541202672606
