## Import packages

In [130]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # Higher resolution figures
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import PCA
import warnings
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")

## Load datasets

### Heart Attack

In [107]:
cardio = pd.read_csv('C:/Users/panyu/Desktop/COGS 118A/Final Project/data/cardio.csv', delimiter = ';')
cardio = cardio.drop(columns = ['id'])
cardio.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [108]:
cardio.shape

(70000, 12)

In [110]:
cardio.isnull().sum()

age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [111]:
cardio['age'] = cardio['age'].apply(lambda x: int(x/365))
cardio.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,47,1,156,56.0,100,60,1,1,0,0,0,0


In [112]:
cardio.columns

Index(['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
       'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

## Feature Engineering

In [305]:
# How many combinations are there to search through?
parameters = {
    #'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga', 'lbfgs'],
    'lr__penalty': ['l1', 'l2', 'elasticnet', 'none']
}

In [306]:
pl.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'trans', 'lr', 'trans__n_jobs', 'trans__remainder', 'trans__sparse_threshold', 'trans__transformer_weights', 'trans__transformers', 'trans__verbose', 'trans__one-hot', 'trans__one-hot__categories', 'trans__one-hot__drop', 'trans__one-hot__dtype', 'trans__one-hot__handle_unknown', 'trans__one-hot__sparse', 'lr__C', 'lr__class_weight', 'lr__dual', 'lr__fit_intercept', 'lr__intercept_scaling', 'lr__l1_ratio', 'lr__max_iter', 'lr__multi_class', 'lr__n_jobs', 'lr__penalty', 'lr__random_state', 'lr__solver', 'lr__tol', 'lr__verbose', 'lr__warm_start'])

In [307]:
cate_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

In [308]:
# Pipeline
ct = ColumnTransformer([('one-hot', OneHotEncoder(sparse=False), cate_cols)], remainder = 'passthrough')

○ Accuracy
○ Precision
○ Sensitivity/Recall
○ Specificity
○ F1 score
○ ROC curve
○ squared error

In [309]:
best_params = []
accuracy = []
prec = []
recall = []
specificity = []
f1 = []
auc = []
logloss = []

In [310]:
for i in range(0, 5):
    pl = Pipeline([('trans', ct), ('lr', LogisticRegression())])
    grids = GridSearchCV(pl, parameters, cv = 5)
    # split train and test data
    x_train, x_test, y_train, y_test = train_test_split(cardio.drop(columns = ['cardio']),
                                                    cardio['cardio'], test_size = 0.33)
    grids.fit(x_train, y_train)
    best_params.append(grids.best_params_['lr__penalty'])
    y_pred = grids.predict(x_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    prec.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    specificity.append(recall_score(y_test,y_pred, pos_label=0))
    f1.append(f1_score(y_test, y_pred))
    auc.append(roc_auc_score(y_test, y_pred))
    logloss.append(log_loss(y_test, y_pred))

In [319]:
pd.DataFrame(list(zip(best_params, accuracy, prec, recall, specificity, f1, auc, logloss)), 
             columns=['best_params', 'accuracy', 'precision', 'recall', 'specificity', 'f1', 'auc', 'logloss'])

Unnamed: 0,best_params,accuracy,precision,recall,specificity,f1,auc,logloss
0,none,0.707316,0.72689,0.660349,0.753902,0.692024,0.707126,10.109045
1,l2,0.710216,0.72688,0.67164,0.748639,0.698169,0.71014,10.00887
2,none,0.716104,0.739781,0.66983,0.762707,0.70307,0.716269,9.805519
3,l2,0.70658,0.721487,0.668522,0.74429,0.693995,0.706406,10.134467
4,l2,0.704675,0.724309,0.658011,0.751057,0.68957,0.704534,10.200253


### Australia Rain

In [22]:
aus = pd.read_csv('C:/Users/panyu/Desktop/COGS 118A/Final Project/data/weatherAUS.csv')

In [23]:
aus.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [26]:
aus.shape

(145460, 23)

In [25]:
aus.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [28]:
aus.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object