<a href="https://colab.research.google.com/github/arifcanaksoy/MachineLearning_H1N1/blob/master/Data_Driven_Flu_Shot_DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Flu Shot Learning

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf 
tf.test.gpu_device_name() 

''

In [2]:
tf.__version__

'2.3.0'

# Data Preprocessing

## Importing the dataset

In [10]:
features_dataset = pd.read_csv('training_set_features.csv', index_col="respondent_id")
test_dataset = pd.read_csv('test_set_features.csv', index_col="respondent_id")
labels_dataset = pd.read_csv('training_set_labels.csv', index_col="respondent_id")

In [11]:
X = features_dataset.iloc[:, :].values
X_test1 = test_dataset.iloc[:, :].values
y_h = labels_dataset.iloc[:,0 ].values
y_s = labels_dataset.iloc[:,-1 ].values

## Missing Data

In [12]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:,:] = imputer.fit_transform(X[:,:])

In [58]:
X_test1[:,:] = imputer.fit_transform(X_test1[:,:])

## Encoding Categorical Data




In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [20,21,22,23,24,25,26,27,28,29,30,33,34])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X_test1 = np.array(ct.fit_transform(X_test1))

In [15]:
RANDOM_SEED = 6

In [16]:
print(X)

[[0.0 1.0 0.0 ... 1.0 0.0 0.0]
 [0.0 0.0 0.0 ... 2.0 0.0 0.0]
 [0.0 1.0 0.0 ... 1.0 2.0 0.0]
 ...
 [0.0 1.0 0.0 ... 4.0 0.0 0.0]
 [0.0 1.0 0.0 ... 1.0 1.0 0.0]
 [1.0 0.0 0.0 ... 1.0 1.0 0.0]]


# Hyper parameter Tuning (SVM RBF)

## Training the dataset

In [17]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

In [18]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [19]:
scores = []
from sklearn.model_selection import GridSearchCV
for model_name, mp in model_params.items():
    clf_h =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf_h.fit(X,y_h)
    scores.append({
        'model': model_name,
        'best_score': clf_h.best_score_,
        'best_params': clf_h.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.839705,"{'C': 20, 'kernel': 'rbf'}"
1,random_forest,0.819523,{'n_estimators': 10}
2,logistic_regression,0.836335,{'C': 5}


In [21]:
scores = []
from sklearn.model_selection import RandomizedSearchCV
for model_name, mp in model_params.items():
    rs_h = RandomizedSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    rs_h.fit(X,y_h)
    scores.append({
        'model': model_name,
        'best_score': rs_h.best_score_,
        'best_params': rs_h.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df



Unnamed: 0,model,best_score,best_params
0,svm,0.839705,"{'kernel': 'rbf', 'C': 20}"
1,random_forest,0.820908,{'n_estimators': 10}
2,logistic_regression,0.836335,{'C': 5}


## Predicting the test result

In [22]:
y_prob = clf_h.predict(X)

In [23]:
y_prob_h = clf_h.predict(X_test1)

## Accuracy (Linear Regression for both H1N1 and seasonal flu)

In [24]:
from sklearn.metrics import roc_curve, roc_auc_score
roc_auc_score(y_h, y_prob)

0.6898149090823104

In [57]:
h1n1_preds=clf_h.predict_proba(X_test1)[:, 1]

## Building the ANN

### Initializing the ANN

In [25]:
ann_s = tf.keras.models.Sequential()

### Adding the input layer and the first hidden layer

In [26]:
ann_s.add(tf.keras.layers.Dense(64, activation='selu'))

### Adding the second and third hidden layer

In [27]:
ann_s.add(tf.keras.layers.Dense(units=32, activation='selu'))

In [28]:
ann_s.add(tf.keras.layers.Dense(units=16, activation='selu'))

### Adding the output layer

In [29]:
ann_s.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

## Training the ANN

### Compiling the ANN

In [30]:
ann_s.compile(optimizer = 'SGD', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [31]:
X = np.asarray(X).astype(np.float32)
X_test1 = np.asarray(X_test1).astype(np.float32)
y_h = np.asarray(y_h).astype(np.float32)
y_s = np.asarray(y_s).astype(np.float32)

### Training the ANN on the Training set

In [32]:
ann_s.fit(X, y_s, batch_size = 1024, epochs = 1000)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x7f297503ceb8>

## Part 4 - Making the predictions and evaluating the model

### Predicting the Test set results

In [33]:
y_prob = ann_s.predict(X)

In [34]:
y_prob_s = ann_s.predict(X_test1)

### Accuracy

In [35]:
from sklearn.metrics import roc_curve, roc_auc_score
roc_auc_score(y_s, y_prob)

0.8747295521915756

# Submission

In [36]:
y_prob_h

array([0, 0, 0, ..., 0, 0, 1])

In [37]:
y_prob_s

array([[0.15799427],
       [0.03544047],
       [0.72618365],
       ...,
       [0.17482886],
       [0.39689863],
       [0.47483295]], dtype=float32)

In [38]:
submission_dataset = pd.read_csv('submission_format.csv', index_col="respondent_id")
submission_dataset.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.5,0.7
26708,0.5,0.7
26709,0.5,0.7
26710,0.5,0.7
26711,0.5,0.7


In [52]:
np.testing.assert_array_equal(test_dataset.index.values, submission_dataset.index.values)
submission_dataset["h1n1_vaccine"] = h1n1_preds
submission_dataset["seasonal_vaccine"] = y_prob_s
submission_dataset.head(50)

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.056879,0.157994
26708,0.048547,0.03544
26709,0.417309,0.726184
26710,0.477829,0.890142
26711,0.152634,0.281989
26712,0.462331,0.949748
26713,0.325217,0.509352
26714,0.150011,0.12878
26715,0.041929,0.067229
26716,0.199894,0.888604


In [53]:
submission_dataset.to_csv('team_submission.csv', index=True)

In [54]:
!head team_submission.csv

respondent_id,h1n1_vaccine,seasonal_vaccine
26707,0.056878818007652356,0.15799427
26708,0.04854719342500879,0.035440475
26709,0.41730860273339954,0.72618365
26710,0.4778291195212865,0.89014196
26711,0.15263383623995636,0.2819885
26712,0.46233073878361064,0.94974804
26713,0.32521741672175886,0.50935245
26714,0.15001120414746144,0.12878042
26715,0.041928763361663804,0.067228645


In [55]:
from google.colab import files
files.download('team_submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>