# Li's Code

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from scipy import stats
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn import *
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import time
import sys

In [7]:
# Before you run the program, please make sure that the current working file directory is the file directory I submitted. 
# Please modify the path in the new_directory to change the current working directory

import os
new_directory = 'D:\诺丁汉大学 学习资料\诺丁汉大学  计算机科学 学习资料\Data science and machine learning\CW2\Coursework2'
os.chdir(new_directory)

current_directory = os.getcwd()
print(current_directory)

D:\诺丁汉大学 学习资料\诺丁汉大学  计算机科学 学习资料\Data science and machine learning\CW2\Coursework2


In [8]:
df = pd.read_csv(r"Raw Data/training_set_features.csv")
df1 = pd.read_csv(r"Raw Data/training_set_labels.csv")
df2 = pd.read_csv(r"Raw Data/test_set_features.csv")

# Training set preprocessing

In [3]:
def preprocessing(dataset):
    #Replace all null values with nan
    dataset = dataset.fillna(value = np.NaN)   
    
    # Replace the empty values from h1n1_concern to opinion_seas_sick_from_vacc, education, income_poverty with the mode
    header1 = ['h1n1_concern', 'h1n1_knowledge',
           'behavioral_antiviral_meds', 'behavioral_avoidance',
           'behavioral_face_mask', 'behavioral_wash_hands',
           'behavioral_large_gatherings', 'behavioral_outside_home',
           'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
           'chronic_med_condition', 'child_under_6_months', 'health_worker',
           'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
           'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
           'opinion_seas_risk', 'opinion_seas_sick_from_vacc',"education","income_poverty","marital_status","rent_or_own","employment_status",\
             "household_adults","household_children"]
    for n in header1:
        n_mode = dataset[n].mode()[0]
        dataset[n].replace(np.nan,n_mode,inplace = True) 
        
    #There are too many missing values, delete these columns
    header2 = ["health_insurance","employment_industry","employment_occupation"]
    for h in header2:
        dataset.drop(h,axis = 1, inplace = True)  
        
    # Convert the data type of respondent_id to 'opinion_seas_sick_from_vacc' to int
    header3 = ['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
           'behavioral_antiviral_meds', 'behavioral_avoidance',
           'behavioral_face_mask', 'behavioral_wash_hands',
           'behavioral_large_gatherings', 'behavioral_outside_home',
           'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
           'chronic_med_condition', 'child_under_6_months', 'health_worker',
           'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
           'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
           'opinion_seas_risk', 'opinion_seas_sick_from_vacc']

    for h in header3:
        dataset[h] = dataset[h].astype('int')
        
    # Label encoding for age_group education income_poverty
    label_encoder = LabelEncoder()
    dataset['age_group_encoded'] = label_encoder.fit_transform(dataset['age_group'])
    dataset['income_poverty_encoded'] = label_encoder.fit_transform(dataset['income_poverty'])

    education_value = ['< 12 Years','12 Years','Some College','College Graduate']
    m = 0
    for v in education_value:
        dataset['education'].replace(v,m,inplace = True)
        m += 1

    # One hot encoding for 'race', 'sex', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'census_msa'
    header4 = ['race','sex','marital_status','rent_or_own','employment_status','hhs_geo_region','census_msa']
    m = 0
    for h in header4:
        header4[m] = pd.get_dummies(dataset[h])
        dataset.drop(h,axis = 1,inplace = True)
        dataset = pd.concat([dataset, header4[m]], axis=1)
        m+=1

    header_should_be_delete = ['age_group','income_poverty']
    for h in header_should_be_delete:
        dataset = dataset.drop(h,axis = 1)

        
    return dataset
    
    
df = preprocessing(df)  
    
    
    
    

In [4]:
# Combine training features and labels
df1.drop('respondent_id',axis = 1,inplace = True)
df = pd.concat([df,df1],axis = 1)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int32  
 1   h1n1_concern                 26707 non-null  int32  
 2   h1n1_knowledge               26707 non-null  int32  
 3   behavioral_antiviral_meds    26707 non-null  int32  
 4   behavioral_avoidance         26707 non-null  int32  
 5   behavioral_face_mask         26707 non-null  int32  
 6   behavioral_wash_hands        26707 non-null  int32  
 7   behavioral_large_gatherings  26707 non-null  int32  
 8   behavioral_outside_home      26707 non-null  int32  
 9   behavioral_touch_face        26707 non-null  int32  
 10  doctor_recc_h1n1             26707 non-null  int32  
 11  doctor_recc_seasonal         26707 non-null  int32  
 12  chronic_med_condition        26707 non-null  int32  
 13  child_under_6_mo

# Feature Selection

In [5]:
# Correlation analysis, select variables with correlation greater than 0.1 and less than -0.1 to train the model
h1n1_vaccine_model_training_value = []
for h in df.columns.tolist():
    if df[[h,'h1n1_vaccine']].corr().iloc[0,1]>=0.1 or df[[h,'h1n1_vaccine']].corr().iloc[0,1] <=-0.1:
        h1n1_vaccine_model_training_value.append(h)
h1n1_vaccine_model_training_value.remove('h1n1_vaccine')
h1n1_vaccine_model_training_value.remove('seasonal_vaccine')
print('h1n1_vaccine_model_training_value: ')
print(h1n1_vaccine_model_training_value)
print('\n')
    
seasonal_vaccine_model_training_value = []
for h in df.columns.tolist():
    if df[[h,'seasonal_vaccine']].corr().iloc[0,1]>=0.1 or df[[h,'seasonal_vaccine']].corr().iloc[0,1] <=-0.1:
        seasonal_vaccine_model_training_value.append(h)
seasonal_vaccine_model_training_value.remove('h1n1_vaccine')
seasonal_vaccine_model_training_value.remove('seasonal_vaccine')
print('seasonal_vaccine_model_training_value:')
print(seasonal_vaccine_model_training_value)
print('\n')
    

h1n1_vaccine_model_training_value: 
['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'health_worker', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_seas_vacc_effective', 'opinion_seas_risk']


seasonal_vaccine_model_training_value:
['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'health_worker', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'household_children', 'age_group_encoded', 'White', 'Own', 'Rent', 'Employed', 'Not in Labor Force']




In [6]:
# Divide df into training set and test set
from sklearn.model_selection import train_test_split

h1n1_vaccine_train = df[h1n1_vaccine_model_training_value]
seasonal_vaccine_train = df[seasonal_vaccine_model_training_value]
y = df.iloc[:,-2:]

# Randomly select 30% of the row data as the test set, and the rest as the training set
h1n1_vaccine_x_train,h1n1_vaccine_x_test,h1n1_vaccine_y_train,h1n1_vaccine_y_test = train_test_split(h1n1_vaccine_train,y.iloc[:,0],test_size = 0.3, random_state = 42)

seasonal_vaccine_x_train,seasonal_vaccine_x_test,seasonal_vaccine_y_train,seasonal_vaccine_y_test = train_test_split(seasonal_vaccine_train,y.iloc[:,1],test_size = 0.3, random_state = 42)




# Logistic Regression Model

H1N1

In [8]:
# H1N1 Predict
h1n1_vaccine_log = LogisticRegression(C=0.1)

# define parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], 
    'penalty': ['l1', 'l2'], 
    'solver': ['liblinear', 'saga']  
}

#Grid Search Optimization Parameters
h1n1_vaccine_log = GridSearchCV(h1n1_vaccine_log, param_grid, cv=5)

#Cross-validation to evaluate model performance
scores = cross_val_score(h1n1_vaccine_log,h1n1_vaccine_x_test,h1n1_vaccine_y_test,cv = 5,scoring = 'accuracy')
for fold, score in enumerate(scores):
    print(f"Fold {fold +1}: accuracy = {score}")
mean_score = scores.mean()
print("Mean scores: ",mean_score)

start_time = time.time()

#Training model
h1n1_vaccine_log.fit(h1n1_vaccine_x_train,h1n1_vaccine_y_train)
h1n1_vaccine_log_predict = h1n1_vaccine_log.predict(h1n1_vaccine_x_test)

end_time = time.time()
print('The R-square is: ',h1n1_vaccine_log.score(h1n1_vaccine_x_test,h1n1_vaccine_y_test))
print("H1n1 Best Scores: " , h1n1_vaccine_log.best_score_)
print("程序运行时间为：",end_time - start_time, "秒")

Fold 1: accuracy = 0.851528384279476
Fold 2: accuracy = 0.8228321896444167
Fold 3: accuracy = 0.8378041172800998
Fold 4: accuracy = 0.83832709113608
Fold 5: accuracy = 0.8208489388264669
Mean scores:  0.834268144233308
The R-square is:  0.8353924872082865
H1n1 Best Scores:  0.8313365218552269
程序运行时间为： 7.988971710205078 秒


Seasonal

In [7]:
warnings.simplefilter("ignore")
seasonal_vaccine_log = LogisticRegression()

# define parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], 
    'penalty': ['l1', 'l2'], 
    'solver': ['liblinear', 'saga']  
}

#Grid Search Optimization Parameters
seasonal_vaccine_log = GridSearchCV(seasonal_vaccine_log, param_grid, cv=5)

#Cross-validation to evaluate model performance
scores = cross_val_score(seasonal_vaccine_log,seasonal_vaccine_x_test,seasonal_vaccine_y_test,cv = 5,scoring = 'accuracy')
for fold, score in enumerate(scores):
    print(f"Fold {fold +1}: accuracy = {score}")
mean_score = scores.mean()
print("Mean scores: ",mean_score)
print("\n")

start_time = time.time()

#Training model
seasonal_vaccine_log.fit(seasonal_vaccine_x_train,seasonal_vaccine_y_train)
seasonal_vaccine_log_predict = seasonal_vaccine_log.predict(seasonal_vaccine_x_test)

end_time = time.time()

print('The R-square is: ',seasonal_vaccine_log.score(seasonal_vaccine_x_test,seasonal_vaccine_y_test))
print("Seasonal Best Scores: " , seasonal_vaccine_log.best_score_)
print("程序运行时间为：",end_time - start_time, "秒")


Fold 1: accuracy = 0.7891453524641298
Fold 2: accuracy = 0.784154709918902
Fold 3: accuracy = 0.7542108546475359
Fold 4: accuracy = 0.7752808988764045
Fold 5: accuracy = 0.7702871410736579
Mean scores:  0.774615791396126


The R-square is:  0.7786097591413952
Seasonal Best Scores:  0.7654331142351433
程序运行时间为： 61.06106233596802 秒


# K-Nearest Neighbour

H1N1

In [29]:
warnings.simplefilter("ignore")

h1n1_vaccine_knn = KNeighborsClassifier()

# define parameter grid
param_grid = {
    'n_neighbors': range(1, 21),  
    'weights': ['uniform', 'distance'],  
    'p': [1, 2]  
}

#Randomized Search Optimization Parameters
h1n1_vaccine_knn = RandomizedSearchCV(h1n1_vaccine_knn, param_grid, n_iter=10, cv=5)

#Cross-validation to evaluate model performance
scores = cross_val_score(h1n1_vaccine_knn,h1n1_vaccine_x_test,h1n1_vaccine_y_test,cv = 5,scoring = 'accuracy')
for fold, score in enumerate(scores):
    print(f"Fold {fold +1}: accuracy = {score}")
mean_score = scores.mean()
print("Mean scores: ",mean_score)

start_time = time.time()


#Model Training
h1n1_vaccine_knn.fit(h1n1_vaccine_x_train,h1n1_vaccine_y_train)
h1n1_vaccine_knn_pred = h1n1_vaccine_knn.predict(h1n1_vaccine_x_test)

end_time = time.time()

h1n1_vaccine_knn_acc = accuracy_score(h1n1_vaccine_y_test, h1n1_vaccine_knn_pred)
print('H1n1 Accuracy:', h1n1_vaccine_knn_acc)
print("H1n1 Best Scores: " , h1n1_vaccine_knn.best_score_)
print("程序运行时间为：",end_time - start_time, "秒")

Fold 1: accuracy = 0.8459139114160948
Fold 2: accuracy = 0.825951341235184
Fold 3: accuracy = 0.8265751715533375
Fold 4: accuracy = 0.83458177278402
Fold 5: accuracy = 0.8139825218476904
Mean scores:  0.8294009437672653
H1n1 Accuracy: 0.8262822912766754
H1n1 Best Scores:  0.8270032688001802
程序运行时间为： 19.775394201278687 秒


Seasonal

In [31]:
seasonal_vaccine_knn = KNeighborsClassifier()

# define parameter grid
param_grid = {
    'n_neighbors': range(1, 21),  
    'weights': ['uniform', 'distance'],  
    'p': [1, 2]  
}

#Randomized Search Optimization Parameters
seasonal_vaccine_knn = RandomizedSearchCV(seasonal_vaccine_knn, param_grid, n_iter=10, cv=5)

#Cross-validation to evaluate model performance
scores = cross_val_score(seasonal_vaccine_knn,seasonal_vaccine_x_test,seasonal_vaccine_y_test,cv = 5,scoring = 'accuracy')
for fold, score in enumerate(scores):
    print(f"Fold {fold +1}: accuracy = {score}")
mean_score = scores.mean()
print("Mean scores: ",mean_score)

start_time = time.time()

#Model Training
seasonal_vaccine_knn.fit(seasonal_vaccine_x_train,seasonal_vaccine_y_train)
seasonal_vaccine_knn_predict = seasonal_vaccine_knn.predict(seasonal_vaccine_x_test)

end_time = time.time()

seasonal_vaccine_knn_acc = accuracy_score(seasonal_vaccine_y_test, seasonal_vaccine_knn_predict)
print('Seasonal Accuracy:', seasonal_vaccine_knn_acc)
print("Seasonal Best Scores: " , seasonal_vaccine_knn.best_score_)
print("程序运行时间为：",end_time - start_time, "秒")

Fold 1: accuracy = 0.7616968184653774
Fold 2: accuracy = 0.7610729881472239
Fold 3: accuracy = 0.7548346849656893
Fold 4: accuracy = 0.7609238451935081
Fold 5: accuracy = 0.7490636704119851
Mean scores:  0.7575184014367567
Seasonal Accuracy: 0.7505303881193062
Seasonal Best Scores:  0.7470315279018561
程序运行时间为： 88.02561664581299 秒


# Decision Tree Regression

H1N1

In [32]:
h1n1_vaccine_DecisionTree = DecisionTreeClassifier()

# define parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

#Grid Search Optimization Parameters
h1n1_vaccine_DecisionTree = GridSearchCV(h1n1_vaccine_DecisionTree, param_grid, cv=5)

#Cross-validation to evaluate model performance
scores = cross_val_score(h1n1_vaccine_DecisionTree,h1n1_vaccine_x_test,h1n1_vaccine_y_test,cv = 5,scoring = 'accuracy')
for fold, score in enumerate(scores):
    print(f"Fold {fold +1}: accuracy = {score}")
mean_score = scores.mean()
print("Mean scores: ",mean_score)

start_time = time.time()

# Model Training
h1n1_vaccine_DecisionTree.fit(h1n1_vaccine_x_train,h1n1_vaccine_y_train)
h1n1_vaccine_DecisionTree_pred = h1n1_vaccine_DecisionTree.predict(h1n1_vaccine_x_test)

end_time = time.time()

H1n1_DecisionTree_acc = accuracy_score(h1n1_vaccine_y_test, h1n1_vaccine_DecisionTree_pred)
print('H1n1 Accuracy:', H1n1_DecisionTree_acc)
print("H1n1 Best Scores: " , h1n1_vaccine_DecisionTree.best_score_)

print("程序运行时间为：",end_time - start_time, "秒")

Fold 1: accuracy = 0.8415470991890206
Fold 2: accuracy = 0.8234560199625702
Fold 3: accuracy = 0.8234560199625702
Fold 4: accuracy = 0.8220973782771536
Fold 5: accuracy = 0.8089887640449438
Mean scores:  0.8239090562872515
H1n1 Accuracy: 0.8317733682765506
H1n1 Best Scores:  0.8299457470466963
程序运行时间为： 2.6750638484954834 秒


Seasonal

In [45]:
seasonal_vaccine_DecisionTree = DecisionTreeClassifier()

#Grid Search Optimization Parameters
seasonal_vaccine_DecisionTree = GridSearchCV(seasonal_vaccine_DecisionTree, param_grid, cv=5)

#Cross-validation to evaluate model performance
scores = cross_val_score(seasonal_vaccine_DecisionTree,seasonal_vaccine_x_test,seasonal_vaccine_y_test,cv = 5,scoring = 'accuracy')
for fold, score in enumerate(scores):
    print(f"Fold {fold +1}: accuracy = {score}")
mean_score = scores.mean()
print("Mean scores: ",mean_score)

start_time = time.time()

#Model Training
seasonal_vaccine_DecisionTree.fit(seasonal_vaccine_x_train,seasonal_vaccine_y_train)
seasonal_vaccine_DecisionTree_pred = seasonal_vaccine_DecisionTree.predict(seasonal_vaccine_x_test)

end_time = time.time()

Seasonal_DecisionTree_acc = accuracy_score(seasonal_vaccine_y_test, seasonal_vaccine_DecisionTree_pred)
print('Seasonal Accuracy:', Seasonal_DecisionTree_acc)
print("Seasonal Best Scores: " , seasonal_vaccine_DecisionTree.best_score_)

print("程序运行时间为：",end_time - start_time, "秒")

Fold 1: accuracy = 0.7804117280099813
Fold 2: accuracy = 0.7698066126013724
Fold 3: accuracy = 0.7560823456019963
Fold 4: accuracy = 0.7721598002496879
Fold 5: accuracy = 0.7553058676654182
Mean scores:  0.7667532708256911
Seasonal Accuracy: 0.7647572694371646
Seasonal Best Scores:  0.7604043449871363
程序运行时间为： 4.94782280921936 秒


# Get the Final Forecast

In [101]:
df2 = preprocessing(df2)

In [109]:
headers5 = ['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'health_worker', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_seas_vacc_effective', 'opinion_seas_risk']
headers6 = ['h1n1_concern', 'h1n1_knowledge', 'behavioral_wash_hands', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'health_worker', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'household_children', 'age_group_encoded', 'White', 'Own', 'Rent', 'Employed', 'Not in Labor Force']
df2_h1n1_input = df2[headers5]
df2_seasonal_input = df2[headers6]

In [119]:
#Logistic Regression Model Forecasting
h1h1_log_proba = h1n1_vaccine_log.predict_proba(df2_h1n1_input)
seasonal_log_proba = seasonal_vaccine_log.predict_proba(df2_seasonal_input)

h1h1_log_Forecast = pd.DataFrame(h1h1_log_proba[:,1])
seasonal_log_Forecast = pd.DataFrame(seasonal_log_proba[:,1])
log_Forecast = pd.concat([h1h1_log_Forecast,seasonal_log_Forecast],axis =1)

headers = ['h1n1_vaccine','seasonal_vaccine']
log_Forecast.columns = headers

# Add ID column
start_id = 26707
end_id = 53414
log_Forecast.insert(0, 'ID', range(start_id, end_id + 1))

log_Forecast.to_csv('D:\诺丁汉大学 学习资料\诺丁汉大学  计算机科学 学习资料\Data science and machine learning\CW2\Result\logistic.csv', index=False)

In [120]:
# K-Nearest Neighbour Regression Model Forecasting
h1h1_knn_proba = h1n1_vaccine_knn.predict_proba(df2_h1n1_input)
seasonal_knn_proba = seasonal_vaccine_knn.predict_proba(df2_seasonal_input)

h1h1_knn_Forecast = pd.DataFrame(h1h1_knn_proba[:,1])
seasonal_knn_Forecast = pd.DataFrame(seasonal_knn_proba[:,1])
knn_Forecast = pd.concat([h1h1_knn_Forecast,seasonal_knn_Forecast],axis =1)

headers = ['h1n1_vaccine','seasonal_vaccine']
knn_Forecast.columns = headers

# Add ID column
start_id = 26707
end_id = 53414
knn_Forecast.insert(0, 'ID', range(start_id, end_id + 1))

knn_Forecast.to_csv('D:\诺丁汉大学 学习资料\诺丁汉大学  计算机科学 学习资料\Data science and machine learning\CW2\Result\K-Nearest Neighbour Regression.csv', index=False)

In [121]:
# K-Nearest Neighbour Regression Model Forecasting
h1h1_DecisionTree_proba = h1n1_vaccine_DecisionTree.predict_proba(df2_h1n1_input)
seasonal_DecisionTree_proba = seasonal_vaccine_DecisionTree.predict_proba(df2_seasonal_input)

h1h1_DecisionTree_Forecast = pd.DataFrame(h1h1_DecisionTree_proba[:,1])
seasonal_DecisionTree_Forecast = pd.DataFrame(seasonal_DecisionTree_proba[:,1])
DecisionTree_Forecast = pd.concat([h1h1_DecisionTree_Forecast,seasonal_DecisionTree_Forecast],axis =1)

headers = ['h1n1_vaccine','seasonal_vaccine']
DecisionTree_Forecast.columns = headers

# Add ID column
start_id = 26707
end_id = 53414
DecisionTree_Forecast.insert(0, 'ID', range(start_id, end_id + 1))

DecisionTree_Forecast.to_csv('D:\诺丁汉大学 学习资料\诺丁汉大学  计算机科学 学习资料\Data science and machine learning\CW2\Result\Decision Tree Regression.csv', index=False)

# Ye's Code

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
#Build and train the svm model
def buildSVM(X,y,kernel='linear',C=0.1,gamma=0.1):
    model = SVC(kernel=kernel,C=C,gamma=gamma,probability=True)
    model.fit(X, y)
    return model

#Build and train the ann model
def buildANN(X,y,activation='relu',hidden_layer_sizes=(70,20),max_iter=2000, solver= 'adam'):
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, max_iter=max_iter, random_state=42)
    model.fit(X, y)
    return model

# data preprocessing

In [None]:
#data preprocessing
#1.read data
df_train_feature = pd.read_csv(r"Raw Data/training_set_features.csv")
df_train_label = pd.read_csv(r"Raw Data/training_set_labels.csv")
df_test_feature = pd.read_csv(r"Raw Data/test_set_features.csv")
 

#count missing value 
missing_counts = df_train_feature.isna().sum()
#print("information of missing counts:\n",missing_counts)

#distinguish dublicated value
train_duplicated_count = df_train_feature.duplicated().sum()
#print("dublicated number in train dataset:",train_duplicated_count)

test_duplicated_count = df_test_feature.duplicated().sum()
#print("dublicated number in test dataset:",test_duplicated_count)

#find distinct value
unique_counts = df_train_feature.nunique()
#print("train number of distinct value:\n",unique_counts) 

unique_counts = df_test_feature.nunique()
#print("test number of distinct value:\n",unique_counts) 

# 2.Data Cleaning ,Filter out columns with more than 30% missing values
cols_with_few_missing_values = missing_counts[missing_counts >df_train_feature.shape[0]*0.3].index
print("columns with more than 30% missing values are:",list(cols_with_few_missing_values))
#delete these columns
df_train_feature= df_train_feature.drop(columns=['health_insurance', 'employment_industry', 'employment_occupation'],axis=1)
df_test_feature= df_test_feature.drop(columns=['health_insurance', 'employment_industry', 'employment_occupation'],axis=1)
#Columns with really little data use data completion and mode filling
mode = df_train_feature.mode().iloc[0]
df_train_feature.fillna(mode, inplace=True)
df_test_feature.fillna(mode, inplace=True)

# #3.Data Transformation
df_train_feature = pd.get_dummies(df_train_feature, columns=['age_group','education','race','sex','income_poverty',
                                                 'marital_status','rent_or_own','employment_status',
                                                 'hhs_geo_region','census_msa'])
df_test_feature = pd.get_dummies(df_test_feature, columns=['age_group','education','race','sex','income_poverty',
                                                 'marital_status','rent_or_own','employment_status',
                                                 'hhs_geo_region','census_msa'])

# 1. svm

# 1.1 h1n1 vaccine

# 1.1.1 Feature Selection

In [None]:
#4.h1n1 Data Reduction
all_name = df_train_feature.columns.values.tolist() 

#select 20 features
model_h1n1 = SelectKBest(chi2, k=20)
df_train_feature_h1n1 = model_h1n1.fit_transform(df_train_feature, df_train_label['h1n1_vaccine'])
select_name_index_h1n1 = model_h1n1.get_support(indices=True)  
select_name_h1n1 = []
for i in select_name_index_h1n1:
    select_name_h1n1.append(all_name[i])

# confirm label and feature    
df_test_feature_h1n1 = df_test_feature[select_name_h1n1]
X_h1n1_vaccine = df_train_feature_h1n1
y_h1n1_vaccine = df_train_label['h1n1_vaccine'].values

# 1.1.2 Hyperparameter Tuning

In [None]:
#Hyperparameter tuning, using grid search
#very slow
from sklearn.model_selection import GridSearchCV
svm = SVC()
param_grid = {'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(svm, param_grid=param_grid, cv=2, n_jobs=8)
grid_search.fit(X_h1n1_vaccine[0:1000], y_h1n1_vaccine[0:1000])
print('Best Parameters of SVM model in h1n1 vaccine:', grid_search.best_params_)
print('Best Score of SVM model in h1n1 vaccine:', grid_search.best_score_)

# 1.1.3 Kfold

In [None]:
kf = KFold(n_splits=5)
train_predict_score = []
test_predict_score = []

for train_index, test_index in kf.split(X_h1n1_vaccine):
    X_train, X_test = X_h1n1_vaccine[train_index], X_h1n1_vaccine[test_index]
    y_train, y_test = y_h1n1_vaccine[train_index], y_h1n1_vaccine[test_index]
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    model = buildSVM(X_train,y_train,kernel='linear',C=0.1,gamma=0.1)
    y_train_predict = model.predict(X_train)
    y_test_predict = model.predict(X_test)
    #y_test_predict_proba =  mlp.predict_proba(X_test)
    train_predict_score.append(model.score(X_train, y_train))
    test_predict_score.append(model.score(X_test,y_test))

#accuracy of h1n1 vaccine prediction
print("accuracy of h1n1 vaccine in train dataset is {:.2f}%".format(np.mean(train_predict_score)* 100))
print("accuracy of h1n1 vaccine in test dataset is {:.2f}%".format(np.mean(test_predict_score)* 100))

# 1.2 Seasonal Vaccine

# 1.2.1 Feature Selection

In [None]:
#4.Data Reduction
all_name = df_train_feature.columns.values.tolist()  
#select 20 features
model_seasonal = SelectKBest(chi2, k=20)
df_train_feature_seasonal = model_seasonal.fit_transform(df_train_feature, df_train_label['seasonal_vaccine'])
select_name_index_seasonal = model_seasonal.get_support(indices=True)  
select_name_seasonal = []
for i in select_name_index_seasonal:
    select_name_seasonal.append(all_name[i])
    
df_test_feature_seasonal = df_test_feature[select_name_seasonal]
X_seasonal_vaccine = df_train_feature_seasonal
y_seasonal_vaccine = df_train_label['seasonal_vaccine'].values

# 1.2.2 Hyperparameter Tuning

In [None]:
#Hyperparameter tuning, using grid search
from sklearn.model_selection import GridSearchCV
svm = SVC()
param_grid = {'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(svm, param_grid=param_grid, cv=2)
grid_search.fit(X_seasonal_vaccine[0:1000], y_seasonal_vaccine[0:1000])
print('Best Parameters of SVM model in seasonal vaccine:', grid_search.best_params_)
print('Best Score of SVM model in seasonal vaccine:', grid_search.best_score_)

# 1.2.3 Kfold

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
train_predict_score = []
test_predict_score = []
#Kfold
for train_index, test_index in kf.split(X_seasonal_vaccine):
    X_train, X_test = X_seasonal_vaccine[train_index], X_seasonal_vaccine[test_index]
    y_train, y_test = y_seasonal_vaccine[train_index], y_seasonal_vaccine[test_index]
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    #modelling seasonal vacacine
    model = buildSVM(X_train,y_train,kernel='linear',C=0.1,gamma=0.1)
    y_train_predict = model.predict(X_train)
    y_test_predict = model.predict(X_test)
    train_predict_score.append(model.score(X_train, y_train))
    test_predict_score.append(model.score(X_test,y_test))
    
#accuracy of seasonal prediction
print("accuracy of seasonal vaccine in train dataset is {:.2f}%".format(np.mean(train_predict_score)* 100))
print("accuracy of seasonal vaccine in test dataset is {:.2f}%".format(np.mean(test_predict_score)* 100))

# 1.3 Predict Results

In [None]:
model_h1n1 = buildSVM(X_h1n1_vaccine[0:1000], y_h1n1_vaccine[0:1000],kernel='linear',C=0.1,gamma=0.1)
y_h1n1_predict = model_h1n1.predict(df_test_feature_h1n1.values)
y_h1n1_predict_proba =  model_h1n1.predict_proba(df_test_feature_h1n1.values)


In [None]:
model_season = buildSVM(X_h1n1_vaccine[0:1000], y_h1n1_vaccine[0:1000],kernel='linear',C=0.1,gamma=0.1)
y_seasonal_predict = model_season.predict(df_test_feature_seasonal.values)
y_seasonal_predict_proba =  model_season.predict_proba(df_test_feature_seasonal.values)


In [None]:
#export
data = {'respondent_id': df_test_feature['respondent_id'],
        'h1n1_vaccine': y_h1n1_predict_proba[:,1],
        'seasonal_vaccine': y_seasonal_predict_proba[:,1]}
df = pd.DataFrame(data)
df.to_csv('output_svm.csv', index=False, float_format='%.1f', header=True)
print("export sucessfully！")

# 2.ANN

## 2.1 prediction of h1n1 vaccine

### 2.1.1 feature selection

In [None]:
#ann
all_name = df_train_feature.columns.values.tolist()  

#select 20 features
model_h1n1 = SelectKBest(chi2, k=20)
df_train_feature_h1n1 = model_h1n1.fit_transform(df_train_feature, df_train_label['h1n1_vaccine'])
select_name_index_h1n1 = model_h1n1.get_support(indices=True)  
select_name_h1n1 = []
for i in select_name_index_h1n1:
    select_name_h1n1.append(all_name[i])

# confirm label and feature    
df_test_feature_h1n1 = df_test_feature[select_name_h1n1]
X_h1n1_vaccine = df_train_feature_h1n1
y_h1n1_vaccine = df_train_label['h1n1_vaccine'].values

### 2.1.2 Hyperparameter tuning

In [None]:

from sklearn.model_selection import GridSearchCV
mlp = MLPClassifier()
param_grid = {
    'hidden_layer_sizes': [(50,20),(60,20),(70,20)],
    'activation': ['relu'],
    'solver': ['adam'],
    'max_iter': [2000]
}
grid_search = GridSearchCV(mlp, param_grid=param_grid, cv=5)
grid_search.fit(X_h1n1_vaccine[0:1000], y_h1n1_vaccine[0:1000])
print('Best Parameters of ANN model in h1n1 vaccine:', grid_search.best_params_)
print('Best Score of ANN model in h1n1 vaccine:', grid_search.best_score_)

### 2.1.3Kfold


In [None]:
kf = KFold(n_splits=5)
train_predict_score = []
test_predict_score = []

for train_index, test_index in kf.split(X_h1n1_vaccine):
    X_train, X_test = X_h1n1_vaccine[train_index], X_h1n1_vaccine[test_index]
    y_train, y_test = y_h1n1_vaccine[train_index], y_h1n1_vaccine[test_index]
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    model = buildANN(X_train,y_train,activation='relu',hidden_layer_sizes=(50,20),max_iter=2000, solver= 'adam')
    y_train_predict = model.predict(X_train)
    y_test_predict = model.predict(X_test)
    train_predict_score.append(model.score(X_train, y_train))
    test_predict_score.append(model.score(X_test,y_test))

#accuracy of h1n1 vaccine prediction
print("accuracy of h1n1 vaccine in train dataset is {:.2f}%".format(np.mean(train_predict_score)* 100))
print("accuracy of h1n1 vaccine in test dataset is {:.2f}%".format(np.mean(test_predict_score)* 100))

## 2.2 prediction of seasonal vaccine

### 2.2.1 feature selection

In [None]:
#ann
#4.seasonal vaccine Data Reduction
all_name = df_train_feature.columns.values.tolist()  
#select 20 feature
model_seasonal = SelectKBest(chi2, k=20)
df_train_feature_seasonal = model_seasonal.fit_transform(df_train_feature, df_train_label['seasonal_vaccine'])
select_name_index_seasonal = model_seasonal.get_support(indices=True) 
select_name_seasonal = []
for i in select_name_index_seasonal:
    select_name_seasonal.append(all_name[i])
    
df_test_feature_seasonal = df_test_feature[select_name_seasonal]
X_seasonal_vaccine = df_train_feature_seasonal
y_seasonal_vaccine = df_train_label['seasonal_vaccine'].values

### 2.2.2 Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
mlp = MLPClassifier()
param_grid = {
    'hidden_layer_sizes': [(50,20),(60,20),(70,20)],
    'activation': ['relu'],
    'solver': ['adam'],
    'max_iter': [2000]
}
grid_search = GridSearchCV(mlp, param_grid=param_grid, cv=5)
grid_search.fit(X_seasonal_vaccine[0:1000], y_seasonal_vaccine[0:1000])
print('Best Parameters of ANN model in seasonal vaccine:', grid_search.best_params_)
print('Best Score of ANN model in seasonal vaccine:', grid_search.best_score_)

### 2.2.3 Kfold

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
train_predict_score = []
test_predict_score = []
#Kfold
for train_index, test_index in kf.split(X_seasonal_vaccine):
    X_train, X_test = X_seasonal_vaccine[train_index], X_seasonal_vaccine[test_index]
    y_train, y_test = y_seasonal_vaccine[train_index], y_seasonal_vaccine[test_index]
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    #modelling seasonal vacacine
    model = buildANN(X_train,y_train,activation='relu',hidden_layer_sizes=(70,20),max_iter=2000, solver= 'adam')
    y_train_predict = model.predict(X_train)
    y_test_predict = model.predict(X_test)
    
    train_predict_score.append(model.score(X_train, y_train))
    test_predict_score.append(model.score(X_test,y_test))
    #print(y_test_predict_proba)
    
#accuracy of seasonal prediction
print("accuracy of seasonal vaccine in train dataset is {:.2f}%".format(np.mean(train_predict_score)* 100))
print("accuracy of seasonal vaccine in test dataset is {:.2f}%".format(np.mean(test_predict_score)* 100))

## 2.3 predict results

In [None]:
#predict results
model_h1n1 = buildANN(X_h1n1_vaccine,y_h1n1_vaccine,activation='relu',hidden_layer_sizes=(60,20),max_iter=2000, solver= 'adam')
y_h1n1_predict = model_h1n1.predict(df_test_feature_h1n1.values)
y_h1n1_predict_proba =  model_h1n1.predict_proba(df_test_feature_h1n1.values)


In [None]:
model_season = buildANN(X_seasonal_vaccine,y_seasonal_vaccine,activation='relu',hidden_layer_sizes=(60,20),max_iter=2000, solver= 'adam')
y_seasonal_predict = model_season.predict(df_test_feature_seasonal.values)
y_seasonal_predict_proba =  model_season.predict_proba(df_test_feature_seasonal.values)


In [None]:
#export
data = {'respondent_id': df_test_feature['respondent_id'],
        'h1n1_vaccine': y_h1n1_predict_proba[:,1],
        'seasonal_vaccine': y_seasonal_predict_proba[:,1]}
df = pd.DataFrame(data)
df.to_csv('output_ann.csv', index=False, float_format='%.1f', header=True)
print("export sucessfully！")