In [4]:
#import sys
!pip install scikit-multilearn



In [8]:
# Pre-processing
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
#from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC

# file names
file_train = "training_set_features.csv"
file_labels = "training_set_labels.csv"
file_test = "test_set_features.csv"

# datasets
#data_labels = pd.read_csv(file_labels).set_index('respondent_id')
data_train = pd.read_csv(file_train).set_index('respondent_id')
#data_test = pd.read_csv(file_test).set_index('respondent_id')

#print("Number of rows in data:",data_train.shape[0])
#print("Number of features in data:",data_train.shape[1])
#print("\n")
#print("**Sample Data**")
data_train.head(n=5)

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [None]:
## Exploring the number of times each label appears in the data

categories = list(data_labels.columns[0:].values)
sns.set(font_scale = 2)
plt.figure(figsize=(15,8))

ax= sns.barplot(categories, data_labels.iloc[:,0:].sum().values)
plt.title("H1N1 Vaccine and Seasonal Vaccine", fontsize=24)
plt.ylabel('Number in each category', fontsize=18)
plt.xlabel('Vaccine Type ', fontsize=18)

#adding the text labels
rects = ax.patches
labels = data_labels.iloc[:,1:].sum().values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)
plt.show()

In [9]:
## Creating new column for variables that have more than 400 missing values
numer_of_null = data_train.isnull().sum()
names = list(numer_of_null.index)
to_take_out = [] # it's name to_take_out, but I actually don't take out any variables

## getting the names of all columns with more than 400 null values
for element in names:
    if numer_of_null[element] >=400:
        to_take_out.append(element)
        #print("Variable name:",element)                 # useful just to get a sense of which variables
        #print("Number of NA's",numer_of_null[element])
        
# for these values, create new column showing that it is null        
for element in to_take_out:
    name_column = 'is_null_'+ element
    data_train[name_column] = np.where(data_train[element].isnull(), 1, 0)   
    #data_test[name_column] = np.where(data_test[element].isnull(), 1, 0)

In [11]:
## setting up the imputer
imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

## testing the imputer with training dataset
var_names = list(data_train)
imp.fit(data_train[var_names])
data_train[var_names] = imp.transform(data_train[var_names])
data_train = pd.DataFrame(data_train)


## testing the imputer with test dataset
#var_names = list(data_test)
#imp.fit(data_test[var_names])
#data_test[var_names]= imp.transform(data_test[var_names])
#data_test = pd.DataFrame(data_test)

## checking with variables have null values
#data_train.isnull().sum()
#data_test.isnull().sum()

In [12]:
## on this chunk need to conver categorical variables into one-hot vectors

cat_variables = ['age_group','education','race','income_poverty','marital_status',
                'rent_or_own','employment_status','hhs_geo_region','census_msa',
                 'household_children','employment_industry','employment_occupation','sex']


data_train = pd.get_dummies(data_train, columns = cat_variables)
#data_test = pd.get_dummies(data_test, columns = cat_variables)

In [7]:
## Here I change the names before training a classifier
X_train = data_train
Y_train = data_labels[['h1n1_vaccine','seasonal_vaccine']]
X_test = data_test

In [9]:
# Initialize Binary Relevance multi-label classifier
# with an SVM classifier
# SVM in scikit only supports the X matrix in sparse representation

classifier = BinaryRelevance(
    classifier = SVC(probability=True),
    require_dense = [False, True]
)

classifier.fit(X_train, Y_train)

BinaryRelevance(classifier=SVC(C=1.0, break_ties=False, cache_size=200,
                               class_weight=None, coef0=0.0,
                               decision_function_shape='ovr', degree=3,
                               gamma='scale', kernel='rbf', max_iter=-1,
                               probability=False, random_state=None,
                               shrinking=True, tol=0.001, verbose=False),
                require_dense=[False, True])

In [None]:
## This portion stores probabilities as a pandas dataframe and
## saves it as a csv

#Getting probabilities from test set
probabilities = classifier.predict_proba(X_test)

#Adding the ids
probabilities_with_ids = pd.DataFrame(data = probabilities.toarray()[0:,0:])
probabilities_with_ids.index = X_test.index #gets the ids from the test set
probabilities_with_ids = probabilities_with_ids.rename(
    columns = {0:'h1n1_vaccine',1:'seasonal_vaccine'})

#Saving to csv
probabilities_with_ids.to_csv('probabilities.csv')

In [55]:
#clf = tree.DecisionTreeClassifier(min_samples_leaf=16)
#clf.fit(X_train, Y_train)
#Y_predicted = clf.predict(X_test)


# using Label Powerset
#from skmultilearn.problem_transform import LabelPowerset
# initialize label powerset multi-label classifier
#classifier = LabelPowerset(LogisticRegression(max_iter=2000))
# train
#classifier.fit(X_train, Y_train)
# predict
#predictions = classifier.predict(X_test)

In [8]:
X_test

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv,sex_Female,sex_Male
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,1,0
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,1
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,1
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,1,0
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26703,53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,1,0,1,0
26704,53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0,0,0,1,0,0,0,0,0,1
26705,53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,1,0
26706,53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,0,1,0


In [14]:
data_train.to_csv('training_data_onehot.csv')