# Old code (preprocessing w/ imputation of mean/mode values)

I wll fill missing numerical values with the mean value and missing categorical values with the most frequently occurring value. 

First, I will split the features into numerical and categorical features.

In [None]:
# selects only numerical columns
X_train_numerical = X_train.select_dtypes(exclude=object)

# selects only categorical columns
X_train_categorical = X_train.select_dtypes(include=object)

Now, I will use SimpleImputer to fill the missing numerical values with the mean of the column and missing categorical values with the most frequently occurring value in the column.

In [None]:
X_train_numerical.isna().sum()

### Imputing

In [None]:
from sklearn.impute import SimpleImputer

# instantiates SimpleImputer that will fill missing values with the column mean
numerical_imputer = SimpleImputer(strategy='mean')

# fits/transforms the SimpleImputer object with the numerical training data and formats as DataFrame
X_train_numerical = pd.DataFrame(numerical_imputer.fit_transform(X_train_numerical),
                                columns = X_train_numerical.columns,
                                index = X_train_numerical.index)

# instantiates SimpleImputer that will fill missing values with most frequent column value
categorical_imputer = SimpleImputer(strategy='most_frequent')

# fits/transforms the SimpleImputer object with the categorical training data and formats as a DataFrame
X_train_categorical = pd.DataFrame(categorical_imputer.fit_transform(X_train_categorical),
                                  columns = X_train_categorical.columns,
                                  index = X_train_categorical.index)

### One-Hot Encoding

Now, I will one-hot encode the categorical columns.

In [14]:
from sklearn.preprocessing import OneHotEncoder

# instantiates OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# fits and transforms OneHotEncoder object on the categorical training data
X_train_categorical_ohe = ohe.fit_transform(X_train_categorical)

# re-formats the array as a DataFrame (in order to concatenate with numerical training data)
X_train_categorical_ohe = pd.DataFrame(X_train_categorical_ohe, 
                                       columns=ohe.get_feature_names_out(X_train_categorical.columns),
                                       index=X_train_categorical.index)

X_train_categorical_ohe

Unnamed: 0_level_0,age_group_18 - 34 Years,age_group_35 - 44 Years,age_group_45 - 54 Years,age_group_55 - 64 Years,age_group_65+ Years,education_12 Years,education_< 12 Years,education_College Graduate,education_Some College,race_Black,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25194,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14006,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11285,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2900,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
19083,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5390,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
860,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15795,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Normalizing Numeric Values

Lastly, I will normalize the data in order to prevent variables with larger scales from having a disproportional impact.

In [15]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_numerical = pd.DataFrame(scaler.fit_transform(X_train_numerical),
                                index=X_train_numerical.index,
                                columns=X_train_numerical.columns)

### Concatenating Numerical and Categorical Data

Finally, I will concatenate the numerical and categorical training data into a single DataFrame.

In [16]:
X_train = pd.concat([X_train_numerical, X_train_categorical_ohe], axis=1)

X_train

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25194,0.333333,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14006,0.666667,0.5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11285,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2900,0.333333,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
19083,0.666667,0.5,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,0.666667,0.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5390,0.333333,0.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.221636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
860,0.666667,0.5,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15795,0.666667,0.5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Using Label Encoder (NOT FINISHED)

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# selects only numerical columns
X_train_numerical = X_train.select_dtypes(exclude=object)

# selects only categorical columns
X_train_categorical = X_train.select_dtypes(include=object)

from sklearn.impute import SimpleImputer

# instantiates SimpleImputer that will fill missing values with the column mean
numerical_imputer = SimpleImputer(strategy='mean')

# fits/transforms the SimpleImputer object with the numerical training data and formats as DataFrame
X_train_numerical = pd.DataFrame(numerical_imputer.fit_transform(X_train_numerical),
                                columns = X_train_numerical.columns,
                                index = X_train_numerical.index)

# instantiates SimpleImputer that will fill missing values with most frequent column value
categorical_imputer = SimpleImputer(strategy='most_frequent')

# fits/transforms the SimpleImputer object with the categorical training data and formats as a DataFrame
X_train_categorical = pd.DataFrame(categorical_imputer.fit_transform(X_train_categorical),
                                  columns = X_train_categorical.columns,
                                  index = X_train_categorical.index)

### Preprocessing Testing Data

In [None]:
# selects only numerical columns
X_test_numerical = X_test.select_dtypes(exclude=object)

# selects only categorical columns
X_test_categorical = X_test.select_dtypes(include=object)

# transforms the numerical testing data and formats as DataFrame
X_test_numerical = pd.DataFrame(numerical_imputer.transform(X_test_numerical),
                                columns = X_test_numerical.columns,
                                index = X_test_numerical.index)


# transforms the categorical testing data and formats as DataFrame
X_test_categorical = pd.DataFrame(categorical_imputer.transform(X_test_categorical),
                                  columns = X_test_categorical.columns,
                                  index = X_test_categorical.index)


# One-hot encodes categorical testing data 
X_test_categorical_ohe = ohe.transform(X_test_categorical)

# re-formatts the array as a DataFrame (in order to concatenate with numerical testing data)
X_test_categorical_ohe = pd.DataFrame(X_test_categorical_ohe, 
                                       columns=ohe.get_feature_names_out(X_test_categorical.columns),
                                       index=X_test_categorical.index)

X_test_numerical = pd.DataFrame(scaler.transform(X_test_numerical),
                                index=X_test_numerical.index,
                                columns=X_test_numerical.columns)

X_test = pd.concat([X_test_numerical, X_test_categorical_ohe], axis=1)

In [7]:
import pandas as pd

X = pd.read_csv('training_features', index_col='respondent_id')
y = pd.read_csv('training_labels', index_col='respondent_id')['seasonal_vaccine']

# Numerical values: SimpleImputer(strategy='mean'), MinMaxScaler()
# Categorical values: SimpleImputer(strategy='most_frequent'), OneHotEncoder()

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# selects only numerical columns
X_train_numerical = X_train.select_dtypes(exclude=object)

# selects only categorical columns
X_train_categorical = X_train.select_dtypes(include=object)

from sklearn.impute import SimpleImputer

# instantiates SimpleImputer that will fill missing values with the column mean
numerical_imputer = SimpleImputer(strategy='mean')

# fits/transforms the SimpleImputer object with the numerical training data and formats as DataFrame
X_train_numerical = pd.DataFrame(numerical_imputer.fit_transform(X_train_numerical),
                                columns = X_train_numerical.columns,
                                index = X_train_numerical.index)

# instantiates SimpleImputer that will fill missing values with most frequent column value
categorical_imputer = SimpleImputer(strategy='most_frequent')

# fits/transforms the SimpleImputer object with the categorical training data and formats as a DataFrame
X_train_categorical = pd.DataFrame(categorical_imputer.fit_transform(X_train_categorical),
                                  columns = X_train_categorical.columns,
                                  index = X_train_categorical.index)

from sklearn.preprocessing import OneHotEncoder

# instantiates OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)

# fits and transforms OneHotEncoder object on the categorical training data
X_train_categorical_ohe = ohe.fit_transform(X_train_categorical)

# re-formats the array as a DataFrame (in order to concatenate with numerical training data)
X_train_categorical_ohe = pd.DataFrame(X_train_categorical_ohe, 
                                       columns=ohe.get_feature_names_out(X_train_categorical.columns),
                                       index=X_train_categorical.index)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_numerical = pd.DataFrame(scaler.fit_transform(X_train_numerical),
                                index=X_train_numerical.index,
                                columns=X_train_numerical.columns)

X_train = pd.concat([X_train_numerical, X_train_categorical_ohe], axis=1)

X_train

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25194,0.333333,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14006,0.666667,0.5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11285,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2900,0.333333,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
19083,0.666667,0.5,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,0.666667,0.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5390,0.333333,0.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.221636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
860,0.666667,0.5,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15795,0.666667,0.5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# selects only numerical columns
X_test_numerical = X_test.select_dtypes(exclude=object)

# selects only categorical columns
X_test_categorical = X_test.select_dtypes(include=object)

# transforms the numerical testing data and formats as DataFrame
X_test_numerical = pd.DataFrame(numerical_imputer.transform(X_test_numerical),
                                columns = X_test_numerical.columns,
                                index = X_test_numerical.index)


# transforms the categorical testing data and formats as DataFrame
X_test_categorical = pd.DataFrame(categorical_imputer.transform(X_test_categorical),
                                  columns = X_test_categorical.columns,
                                  index = X_test_categorical.index)


# One-hot encodes categorical testing data 
X_test_categorical_ohe = ohe.transform(X_test_categorical)

# re-formatts the array as a DataFrame (in order to concatenate with numerical testing data)
X_test_categorical_ohe = pd.DataFrame(X_test_categorical_ohe, 
                                       columns=ohe.get_feature_names_out(X_test_categorical.columns),
                                       index=X_test_categorical.index)

X_test_numerical = pd.DataFrame(scaler.transform(X_test_numerical),
                                index=X_test_numerical.index,
                                columns=X_test_numerical.columns)

X_test = pd.concat([X_test_numerical, X_test_categorical_ohe], axis=1)

In [11]:
from sklearn.tree import DecisionTreeClassifier

baseline_tree = DecisionTreeClassifier(random_state=42)

baseline_tree.fit(X_train, y_train)

y_hat_test = baseline_tree.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_hat_test)

0.6937247266736558

In [12]:
y_test

respondent_id
15772    0
9407     1
16515    0
23353    0
10008    0
        ..
25990    0
14302    0
3817     1
13912    0
16392    1
Name: seasonal_vaccine, Length: 6677, dtype: int64

In [13]:
y_hat_test

array([1, 0, 1, ..., 0, 1, 1])

In [6]:
from sklearn.linear_model import LogisticRegression

baseline_logreg = LogisticRegression(fit_intercept=False, solver='liblinear', random_state=42)

baseline_logreg.fit(X_train, y_train)

y_hat_test = baseline_logreg.predict(X_test)

accuracy_score(y_test, y_hat_test)

0.7873296390594579

# Numerical values: SimpleImputer(strategy='mean')
# Categorical values: SimpleImputer(strategy='most_frequent'), LabelEncoder()
# All: MinMaxScaler()

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
# selects only numerical columns
X_train_numerical = X_train.select_dtypes(exclude=object)

# selects only categorical columns
X_train_categorical = X_train.select_dtypes(include=object)

from sklearn.impute import SimpleImputer

# instantiates SimpleImputer that will fill missing values with the column mean
numerical_imputer = SimpleImputer(strategy='mean')

# fits/transforms the SimpleImputer object with the numerical training data and formats as DataFrame
X_train_numerical = pd.DataFrame(numerical_imputer.fit_transform(X_train_numerical),
                                columns = X_train_numerical.columns,
                                index = X_train_numerical.index)

# instantiates SimpleImputer that will fill missing values with most frequent column value
categorical_imputer = SimpleImputer(strategy='most_frequent')

# fits/transforms the SimpleImputer object with the categorical training data and formats as a DataFrame
X_train_categorical = pd.DataFrame(categorical_imputer.fit_transform(X_train_categorical),
                                  columns = X_train_categorical.columns,
                                  index = X_train_categorical.index)

from sklearn.preprocessing import LabelEncoder

# instantiates LabelEncoder
le = LabelEncoder()

# fits and transforms LabelEncoder object on the categorical training data
X_train_categorical_le = le.fit_transform(X_train_categorical)

# re-formats the array as a DataFrame (in order to concatenate with numerical training data)
X_train_categorical_le = pd.DataFrame(X_train_categorical_le, 
                                       columns=X_train_categorical.columns,
                                       index=X_train_categorical.index)

X_train = pd.concat([X_train_numerical, X_train_categorical_le], axis=1)


from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train),
                                index=X_train.index,
                                columns=X_train.columns)

X_train

ValueError: y should be a 1d array, got an array of shape (20030, 12) instead.