In [348]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for input_path in (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vaccine-data/training_set_features.csv
/kaggle/input/vaccine-data/test_set_features.csv
/kaggle/input/vaccine-data/training_set_labels.csv
/kaggle/input/vaccine-data/submission_format.csv


In [349]:
# Read the four competition CSVs; the paths are listed in the same order as
# the names they are unpacked into.
train_features, test_features, train_labels, submission = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/vaccine-data/training_set_features.csv',
        '/kaggle/input/vaccine-data/test_set_features.csv',
        '/kaggle/input/vaccine-data/training_set_labels.csv',
        '/kaggle/input/vaccine-data/submission_format.csv',
    )
]

In [350]:
# (rows, columns) of the training feature table.
train_features.shape

(26707, 36)

In [351]:
# (rows, columns) of the label table; the row count should match train_features.
train_labels.shape

(26707, 3)

In [352]:
# Per-column dtype and non-null count for the raw training features.
train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_xyz              24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [353]:
# Count of missing values in each training feature column.
train_features.isna().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [354]:
# Full list of feature column names.
train_features.columns

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [355]:
# Columns removed from both feature frames. A single shared list guarantees the
# train and test frames always lose exactly the same columns.
columns_to_drop = [
    'hhs_geo_region', 'census_msa', 'rent_or_own', 'race', 'marital_status',
    'employment_status', 'employment_occupation', 'employment_industry', 'education',
]
train_features = train_features.drop(columns=columns_to_drop)
test_features = test_features.drop(columns=columns_to_drop)

In [356]:
# Re-check dtypes and non-null counts after the column drop.
train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 27 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_xyz              24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [357]:
# Display the reduced training frame.
train_features

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,sex,income_poverty,household_adults,household_children
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,2.0,2.0,1.0,2.0,55 - 64 Years,Female,Below Poverty,0.0,0.0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,4.0,4.0,4.0,2.0,4.0,35 - 44 Years,Male,Below Poverty,0.0,0.0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,4.0,1.0,2.0,18 - 34 Years,Male,"<= $75,000, Above Poverty",2.0,0.0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,3.0,5.0,5.0,4.0,1.0,65+ Years,Female,Below Poverty,0.0,0.0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,3.0,2.0,3.0,1.0,4.0,45 - 54 Years,Female,"<= $75,000, Above Poverty",1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,5.0,2.0,2.0,65+ Years,Female,"<= $75,000, Above Poverty",0.0,0.0
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,2.0,2.0,5.0,1.0,1.0,18 - 34 Years,Male,"<= $75,000, Above Poverty",1.0,0.0
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,4.0,2.0,5.0,4.0,2.0,55 - 64 Years,Female,,0.0,0.0
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,1.0,2.0,2.0,1.0,2.0,18 - 34 Years,Female,"<= $75,000, Above Poverty",1.0,0.0


In [358]:
# Dense int32 one-hot encoder. The `sparse` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so try the current
# name first and fall back to the old one on older releases.
try:
    ohe = OneHotEncoder(sparse_output=False, dtype=np.int32)
except TypeError:
    ohe = OneHotEncoder(sparse=False, dtype=np.int32)

In [None]:
# One-hot encode `sex`. The encoder is fitted on the training frame only and
# then reused for the test frame, so both frames get identical indicator
# columns in the same order.
train_features_new = ohe.fit_transform(train_features[['sex']])
sex_columns = ohe.get_feature_names_out(['sex'])
encoded_train = pd.DataFrame(train_features_new, columns=sex_columns, index=train_features.index)
train_features = pd.concat([train_features, encoded_train], axis=1)

# transform, not fit_transform: re-fitting on the test set would derive the
# categories from test data instead of applying the training-time mapping.
test_features_new = ohe.transform(test_features[['sex']])
encoded_test = pd.DataFrame(test_features_new, columns=sex_columns, index=test_features.index)
test_features = pd.concat([test_features, encoded_test], axis=1)


In [None]:
# Encode the age brackets as 0..4 in ascending age order. The encoder is fitted
# on the training frame and reused for the test frame so both share one mapping
# and end up with the same columns.
category_order = ['18 - 34 Years', '35 - 44 Years', '45 - 54 Years','55 - 64 Years','65+ Years']
encoder = OrdinalEncoder(categories=[category_order])
train_features['age_group_encoded'] = encoder.fit_transform(train_features[['age_group']]).ravel()
# NOTE(review): transform raises on any value outside category_order (including
# NaN). The training column has no missing values; confirm the test file matches.
test_features['age_group_encoded'] = encoder.transform(test_features[['age_group']]).ravel()
train_features = train_features.drop(columns=['age_group'])
test_features = test_features.drop(columns=['age_group'])
train_features['age_group_encoded']

In [None]:
# Encode income brackets from lowest to highest. Without an explicit order the
# encoder sorts labels alphabetically, which would rank 'Below Poverty' above
# both higher-income brackets.
# NOTE(review): '> $75,000' is assumed to be the third label; verify with
# train_features['income_poverty'].unique() (fit raises if a label is missing here).
# np.nan is listed last so missing values pass through as NaN and are filled by
# the imputation step that follows.
income_order = ['Below Poverty', '<= $75,000, Above Poverty', '> $75,000', np.nan]
encoder = OrdinalEncoder(categories=[income_order])
train_features['income_poverty_encoded'] = encoder.fit_transform(train_features[['income_poverty']]).ravel()
# Apply the training-time mapping to the test frame so both frames keep the same columns.
test_features['income_poverty_encoded'] = encoder.transform(test_features[['income_poverty']]).ravel()
train_features = train_features.drop(columns=['income_poverty'])
test_features = test_features.drop(columns=['income_poverty'])

In [None]:
# Inspect the training frame after the encoding steps.
train_features

In [None]:
# Replace missing entries with each column's most frequent value. The
# statistics are learned from the training frame and then applied to the test frame.
imputer = SimpleImputer(strategy='most_frequent')
train_filled = imputer.fit_transform(train_features)
train_features = pd.DataFrame(train_filled, columns=train_features.columns)
test_filled = imputer.transform(test_features)
test_features = pd.DataFrame(test_filled, columns=test_features.columns)

In [None]:
# The raw `sex` strings are redundant once the one-hot columns exist. Drop them
# from BOTH frames: leaving the column in the test frame would give it an extra,
# non-numeric column that the training matrix does not have.
train_features = train_features.drop(columns=['sex'])
test_features = test_features.drop(columns=['sex'])

In [None]:
# imputer = SimpleImputer(strategy='most_frequent')
# train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
# test_features = pd.DataFrame(imputer.transform(test_features), columns=test_features.columns)

In [None]:
# Inspect the training frame after imputation and removal of the raw `sex` column.
train_features

In [None]:
# Logistic-regression classifier created with scikit-learn's default hyperparameters.
log_reg=LogisticRegression()

In [None]:
# Sanity check: feature and label tables should have the same number of rows.
print(train_features.shape, train_labels.shape, sep='\n')

In [None]:
# Feature matrices. `respondent_id` is a row identifier rather than a predictor,
# so it is left out of both matrices (it stays available in the DataFrames).
X_train = np.asarray(train_features.drop(columns=['respondent_id']))
x_test = np.asarray(test_features.drop(columns=['respondent_id']))

# train_labels has three columns, so converting the whole frame and flattening
# it would produce three values per row and no longer line up with X_train.
# LogisticRegression needs a single 1-D target with one entry per row.
target_columns = [col for col in train_labels.columns if col != 'respondent_id']
# NOTE(review): the first non-id label column is used as the target; fit a
# separate model per column if every label needs to be predicted.
y_train = train_labels[target_columns[0]].to_numpy()

In [None]:
# Flatten y_train to 1-D.
# NOTE(review): if y_train still holds every column of train_labels (shape
# (26707, 3)), this interleaves all columns into one vector three times longer
# than X_train. Confirm a single target column has been selected first.
y_train=y_train.ravel()

In [None]:
# Sanity check: X_train rows must equal the length of y_train before fitting.
print(X_train.shape, y_train.shape, sep='\n')

In [None]:
# Fit the classifier on the training feature matrix and target vector.
log_reg.fit(X_train,y_train)