# Machine Learning:
## Predict the dominant Political Party of a state based on covid search trends

In [1]:
#dependencies
import pandas as pd

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import inspect
from sqlalchemy import create_engine, func

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Check connection to database

In [2]:
pip install psycopg2

Note: you may need to restart the kernel to use updated packages.


In [3]:
import psycopg2

In [4]:
#store password variable

from getpass import getpass
password = getpass("enter database pw")


In [5]:
# connect to database
engine = create_engine(f"postgresql://postgres:{password}@localhost:5432/CovidSearchTrends")

In [6]:
# Get the name of the tables. 
inspector = inspect(engine)
inspector.get_table_names()

['partisan_voting_index',
 'state_vaccines_ranked',
 'virus_search_terms',
 'vaccine_search_terms',
 'related_search_terms',
 'party_index',
 'presidential_results',
 'party_index_pvi',
 'vax_virus_terms',
 'vax_virus_related']

## Random Forest Classifier (Vaccine Search Terms only)

In [7]:
# get table from database and convert to dataframe
party_index_PVI = pd.read_sql_table('party_index_pvi', engine)
vaccine_search_terms = pd.read_sql_table('vaccine_search_terms', engine)


In [8]:
# join dataframes and drop extra columns 

vax_df = party_index_PVI.merge(vaccine_search_terms, 
              left_on='state_name', 
              right_on='state_name').drop(['state_name'], axis=1)
vax_df.tail(5).style.hide_index()

party_id,covid_vax_cvs,covid_vax_walgreens,vax_side_effects,covid_after_vax,vax_mandate
1,1,11,33,30,25
1,10,12,30,28,20
1,7,2,39,30,22
1,6,13,34,32,15
1,1,6,35,30,28


In [9]:
# separate features from the target
y = vax_df["party_id"]
X = vax_df.drop(["party_id"], axis=1)


# split data into testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    stratify=y)
X_train.shape

(37, 5)

In [10]:
# create Random Forest model
rf_model = RandomForestClassifier(n_estimators=500)       

# fit model w training data
rf_model.fit(X_train, y_train)

# create predicitions with test set
y_pred = rf_model.predict(X_test)

#view as dataframe
results = pd.DataFrame({"Y_Prediction": y_pred, "Actual (y-test)": y_test}).reset_index(drop=True)
results.head(5)

Unnamed: 0,Y_Prediction,Actual (y-test)
0,1,0
1,1,1
2,1,1
3,1,1
4,1,1


In [11]:
# evaluate accuracy score

print("Accuracy score from Random Forest with Vaccine Search Terms only")
vax_accuracy = accuracy_score(y_test, y_pred)
print(vax_accuracy)

Accuracy score from Random Forest with Vaccine Search Terms only
0.6923076923076923


In [12]:
# check confusion matrix

matrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
matrix_df = pd.DataFrame(matrix, 
                         index=["Actual 0-D", "Actual 1-R"], 
                         columns=["Predicted 0-D", "Predicted 1-R"])

matrix_df

Unnamed: 0,Predicted 0-D,Predicted 1-R
Actual 0-D,2,3
Actual 1-R,1,7


In [13]:
#classification report 
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.67      0.40      0.50         5
           1       0.70      0.88      0.78         8

    accuracy                           0.69        13
   macro avg       0.68      0.64      0.64        13
weighted avg       0.69      0.69      0.67        13



## Random Forest (Vaccine Search Terms + Virus Name Search Terms)

In [14]:
# get table from database and convert to dataframe
vax_vir_df = pd.read_sql_table('vax_virus_terms', engine)
vax_vir_df.head()

Unnamed: 0,party_id,covid_vax_cvs,covid_vax_walgreens,vax_side_effects,covid_after_vax,vax_mandate,covid,covid19,coronavirus
0,0,16,18,28,27,11,63,9,28
1,0,22,5,29,27,17,60,11,29
2,0,45,13,16,18,8,59,8,33
3,0,30,20,20,22,8,55,9,36
4,0,26,10,25,26,13,56,10,34


In [15]:
# seaparate features from the target

y = vax_vir_df["party_id"]
X = vax_vir_df.drop(["party_id"], axis=1)

# split data into testing and training sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    stratify=y)
X_train.shape

(37, 8)

In [16]:
# create Random Forest model
rf_model = RandomForestClassifier(n_estimators=500)       

# fit model w training data
rf_model.fit(X_train, y_train)

# create predicitions with test set
y_pred = rf_model.predict(X_test)

#view as dataframe
results = pd.DataFrame({"Y_Prediction": y_pred, "Actual (y-test)": y_test}).reset_index(drop=True)
results.head(5)

Unnamed: 0,Y_Prediction,Actual (y-test)
0,0,0
1,1,1
2,1,1
3,1,1
4,1,1


In [17]:
# evaluate accuracy score

print("Accuracy score from Random Forest with Vaccine Search Terms + Virus Search Terms")
vax_vir_accuracy = accuracy_score(y_test, y_pred)
print(vax_vir_accuracy)

Accuracy score from Random Forest with Vaccine Search Terms + Virus Search Terms
0.7692307692307693


In [18]:
# check confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
matrix_df = pd.DataFrame(matrix, 
                         index=["Actual 0-D", "Actual 1-R"], 
                         columns=["Predicted 0-D", "Predicted 1-R"])

matrix_df

Unnamed: 0,Predicted 0-D,Predicted 1-R
Actual 0-D,4,1
Actual 1-R,2,6


In [19]:
#classification report 
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           1       0.86      0.75      0.80         8

    accuracy                           0.77        13
   macro avg       0.76      0.78      0.76        13
weighted avg       0.78      0.77      0.77        13



## Random Forest (Vaccine Searches + Virus Name Searches + Related Searches)

In [20]:
# get table from database and convert to dataframe
vax_vir_rel_df = pd.read_sql_table('vax_virus_related', engine)
vax_vir_rel_df.head()

Unnamed: 0,party_id,covid_vax_cvs,covid_vax_walgreens,vax_side_effects,covid_after_vax,vax_mandate,covid,covid19,coronavirus,covid_cases,lockdown,covid_symptoms,quarantine,stimulus_check
0,0,16,18,28,27,11,63,9,28,44,3,12,9,32
1,0,22,5,29,27,17,60,11,29,37,5,10,17,31
2,0,45,13,16,18,8,59,8,33,31,6,16,13,34
3,0,30,20,20,22,8,55,9,36,22,6,17,12,43
4,0,26,10,25,26,13,56,10,34,22,9,15,12,42


In [21]:
# separate features from the target

y = vax_vir_rel_df["party_id"]
X = vax_vir_rel_df.drop(["party_id"], axis=1)

# split data into testing and training sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    stratify=y)
X_train.shape

(37, 13)

In [22]:
# create Random Forest model
rf_model = RandomForestClassifier(n_estimators=500)       

# fit model w training data
rf_model.fit(X_train, y_train)

# create predicitions with test set
y_pred = rf_model.predict(X_test)

#view as dataframe
results = pd.DataFrame({"Y_Prediction": y_pred, "Actual (y-test)": y_test}).reset_index(drop=True)
results.head(5)

Unnamed: 0,Y_Prediction,Actual (y-test)
0,1,1
1,0,0
2,1,1
3,1,1
4,1,1


In [23]:
# evaluate accuracy score

print("Accuracy score from Random Forest with Vaccine + Virus + Related Search Terms")
vax_vir_rel_accuracy = accuracy_score(y_test, y_pred)
print(vax_vir_rel_accuracy)

Accuracy score from Random Forest with Vaccine + Virus + Related Search Terms
0.8461538461538461


In [24]:
# check confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
matrix_df = pd.DataFrame(matrix, 
                         index=["Actual 0-D", "Actual 1-R"], 
                         columns=["Predicted 0-D", "Predicted 1-R"])

matrix_df

Unnamed: 0,Predicted 0-D,Predicted 1-R
Actual 0-D,3,2
Actual 1-R,0,8


In [25]:
#classification report 
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.80      1.00      0.89         8

    accuracy                           0.85        13
   macro avg       0.90      0.80      0.82        13
weighted avg       0.88      0.85      0.84        13



## Rank importance of features

In [26]:
# Calculate feature importance with feature_importances_ attribute
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.29783568906265045, 'quarantine'),
 (0.16062472052710802, 'stimulus_check'),
 (0.07207507400429762, 'vax_side_effects'),
 (0.06975998148366686, 'covid'),
 (0.06728766568343543, 'covid_after_vax'),
 (0.05570029744309798, 'covid_cases'),
 (0.05477127272756827, 'covid_vax_cvs'),
 (0.04732367703246126, 'covid_vax_walgreens'),
 (0.04121976148426199, 'covid19'),
 (0.03835108347442211, 'vax_mandate'),
 (0.03716712418876093, 'lockdown'),
 (0.030310155483432646, 'covid_symptoms'),
 (0.02757349740483644, 'coronavirus')]

## Try to improve score by adjusting search feature selection

In [27]:
# get table from database and convert to dataframe
vax_vir_rel_df = pd.read_sql_table('vax_virus_related', engine)
vax_vir_rel_df.columns

Index(['party_id', 'covid_vax_cvs', 'covid_vax_walgreens', 'vax_side_effects',
       'covid_after_vax', 'vax_mandate', 'covid', 'covid19', 'coronavirus',
       'covid_cases', 'lockdown', 'covid_symptoms', 'quarantine',
       'stimulus_check'],
      dtype='object')

In [28]:
# separate features from the target

y = vax_vir_rel_df["party_id"]
X = vax_vir_rel_df.drop(["party_id", "covid_symptoms", "vax_mandate", "lockdown"], axis=1)

# split data into testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    stratify=y)
# create Random Forest model
rf_model = RandomForestClassifier(n_estimators=500)       

# fit model w training data
rf_model.fit(X_train, y_train)

# create predicitions with test set
y_pred = rf_model.predict(X_test)

# evaluate accuracy score
print("Accuracy score from Random Forest with top 10 features")
vax_vir_rel_top10 = accuracy_score(y_test, y_pred)
print(vax_vir_rel_top10)

Accuracy score from Random Forest with top 10 features
1.0


In [29]:
# check confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
matrix_df = pd.DataFrame(matrix, 
                         index=["Actual 0-D", "Actual 1-R"], 
                         columns=["Predicted 0-D", "Predicted 1-R"])

matrix_df

Unnamed: 0,Predicted 0-D,Predicted 1-R
Actual 0-D,5,0
Actual 1-R,0,8


In [30]:
#classification report 
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         8

    accuracy                           1.00        13
   macro avg       1.00      1.00      1.00        13
weighted avg       1.00      1.00      1.00        13



In [31]:
#summary of accuracy scores by feature selection
scores = [vax_accuracy, vax_vir_accuracy, vax_vir_rel_accuracy, vax_vir_rel_top10 ]
features = ["Vax", "Vax + Virus", "Vax + Virus + ALLRelated", "Vax + Virus + Rel - Top 10"]

feature_scores_df = pd.DataFrame()

feature_scores_df['random_forest_features'] = features
feature_scores_df['accuracy_scores'] = scores
feature_scores_df.round(decimals=2)

Unnamed: 0,random_forest_features,accuracy_scores
0,Vax,0.69
1,Vax + Virus,0.77
2,Vax + Virus + ALLRelated,0.85
3,Vax + Virus + Rel - Top 10,1.0
