# Machine Learning:
## Predict the dominant Political Party of a state based on covid vaccine search trends

In [1]:
#dependencies
import matplotlib.pyplot as plt
import pandas as pd

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import inspect
from sqlalchemy import create_engine, func


## Check connection to database

In [2]:
pip install psycopg2

Note: you may need to restart the kernel to use updated packages.


In [3]:
import psycopg2

In [4]:
#store password variable
from getpass import getpass
password = getpass("enter database pw")

enter database pw········


In [5]:
# connect to database
engine = create_engine(f"postgresql://postgres:{password}@localhost:5432/CovidSearchTrends")

In [6]:
# Get the name of the tables. 
inspector = inspect(engine)
inspector.get_table_names()

['presidential_results',
 'partisan_voting_index',
 'state_vaccines_ranked',
 'virus_search_terms',
 'vaccine_search_terms',
 'related_search_terms',
 'party_index']

## Logistic Regression (*without* state rank from PVI)

In [7]:
# get table from database and convert to dataframe
PVI = pd.read_sql_table('partisan_voting_index', engine)
party_index = pd.read_sql_table('party_index', engine)
vaccine_search_terms = pd.read_sql_table('vaccine_search_terms', engine)


In [8]:
# join all 3 dataframes and drop extra columns 
df = PVI.merge(party_index, 
                  left_on='party', 
                  right_on='party').drop(['party'], axis=1)

df = df.merge(vaccine_search_terms, 
              left_on='state_name', 
              right_on='state_name').drop(['state_name'], axis=1)
df.tail(5).style.hide_index()

state_rank,party_id,covid_vax_cvs,covid_vax_walgreens,vax_side_effects,covid_after_vax,vax_mandate
45,1,1,11,33,30,25
46,1,10,12,30,28,20
47,1,7,2,39,30,22
48,1,6,13,34,32,15
49,1,1,6,35,30,28


In [9]:
# seaparate features from the target

y = df["party_id"]
X = df.drop(["party_id", "state_rank"], axis=1)

In [10]:
# split data into testing and training sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    stratify=y)

X_train.shape

(37, 5)

In [11]:
# create logistic regression model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200)                             

In [12]:
# fit model w training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200)

In [13]:
# create predicitions with test set
y_pred = classifier.predict(X_test)

#view as dataframe
results = pd.DataFrame({"Y_Prediction": y_pred, "Actual (y-test)": y_test}).reset_index(drop=True)
results.head(5)

Unnamed: 0,Y_Prediction,Actual (y-test)
0,1,1
1,0,0
2,0,1
3,0,0
4,0,1


In [14]:
# evaluate accuracy score

from sklearn.metrics import accuracy_score
print("Accuracy score from Logistic Regression WITHOUT state rank")
print(accuracy_score(y_test, y_pred))

Accuracy score from Logistic Regression WITHOUT state rank
0.6153846153846154


In [15]:
# check confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
matrix_df = pd.DataFrame(matrix, 
                         index=["Actual 0-D", "Actual 1-R"], 
                         columns=["Predicted 0-D", "Predicted 1-R"])

matrix_df

Unnamed: 0,Predicted 0-D,Predicted 1-R
Actual 0-D,2,3
Actual 1-R,2,6


In [16]:
#classification report 
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.50      0.40      0.44         5
           1       0.67      0.75      0.71         8

    accuracy                           0.62        13
   macro avg       0.58      0.57      0.58        13
weighted avg       0.60      0.62      0.61        13



## Logistic Regression (*with* state rank from PVI)

In [17]:
# seaparate features from the target

y = df["party_id"]
X = df.drop(["party_id"], axis=1)

In [18]:
# split data into testing and training sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    stratify=y)

X_train.shape

(37, 6)

In [19]:
# create logistic regression model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200)                             

In [20]:
# fit model w training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200)

In [21]:
# create predicitions with test set
y_pred = classifier.predict(X_test)

#view as dataframe
results = pd.DataFrame({"Y_Prediction": y_pred, "Actual (y-test)": y_test}).reset_index(drop=True)
results.head(5)

Unnamed: 0,Y_Prediction,Actual (y-test)
0,1,1
1,0,0
2,0,0
3,0,0
4,1,1


In [22]:
# evaluate accuracy score

from sklearn.metrics import accuracy_score
print("Accuracy score from Logistic Regression WITH state rank")
print(accuracy_score(y_test, y_pred))


Accuracy score from Logistic Regression WITH state rank
0.9230769230769231


In [23]:
# check confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
matrix_df = pd.DataFrame(matrix, 
                         index=["Actual 0-D", "Actual 1-R"], 
                         columns=["Predicted 0-D", "Predicted 1-R"])


matrix_df

Unnamed: 0,Predicted 0-D,Predicted 1-R
Actual 0-D,5,0
Actual 1-R,1,7


In [24]:
#classification report 
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      1.00      0.91         5
           1       1.00      0.88      0.93         8

    accuracy                           0.92        13
   macro avg       0.92      0.94      0.92        13
weighted avg       0.94      0.92      0.92        13

