In [None]:
#load modules
import pandas as pd #data-processing
import numpy as np #linear algebra
import matplotlib as mpl
import matplotlib.pyplot as plt #visualization
import seaborn as sns #visualization
import plotly.express as px #mapping
import json #json objects
import os #file system

#MODELS
from sklearn.linear_model import LinearRegression
#from sklearn.tree import DecisionTreeRegressor
#from sklearn.ensemble import RandomForestRegressor

from sklearn.cluster import KMeans #cluster algorithm
from sklearn.decomposition import PCA #combine attributes

from sklearn.preprocessing import MinMaxScaler, StandardScaler #OneHotEncoder LabelEncoder
from sklearn.model_selection import train_test_split #module to define test/train data

from sklearn.metrics import silhouette_score, r2_score, mean_squared_error, mean_absolute_error #forest/trees/linreg/XGB performance
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report #clf performance
#from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_curve, auc, precision_recall_curve #clf performance

import warnings
warnings.filterwarnings("ignore")

DATA_PATH = 'datasets'  #relative folder holding every local input/output file
RANDOM_STATE = 11       #fixed seed value kept with the other notebook-wide settings

def load_dataset(filename,data_path = DATA_PATH):
    """Read one local CSV file into a pandas DataFrame.

    filename  -- name of the CSV file inside data_path
    data_path -- folder to read from (defaults to DATA_PATH)

    The first row is used as the header and the file is decoded as UTF-8.
    """
    return pd.read_csv(os.path.join(data_path, filename), encoding = 'UTF-8', header = 0)

def export_dataset(df,filename,data_path = DATA_PATH):
    """Write df as a CSV file inside data_path and print the destination.

    df        -- DataFrame to export (its index is written as well)
    filename  -- name of the CSV file to create
    data_path -- folder to write into (defaults to DATA_PATH)
    """
    destination = os.path.join(data_path, filename)
    df.to_csv(destination)
    print('Successful export to',destination)

In [None]:
#FIPS-county polygons, used later as the geojson layer of every map
jsonPath = os.path.join(DATA_PATH, 'geojson-counties-fips.json')
with open(jsonPath) as jsonFile:
    counties = json.loads(jsonFile.read())

#county-level presidential returns
rawElect = load_dataset('countypres_2000-2020.csv')
rawElect.loc[rawElect['state_po'] == 'DC', 'county_fips'] = 11001 #District of Columbia rows come without a FIPS
rawElect = rawElect.dropna(subset = ['county_fips', 'candidatevotes']) #rows without a county or a tally are unusable
rawElect['county_fips'] = rawElect['county_fips'].apply(np.int32)
rawElect.info()

In [None]:
#capture the distinct election years and county codes before the FIPS column becomes the index
years = rawElect['year'].unique()
fips = rawElect['county_fips'].unique()
rawElect = rawElect.rename(columns = {'county_fips':'FIPS'}).set_index('FIPS')
print('\nElection Years:',years)
print('Number of Counties:',fips.size)
rawElect.head(15)

In [None]:
#Build electExt: one row per county (FIPS) holding, for every election year,
#the DEM votes, GOP votes, their sum (two-party total) and the DEM share of that sum.
electExt = pd.DataFrame()
for y in years: #create a series for each election year; rows = counties
    Dsubset = rawElect[(rawElect['year'] == y) & (rawElect['party'] == 'DEMOCRAT')]
    Dsubset = Dsubset.groupby(Dsubset.index).agg({'candidatevotes':sum}) #aggregate composite tallies
    Dseries = pd.Series(Dsubset['candidatevotes'], index = Dsubset.index)
    Rsubset = rawElect[(rawElect['year'] == y) & (rawElect['party'] == 'REPUBLICAN')]
    Rsubset = Rsubset.groupby(Rsubset.index).agg({'candidatevotes':sum}) #same aggregation for the GOP rows
    Rseries = pd.Series(Rsubset['candidatevotes'], index = Rsubset.index)
    Tseries = Rseries + Dseries #adj TOTal
    Pseries = Dseries / Tseries #percent DEM of adj TOTal
    #each series becomes a one-column frame named <prefix><year>, e.g. perDEM_2000
    Pseries = Pseries.to_frame(y).add_prefix('perDEM_')   # percent DEM of adj Total
    Tseries = Tseries.to_frame(y).add_prefix('TOT_')      # adj TOTal (DEM + GOP)
    Dseries = Dseries.to_frame(y).add_prefix('DEM_')      # DEM votes
    Rseries = Rseries.to_frame(y).add_prefix('GOP_')      # GOP votes
    electExt = electExt.join((Pseries, Dseries, Rseries, Tseries), how = 'outer')   
#state names are read from the 2000 REPUBLICAN rows and joined on FIPS
Ssubset = rawElect[(rawElect['year'] == 2000) & (rawElect['party'] == 'REPUBLICAN')]
Sseries = pd.Series(Ssubset['state'])
Sseries = Sseries.to_frame()                              # State names
electExt = Sseries.join(electExt, how = 'outer')
electExt.dropna(inplace = True) #keep only counties with a state name and a value in every year column
electExt = electExt[~electExt.index.duplicated(keep = 'last')] #one row per FIPS; the last occurrence wins
print(electExt.info())
electExt.head()

In [None]:
#roll the county tallies up to state level for 2020
stateGroups = electExt.groupby(['state'])
StTOT_2020 = stateGroups.TOT_2020.sum()                                     #two-party 20 vote count by state
StDEM_2020 = stateGroups.DEM_2020.sum()                                     #DEM 20 vote count by state
StPerDEM_2020 = (StDEM_2020 / StTOT_2020).round(decimals = 4).sort_values() #DEM share 20 by state, ascending

colors = [] #one bar colour per state, in plotting order
red = []    #solid GOP
blue = []   #solid DEM
swing = []  #battleground/purple (within 5% of 50/50)
for st, share in StPerDEM_2020.items():
    if share > 0.55:
        bucket, shade = blue, 'b'
    elif share < 0.45:
        bucket, shade = red, 'r'
    else:
        bucket, shade = swing, 'm'
    bucket.append(st)
    colors.append(shade)
ax = (StPerDEM_2020 * 100).plot.barh(figsize = (10,12), title = 'Percent DEM 20 Vote by State', color = colors)
plt.axvline(x = 50,color = 'k') #the tipping point at 50%
plt.show()

swing # ordered GOP-leaning to DEM-leaning

Hmmm... so from 2016 to 2020, Colorado and New Mexico shifted from purple to blue, and Iowa from red to purple.  No state shifts to red.<br>

To avoid dealing with redistricting (and the resulting mismatch with datasets), we will focus on battleground/purple swing states with statewide US Senate and Gubernatorial races in 2022, and on the changes in demographics in those states taken from census data.

Of the <font color = 'purple'>(purple)</font> swing states,<br>the following have US senators up for reelection in 2022:<br>
<font color = 'red'>Iowa: Chuck Grassley (GOP)<br></font>
<font color = 'red'>Ohio: Rob Portman (GOP) [NOT RUNNING FOR REELECTION]<br></font>
<font color = 'red'>Florida: Marco Rubio (GOP)<br></font>
<font color = 'red'>North Carolina: Richard Burr (GOP) [NOT RUNNING FOR REELECTION]<br></font>
<font color = 'blue'>Georgia: Raphael Warnock (DEM)<br></font>
<font color = 'blue'>Arizona: Mark Kelly (DEM)<br></font>
<font color = 'red'>Wisconsin: Ron Johnson (GOP)<br></font>
<font color = 'red'>Pennsylvania: Patrick Toomey (GOP) [NOT RUNNING FOR REELECTION]<br></font>
<font color = 'blue'>Nevada: Catherine Cortez Masto (DEM)<br></font>
<font color = 'blue'>New Hampshire: Maggie Hassan (DEM)<br></font>

Governors up for reelection in 2022:<br>
<font color = 'red'>Iowa: Kim Reynolds (GOP)<br></font>
<font color = 'red'>Ohio: Mike DeWine (GOP)<br></font>
<font color = 'red'>Texas: Greg Abbott (GOP)<br></font>
<font color = 'red'>Florida: Ron DeSantis (GOP)<br></font>
<font color = 'red'>Georgia: Brian Kemp (GOP)<br></font>
<font color = 'red'>Arizona: Doug Ducey (GOP)<br></font>
<font color = 'blue'>Wisconsin: Tony Evers (DEM)<br></font>
<font color = 'blue'>Pennsylvania: Tom Wolf (DEM)<br></font>
<font color = 'blue'>Michigan: Gretchen Whitmer (DEM)<br></font>
<font color = 'blue'>Minnesota: Tim Walz (DEM)<br></font>
<font color = 'red'>New Hampshire: Chris Sununu (GOP)<br></font>
<font color = 'blue'>Maine: Janet Mills (DEM)<br></font>

In [None]:
rawCensus = load_dataset('county_complete.csv') #load census from local file
print('raw census:  ',rawCensus.shape) # Dimensions of the dataset
rawCensus = rawCensus.rename(columns = {'fips':'FIPS'}) #match FIPS to json feature ID
dropRowsCensus = rawCensus.index[~rawCensus['FIPS'].isin(electExt.index)] #census rows with no election counterpart
rawCensus = rawCensus.drop(dropRowsCensus).set_index('FIPS') #match indexes for merge
rawCensus['state'] = rawCensus['state'].str.upper()
swingElect = electExt[electExt['state'].isin(swing)]    #elect for swing states
swingCensus = rawCensus[rawCensus['state'].isin(swing)] #census for swing states
for swingFrame in (swingElect, swingCensus):
    swingFrame.index = swingFrame.index.map(str).str.zfill(5) #add leading zero to FIPS to match JSON feature ID
print('swing elect: ',swingElect.shape)
print('swing census:',swingCensus.shape)
print('\ncensus features:\n')
for featureName in swingCensus.columns: #one header per line because there are too many for a single row
    print(featureName)
swingCensus.head()

In [None]:
swingElect.head()

In [None]:
#DEM share per election year, with the bare year as column label
change = swingElect.filter(regex = 'perDEM_').copy()
change.columns = change.columns.str.replace('perDEM_','')
changeYears = list(map(int, change.columns))
changeYears.remove(2000) #2000 is the baseline year
change['shift'] = (change['2020'] - change['2000']) * 100
print(changeYears) #we will use this later to animate yearly change in voter splits (DEM-GOP)

In [None]:
#County population by year, extended with linear-trend estimates for 2020 and 2022.
pop = swingCensus.filter(regex = 'pop').copy()
pop.columns = pop.columns.str.replace('pop','')
pop.columns = pop.columns.str.replace('_','')
#Predictions are collected in plain lists and turned into a frame once;
#Series.append was removed in pandas 2.0 and re-copies the series on every call.
popPredictions = {'2020': [], '2022': []}
for eachCounty in pop.index: #FIPS
    catSeries = pop.loc[pop.index == eachCounty].T.dropna() #known (year, population) pairs of this county
    catSeries = catSeries.reset_index().rename(columns = {eachCounty:'cat','index':'year'})
    X = np.c_[catSeries['year'].astype(int)] #year labels are strings; the regressor needs numbers
    y = np.c_[catSeries['cat']]
    linRegMod = LinearRegression()
    linRegMod.fit(X, y) #one straight-line trend per county
    slope = linRegMod.coef_[0][0]
    intercept = linRegMod.intercept_[0]
    for predYear, predicted in popPredictions.items():
        predicted.append(float(slope * int(predYear) + intercept))
pop = pop.join(pd.DataFrame(popPredictions, index = pop.index))
pop = pop.sort_index(axis = 1)
pop = pop.apply(np.int32)
pop.head()

In [None]:
def predictWizard(catDf,predYears): #USE WHEN NEED MULTIPLE YEARS WITHOUT EVALUATING AGAINST POPULATION
    """Extend a county-by-year frame with linear-trend predictions.

    catDf     -- DataFrame indexed by county (FIPS) whose column labels are
                 years (anything int() accepts) and whose cells are the values
                 of one category; NaN cells are ignored for that county.
    predYears -- iterable of years (int or numeric string) to predict.

    One LinearRegression (value ~ year) is fitted per county on its non-NaN
    years and evaluated at every requested year.

    Returns a new DataFrame: catDf plus one column per requested year, with
    all column labels converted to str and the columns sorted by label.

    Raises whatever LinearRegression.fit raises when a county has no non-NaN
    value, and whatever DataFrame.join raises when a requested year label
    already exists in catDf.
    """
    predYears = list(predYears) #allow one-shot iterables; the years are walked twice
    countyIndex = catDf.index
    #One list of per-county predictions per requested year. Plain lists replace
    #Series.append, which was removed in pandas 2.0 and copied the series on every call.
    predictedByYear = [[] for _ in predYears]
    for eachCounty in countyIndex: # each county based on FIPS index
        catSeries = catDf.loc[catDf.index == eachCounty].T.dropna() #this county's known years as rows
        catSeries = catSeries.reset_index().rename(columns = {eachCounty:'cat','index':'year'})
        X = np.c_[catSeries['year'].astype(int)] #years are the single FEATURE/INDEPENDENT VARIABLE
        y = np.c_[catSeries['cat']]
        linRegMod = LinearRegression()
        linRegMod.fit(X, y) #train model on each FIPS' category data for ALL YEARS
        slope = linRegMod.coef_[0][0]
        intercept = linRegMod.intercept_[0]
        for predictions, predYear in zip(predictedByYear, predYears):
            predictions.append(float(slope * int(predYear) + intercept))
    for predYear, predictions in zip(predYears, predictedByYear):
        catDf = catDf.join(pd.DataFrame({predYear: predictions}, index = countyIndex))
    catDf.columns = catDf.columns.astype(str)
    catDf = catDf.sort_index(axis = 1)
    return catDf

def predictWizPop(catDf,predYears,pop): #USE FOR MULTIPLE PREDICTIONS EVALUATING DEMOGRAPHICS AS % OF POPULATION
    """Extend a county-by-year percentage frame with population-aware predictions.

    catDf     -- DataFrame indexed by county (FIPS) whose column labels are
                 years and whose cells are shares of the county population.
                 Predictions are clipped to [0, 100].
    predYears -- iterable of years (int or numeric string) to predict.
    pop       -- DataFrame indexed by county with one population column per
                 year, labelled with the year as str; it must contain every
                 year in predYears.

    Each share is multiplied by that year's population, a LinearRegression
    (scaled count ~ year) is fitted per county, and each prediction is divided
    by the population of the predicted year to turn it back into a share.

    Returns a new DataFrame: catDf plus one column per requested year, rounded
    to one decimal, with str column labels sorted by label.

    Raises KeyError when pop lacks a county or a predicted year, and whatever
    LinearRegression.fit raises when a county has no usable year.
    """
    predYears = list(predYears) #allow one-shot iterables; the years are walked twice
    #A year missing from pop becomes NaN and is dropped for that county below.
    #Filling it with 0 would add a false zero-count point to the regression.
    popFiltered = pop.reindex(columns = catDf.columns) #pop only of same years in catDf
    catCount = catDf * popFiltered #share x population, aligned on county and year
    #One list of per-county predictions per requested year. Plain lists replace
    #Series.append, which was removed in pandas 2.0 and copied the series on every call.
    predictedByYear = [[] for _ in predYears]
    for eachCounty in catCount.index:
        catSeries = catCount.loc[catCount.index == eachCounty].T.dropna() #this county's known years as rows
        catSeries = catSeries.reset_index().rename(columns = {eachCounty:'cat','index':'year'})
        X = np.c_[catSeries['year'].astype(int)]
        y = np.c_[catSeries['cat']]
        linRegMod = LinearRegression()
        linRegMod.fit(X, y) #train each county's (FIPS) model for ALL YEARS
        slope = linRegMod.coef_[0][0]
        intercept = linRegMod.intercept_[0]
        for predictions, predYear in zip(predictedByYear, predYears):
            popPredYear = pop.at[eachCounty, str(predYear)]
            predicted = float(slope * int(predYear) + intercept)
            predicted = predicted / popPredYear #convert predicted count back to percentage based on pop that year
            if predicted > 100:
                predicted = 100
            elif predicted < 0:
                predicted = 0
            predictions.append(predicted)
    for predYear, predictions in zip(predYears, predictedByYear):
        #indexed by catCount.index because that is the order the counties were visited in
        catDf = catDf.join(pd.DataFrame({predYear: predictions}, index = catCount.index))
    catDf.columns = catDf.columns.astype(str)
    catDf = catDf.round(1)
    catDf = catDf.sort_index(axis = 1)
    return catDf

In [None]:
#Extrapolate every selected census feature to the election years.
#predictWizard projects the raw values; predictWizPop projects shares via population counts.
#NOTE: pop must keep bare year labels until the last line, because predictWizPop looks
#population up by year; the 'pop_' prefix is therefore added only at the very end.
pop = pop.sort_index(axis = 1)
pop = pop.apply(np.int64)
print('population')
print(pop.info()) #don't add "pop_" prefix yet* -- use annual population data to evaluate demographic %

income = swingCensus.filter(regex = 'median_household_income_').copy()
income.columns = income.columns.str.replace('median_household_income_','')
income = predictWizard(income,(2022,2020))
income = income.add_prefix('median_household_income_')
income = income.apply(np.int64)
print('\nincome')
print(income.info())

per_capita_income = swingCensus.filter(regex = 'per_capita_income_').copy()
per_capita_income.columns = per_capita_income.columns.str.replace('per_capita_income_','')
per_capita_income = predictWizard(per_capita_income,(2022,2020,2016))
per_capita_income = per_capita_income.add_prefix('per_capita_income_')
per_capita_income = per_capita_income.apply(np.int64)
print('\nper capita income')
print(per_capita_income.info())

white = swingCensus[['white_2010','white_2019']].copy()
white.columns = white.columns.str.replace('white_','')
white = predictWizPop(white,(2022,2020,2016),pop)
white = white.add_prefix('white_')
print('\nwhite')
print(white.info())

white_not_hispanic = swingCensus.filter(regex = 'white_not_hispanic_').copy()
white_not_hispanic.columns = white_not_hispanic.columns.str.replace('white_not_hispanic_','')
white_not_hispanic = predictWizPop(white_not_hispanic,(2022,2020,2016),pop)
white_not_hispanic = white_not_hispanic.add_prefix('white_not_hispanic_')
print('\nwhite not hispanic')
print(white_not_hispanic.info())

black = swingCensus[['black_2010','black_2017','black_2019']].copy()
black.columns=black.columns.str.replace('black_','')
#assign the filled column back: a chained .fillna(..., inplace = True) on a column selection
#does not update the frame under pandas Copy-on-Write, and warnings are silenced in this notebook
black['2010'] = black['2010'].fillna(black['2017'])
black = predictWizPop(black,(2022,2020,2016),pop)
black = black.add_prefix('black_')
print('\nblack')
print(black.info())

asian = swingCensus[['asian_2010','asian_2017','asian_2019']].copy()
asian.columns = asian.columns.str.replace('asian_','')
asian['2010'] = asian['2010'].fillna(asian['2017']) #assigned back for the same reason as black['2010']
asian = predictWizPop(asian,(2022,2020,2016),pop)
asian = asian.add_prefix('asian_')
print('\nasian')
print(asian.info())

hispanic = swingCensus[['hispanic_2010','hispanic_2017','hispanic_2019']].copy()
hispanic.columns = hispanic.columns.str.replace('hispanic_','')
hispanic = predictWizPop(hispanic,(2022,2020,2016),pop)
hispanic = hispanic.add_prefix('hispanic_')
print('\nhispanic')
print(hispanic.info())

two_plus_races = swingCensus.filter(regex = 'two_plus_races_').copy()
two_plus_races.columns = two_plus_races.columns.str.replace('two_plus_races_','')
two_plus_races = predictWizPop(two_plus_races,(2022,2020,2016),pop)
two_plus_races = two_plus_races.add_prefix('two_plus_races_')
print('\ntwo plus races')
print(two_plus_races.info())

hs_grad = swingCensus.filter(regex = 'hs_grad_').copy()
hs_grad.columns = hs_grad.columns.str.replace('hs_grad_','')
hs_grad = predictWizPop(hs_grad,(2022,2020),pop)
hs_grad = hs_grad.add_prefix('hs_grad_')
print('\nHS grad')
print(hs_grad.info())

bachelors = swingCensus.filter(regex = 'bachelors_').copy()
bachelors.columns = bachelors.columns.str.replace('bachelors_','')
bachelors = predictWizPop(bachelors,(2022,2020),pop)
bachelors = bachelors.add_prefix('bachelors_')
print('\nbachelors')
print(bachelors.info())

civilian_labor_force = swingCensus.filter(regex = 'civilian_labor_force_').copy()
civilian_labor_force.columns = civilian_labor_force.columns.str.replace('civilian_labor_force_','')
civilian_labor_force = predictWizard(civilian_labor_force,(2022,2020))
civilian_labor_force = civilian_labor_force.add_prefix('civilian_labor_force_')
civilian_labor_force = civilian_labor_force.apply(np.int64)
print('\ncivilian labor force')
print(civilian_labor_force.info())

poverty = swingCensus[['poverty_2010','poverty_2016','poverty_2017','poverty_2019']].copy()
poverty.columns = poverty.columns.str.replace('poverty_','')
poverty['2019'] = poverty['2019'].fillna(poverty['2017']) #assigned back for the same reason as black['2010']
poverty = predictWizard(poverty,(2022,2020))
poverty = poverty.add_prefix('poverty_')
poverty = poverty.round(1)
print('\npoverty')
print(poverty.info())

unemployment = swingCensus.filter(regex = 'unemployment_rate_').copy()
unemployment.columns = unemployment.columns.str.replace('unemployment_rate_','')
unemployment = predictWizPop(unemployment,(2022,2020),pop)
unemployment = unemployment.add_prefix('unemployment_rate_')
unemployment = unemployment.round(1)
print('\nunemployment')
print(unemployment.info())

median_age = swingCensus.filter(regex='median_age_').copy()
median_age.columns = median_age.columns.str.replace('median_age_','')
median_age = predictWizard(median_age,(2022,2020,2016))
median_age = median_age.add_prefix('median_age_')
median_age = median_age.round(1)
print('\nmedian age')
print(median_age.info())

pop = pop.add_prefix('pop_') #from here on pop columns read pop_<year>; re-running this cell needs a fresh pop

In [None]:
#one wide frame: population plus every extrapolated feature, all indexed by FIPS
addDfs = [income,per_capita_income,white,white_not_hispanic,black,asian,hispanic,two_plus_races,
          hs_grad,bachelors,civilian_labor_force,poverty,unemployment,median_age]
selectCensus = pop.join(addDfs)
selectCensus['area'] = swingCensus['area_2010']
basePop = selectCensus['pop_2000']
selectCensus['popShift'] = 100 * (selectCensus['pop_2020'] - basePop) / basePop #percent change since 2000
for densityYear in ('2016','2020','2022'):
    selectCensus['popDensity_' + densityYear] = selectCensus['pop_' + densityYear] / selectCensus['area'] #Density = number/area
selectCensus.head()

In [None]:
#IDENTICAL FEATURE SHAPES FOR EACH YEAR
#every column whose name contains the year goes into that year's frame
select2016, select2020, select2022 = (
    selectCensus.filter(regex = featureYear).copy() for featureYear in ('2016','2020','2022')
)
for yearlyFeatures in (select2016, select2020, select2022):
    print(yearlyFeatures.info())
select2022.head()

In [None]:
#Project each county's DEM share to 2020 (back-test against the known result) and to 2022,
#using a straight-line trend fitted on the elections before 2020.
predElect = change.drop(['2020','shift'], axis = 1).copy()
target = change['2020']
#Predictions are collected in plain lists and turned into a frame once;
#Series.append was removed in pandas 2.0 and re-copies the series on every call.
electPredictions = {'2020': [], '2022': []}
for eachCounty in predElect.index: #FIPS
    #the row mask must come from predElect itself: masking with another frame's index
    #(pop.index) fails or picks the wrong county whenever the two indexes differ
    catSeries = predElect.loc[predElect.index == eachCounty].T.dropna() #this county's known years as rows
    catSeries = catSeries.reset_index().rename(columns = {eachCounty:'cat','index':'year'})
    X = np.c_[catSeries['year'].astype(int)] #year labels are strings; the regressor needs numbers
    y = np.c_[catSeries['cat']]
    linRegMod = LinearRegression()
    linRegMod.fit(X, y)
    slope = linRegMod.coef_[0][0]
    intercept = linRegMod.intercept_[0]
    for predYear, predictions in electPredictions.items():
        predicted = float(slope * int(predYear) + intercept)
        predictions.append(min(1.0, max(0.0, predicted))) #a vote share cannot leave [0, 1]
predElect = predElect.join(pd.DataFrame(electPredictions, index = predElect.index))
predElect = predElect.sort_index(axis = 1)
predElect = predElect.round(3)
print('DEM % predictions: 2020 & 2022:')
predElect.head(10)

In [None]:
predElect.describe()

In [None]:
#compare the trend-based 2020 prediction with the actual 2020 DEM share
checkResult = predElect['2020'].to_frame('pred2020')
checkResult['target2020'] = change['2020']
checkResult['error2020'] = checkResult['pred2020'] - checkResult['target2020']
print(checkResult.describe())
actual2020, fitted2020 = checkResult['target2020'], checkResult['pred2020']
scoreLines = (('\nr2 score:           ', r2_score),
              ('mean squared error: ', mean_squared_error),
              ('mean absolute error:', mean_absolute_error))
for scoreLabel, scoreFunc in scoreLines:
    print(scoreLabel,round(scoreFunc(actual2020, fitted2020),4))
ax1 = checkResult.plot.scatter(x = 'target2020', y = 'pred2020', c = 'error2020',
                               colormap = 'RdBu', alpha = 0.5)
plt.plot(actual2020, actual2020, color = 'blue', label = 'target') #y = x line: error = 0;

In [None]:
#Roll the county-level 2020 predictions up to state level and print them next to the actual result.
predElect['state'] = swingElect['state']
addElect2020 = swingElect.filter(regex = '2020').copy()
predElect = predElect.join(addElect2020)
predElect['predDEM_2020'] = predElect['2020'] * predElect['TOT_2020'] #predicted share x actual two-party turnout
SwStTOT_2020 = predElect.groupby(['state']).TOT_2020.sum()                         #total 20 vote count by state
SwStDEM_2020_pred = predElect.groupby(['state']).predDEM_2020.sum()                #pred 20 DEM vote count by state
SwStPerDEM_2020_pred = (SwStDEM_2020_pred / SwStTOT_2020).round(decimals = 4).sort_values() #pred percent DEM 20 by state
print('Actual 2020:')
#select the swing states by label; a fixed positional slice silently shows the wrong
#states as soon as the data or the swing thresholds change
print(StPerDEM_2020.loc[swing]) #actual
print('\nPredicted 2020:')
print(SwStPerDEM_2020_pred) #predicted 

In [None]:
#standardise the 2020 features, then inspect how much variance each added principal component explains
X = select2020.drop(columns = ['pop_2020','civilian_labor_force_2020']) #drop; high corr w/ popDensity
print(X.info())
stdScaler = StandardScaler()
Xscale = stdScaler.fit_transform(X)
scaledFeatures20 = pd.DataFrame(Xscale, index = X.index, columns = X.columns)
pca = PCA().fit(scaledFeatures20)
cumsum = pca.explained_variance_ratio_.cumsum()
plt.figure(figsize = (10, 6))
plt.plot(cumsum)
plt.title('Variance x Feature Components',fontsize = 20)
plt.xlabel('Number of Features',fontsize = 15)
plt.ylabel('Cumulative Explained Variance',fontsize = 15)
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.show()
cumsum

In [None]:
#PCA ON 8 COMPONENTS (Selected 8 because wanted CUMSUM > 0.95, otherwise would've selected 3 at "elbow")
pca20 = PCA(n_components = 8).fit(scaledFeatures20)
pcaScores20 = pca20.transform(scaledFeatures20)
pca8Features20 = pd.DataFrame(pcaScores20,index = scaledFeatures20.index) 
clusterCounts = range(2,13) #candidate numbers of clusters
silhouetteScores = []
for k in clusterCounts: #iterate on number of clusters to find optimal score
    #seeded so the scores (and the k chosen from them) are identical on every run
    km = KMeans(n_clusters = k, random_state = RANDOM_STATE)
    y_pred = km.fit_predict(pca8Features20) #train each model
    score = silhouette_score(pca8Features20, y_pred) #score each trained model
    silhouetteScores.append(score) #add score to array of scores
    print('For k = {}, silhouette score on scaled PCA(8) data: {}'.format(k, round(score,4)))
plt.figure(figsize=(10,6))
plt.plot(clusterCounts, silhouetteScores)
plt.title('Selecting number of clusters k using silhouette score (Scaled PCA(8) Data)', fontsize = 20)
plt.xlabel('K (No. of Clusters)',fontsize = 15)
plt.ylabel('Silhouette Score',fontsize = 15)
plt.yticks(fontsize = 10)
plt.xticks(fontsize = 10)
plt.show()

In [None]:
#CLUSTERING ON 4 CLUSTERS
#seeded: later cells interpret the clusters by their label (0-3), and without a fixed
#random_state both the partition and the label numbering can change between runs
km = KMeans(n_clusters = 4, random_state = RANDOM_STATE)
cluster4_pca8Features20 = pca8Features20.copy() #copy preprocessed dataset
y_pred_20  = km.fit_predict(cluster4_pca8Features20)
cluster4_pca8Features20['cluster20'] = y_pred_20

In [None]:
#attach the election result, cluster label, state and population shift to the 2020 features
select2020 = select2020.join(addElect2020)
select2020['cluster_2020'] = y_pred_20
select2020['state'] = swingElect['state']
select2020['popShift'] = selectCensus['popShift']

select2020['binary_DEM2020'] = np.where(select2020['perDEM_2020'] > 0.5, 1, 0)
print(select2020['cluster_2020'].value_counts())

#(feature column, plot title): one violin plot per entry, split by cluster
violinPanels = [
    ('binary_DEM2020',          'Binary Democrat 2020 (1:DEM, 0:GOP)'),
    ('perDEM_2020',             'Percent Democrat 2020'),
    ('popDensity_2020',         'Population Density 2020'),
    ('bachelors_2020',          'Percent with Bachelors Degree 2020'),
    ('hs_grad_2020',            'Percent of High School Graduates 2020'),
    ('white_2020',              'Demographic: White, 2020'),
    ('white_not_hispanic_2020', 'Demographic: White -- Not Hispanic, 2020'),
    ('black_2020',              'Demographic: Black, 2020'),
    ('asian_2020',              'Demographic: Asian, 2020'),
    ('hispanic_2020',           'Demographic: Hispanic, 2020'),
    ('unemployment_rate_2020',  'Unemployment 2020'),
    ('median_age_2020',         'Median Age 2020'),
    ('popShift',                'Population Shift 2000 - 2020'),
]
for featureCol, panelTitle in violinPanels:
    sns.violinplot(x = select2020['cluster_2020'],
                   y = select2020[featureCol],
                   data = select2020)
    if featureCol == 'perDEM_2020':
        plt.axhline(0.5) #mark the 50% line on the vote-share plot only
    plt.title(panelTitle,fontsize = 15)
    plt.show()

In [None]:
#Correlate every numeric feature with the one-hot cluster membership columns.
select2020_encoded = pd.get_dummies(select2020, columns = ['cluster_2020'])
#select2020 also holds the text column 'state'; keep only numeric/boolean columns so
#.corr() does not raise on pandas >= 2.0 (older versions dropped such columns silently)
select2020_encoded_corr = select2020_encoded.select_dtypes(include = ['number','bool']).corr()
cluster0_features20 = select2020_encoded_corr['cluster_2020_0'].sort_values(ascending = False)
print('Feature Correlation to Cluster Group 0 Prediction:')
print(cluster0_features20)
cluster1_features20 = select2020_encoded_corr['cluster_2020_1'].sort_values(ascending = False)
print('\nFeature Correlation to Cluster Group 1 Prediction:')
print(cluster1_features20)
cluster2_features20 = select2020_encoded_corr['cluster_2020_2'].sort_values(ascending = False)
print('\nFeature Correlation to Cluster Group 2 Prediction:')
print(cluster2_features20)
cluster3_features20 = select2020_encoded_corr['cluster_2020_3'].sort_values(ascending = False)
print('\nFeature Correlation to Cluster Group 3 Prediction:')
print(cluster3_features20)
#per-cluster means of the numeric features only, for the same reason as above
select2020.groupby(['cluster_2020']).mean(numeric_only = True)

In [None]:
#county map coloured by cluster label
mapThis = select2020[['cluster_2020']].reset_index()
fig = px.choropleth(
    mapThis,
    geojson = counties,
    locations = 'FIPS',
    color = 'cluster_2020',
    range_color = (0,3),
    scope = 'usa',
    labels = {'cluster_2020':'Cluster'},
    title = '4-Cluster on StdScaled PCA(8) Census Data',
)
fig.update_geos(fitbounds = 'locations') #use ONLY IF dataset is a state vs the entire USA
fig.update_layout(margin = dict(r = 0, t = 30, l = 0, b = 0),
                  coloraxis_colorbar = {'tickvals':[0,1,2,3]})
fig.show()

In [None]:
#Long-format frame (one row per county and election year) for the animated map.
#The yearly frames are collected in a list and concatenated once instead of
#re-copying a growing frame on every loop pass.
yearFrames = []
for each in ['2000'] + [str(year) for year in changeYears]: #2000 was removed from changeYears earlier
    add = (100 * change[each]).to_frame('perDEM') #share -> percent
    add = add.assign(year = each).reset_index()
    yearFrames.append(add)
animateThis = pd.concat(yearFrames)
print(animateThis.info())
fig = px.choropleth(animateThis, geojson = counties, locations = 'FIPS', color = 'perDEM',
                    color_continuous_scale = 'rdbu',
                    range_color = (0,100),
                    scope = 'usa',
                    animation_frame = 'year',
                    animation_group = 'FIPS', #rows sharing a FIPS are the same county across frames
                    labels = {'perDEM':'DEM %'},
                    title = 'Percent DEM by Year',
                    fitbounds = 'locations',
                    height = 600
                   )
fig.update_layout(margin = {'r':0,'t':30,'l':0,'b':0}) #margins right,top,left,bottom
fig.show()
animateThis.head()

In [None]:
# Summarise and map the 'popShift' column of selectCensus (index becomes the 'FIPS' column)
mapThis = selectCensus['popShift'].reset_index()
print('Population Shift Percentages Over Last Two Decades:')
print(mapThis.describe())
fig = px.choropleth(
    mapThis,
    geojson=counties,
    locations='FIPS',
    color='popShift',
    color_continuous_scale='rdbu',
    range_color=(-50, 165),  # fixed colour range for the growth percentages
    scope='usa',
    labels={'popShift': 'Percentage Growth'},
    title='Population Growth Percentage 2000-2020 (If < 0 = Population Decline)',
).update_geos(
    fitbounds='locations'  # zoom to the plotted locations
).update_layout(
    margin=dict(r=0, t=30, l=0, b=0)  # margins right, top, left, bottom
)
fig.show()

In [None]:
# County map of the 'shift' column of change (index becomes the 'FIPS' column)
mapThis = change['shift'].reset_index()
fig = px.choropleth(
    mapThis,
    geojson=counties,
    locations='FIPS',
    color='shift',
    color_continuous_scale='rdbu',
    range_color=(-35, 35),  # symmetric colour range around zero shift
    scope='usa',
    labels={'shift': '% shift'},
    title='Percent DEM shift 2000 - 2020',
).update_geos(
    fitbounds='locations'  # zoom to the plotted locations
).update_layout(
    margin=dict(r=0, t=30, l=0, b=0)  # margins right, top, left, bottom
)
fig.show()

In [None]:
# Extend the county prediction table with 2022 columns
predElect22 = predElect.copy()
predElect22['state'] = swingElect['state']
# TOT_2022 = TOT_2020 scaled by the ratio pop_2022 / pop_2020 (aligned on the shared index)
predElect22['TOT_2022'] = predElect['TOT_2020'].mul(
    select2022['pop_2022'].div(select2020['pop_2020'])
)
# Bring in every swingElect column whose name contains '2022'
addElect2022 = swingElect.filter(regex='2022').copy()
predElect22 = predElect22.join(addElect2022)
# predDEM_2022 = '2022' column times TOT_2022
predElect22['predDEM_2022'] = predElect22['2022'].mul(predElect22['TOT_2022'])

# Map the '2022' column scaled by 100 (labelled 'DEM %')
mapThis = (100 * predElect22['2022']).reset_index()
fig = px.choropleth(
    mapThis,
    geojson=counties,
    locations='FIPS',
    color='2022',
    color_continuous_scale='rdbu',
    range_color=(0, 100),
    scope='usa',
    labels={'2022': 'DEM %'},
    title='Percent DEM prediction 2022',
).update_geos(
    fitbounds='locations'  # zoom to the plotted locations
).update_layout(
    margin=dict(r=0, t=30, l=0, b=0)  # margins right, top, left, bottom
)
fig.show()

In [None]:
# Roll the county-level 2022 columns of predElect22 up to state level
SwStTOT_2022 = predElect22.groupby('state')['TOT_2022'].sum()             #TOT_2022 summed by state
SwStDEM_2022_pred = predElect22.groupby('state')['predDEM_2022'].sum()    #predDEM_2022 summed by state
# Ratio of the two state sums, rounded to 4 decimals and sorted ascending
SwStPerDEM_2022_pred = SwStDEM_2022_pred.div(SwStTOT_2022).round(4).sort_values()
print('Predicted 2022:')
print(SwStPerDEM_2022_pred)

In [None]:
# Print the column names, non-null counts and dtypes of select2022 for inspection.
select2022.info()

In [None]:
# Scale the 2022 feature table, reduce it to 8 principal components and label each row with km.predict.
# NOTE(review): km is defined outside this cell (the 2022 map title says "Fit 2020, Predict 2022").
# The scaler and PCA below are re-fit on the 2022 data, so their axes are not guaranteed to match
# the space km was trained in -- consider reusing the 2020-fitted scaler/PCA instead. TODO confirm.

derivedCols22 = ['cluster_2022','state'] #written into select2022 at the end of this cell
X_2022 = select2022.drop(columns = ['pop_2022','civilian_labor_force_2022']) #drop; high corr w/ popDensity
X_2022 = X_2022.drop(columns = derivedCols22, errors = 'ignore') #keeps the cell re-runnable: outputs of a previous run are not features

stdScaler = StandardScaler()
Xscale_2022 = stdScaler.fit_transform(X_2022) #fit + transform in one step
scaledFeatures22 = pd.DataFrame(Xscale_2022, columns = X_2022.columns, index = X_2022.index)

pca = PCA().fit(scaledFeatures22) #full-component fit; not used further in this cell, kept so the global `pca` is still set
pca22 = PCA(n_components = 8, random_state = RANDOM_STATE).fit(scaledFeatures22) #seeded for reproducible components
pcaScores22 = pca22.transform(scaledFeatures22)
pca8Features22 = pd.DataFrame(pcaScores22,index = scaledFeatures22.index)

cluster4_pca8Features22 = pca8Features22.copy() #copy preprocessed dataset
y_pred_22  = km.predict(cluster4_pca8Features22) #predict before adding the label column to the copy
cluster4_pca8Features22['cluster22'] = y_pred_22
select2022['cluster_2022'] = y_pred_22
select2022['state'] = swingElect['state']

print('2020 4-clusters:')
print(select2020['cluster_2020'].value_counts())
print('\n2022 4-clusters:')
print(select2022['cluster_2022'].value_counts())

In [None]:
# County map of the 2022 cluster labels; reset_index() exposes the index as the 'FIPS' column plotly matches on
mapThis = select2022['cluster_2022'].reset_index()
fig = px.choropleth(
    mapThis,
    geojson=counties,
    locations='FIPS',
    color='cluster_2022',
    range_color=(0, 3),  # default colour scale, pinned to the 0-3 range
    scope='usa',
    labels={'cluster_2022': 'Cluster'},
    title='4-Cluster on StdScaled PCA(8) Census Data; Fit 2020, Predict 2022',
).update_geos(
    fitbounds='locations'  # zoom to the plotted locations; intended for a state-sized subset rather than the whole USA
).update_layout(
    margin=dict(r=0, t=30, l=0, b=0),  # margins right, top, left, bottom
    coloraxis_colorbar_tickvals=[0, 1, 2, 3],  # one colour-bar tick per cluster value
)
fig.show()