In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Hopothesis

My hypothesis would be that higher entropy within a party’s primaries could suggest that the party is more divided, which might lead to a less unified voter base. This could potentially reduce their chances of winning in the general election, as the lack of cohesion might hinder their ability to mobilize support effectively.

On the other hand, lower entropy might indicate a more unified party, which could increase their chances of winning the state, as they would likely have a more cohesive voter base heading into the general election.

So, if the entropy is higher in a party's primaries, the party might be at a disadvantage in terms of winning the state, assuming that voter fragmentation impacts overall suppo

In [3]:
# Reading in Data
df_Democrats = pd.read_excel(
    "precinct-dem.xlsx",
    sheet_name="Master",
    engine="openpyxl",
    usecols=[0,1,2,7,8,9],
    header=1,
    skiprows=[2,3]
)

df_Republicans = pd.read_excel(
    "precinct-rep.xlsx",
    sheet_name="Master",
    engine="openpyxl",
    usecols=[1,2,7,8,9,10,11,12,13,14,15,16],
    header=1,
    skiprows=[2,3]
)

FileNotFoundError: [Errno 2] No such file or directory: 'precinct-dem.xlsx'

In [37]:
column_names_Dem = df_Democrats.columns.tolist()
column_names_Rep = df_Republicans.columns.tolist()

In [38]:
# Ensure numeric columns for vote calculation
candidates_Rep = column_names_Rep[2:]  # Assume the first two columns are not candidates
base_Rep = len(candidates_Rep)

# Calculate total number of votes for Republican candidates
df_Republicans['Total Number of Votes'] = df_Republicans[candidates_Rep].apply(pd.to_numeric, errors='coerce').sum(axis=1)

# Calculate the percentage for each candidate
for candidate in candidates_Rep:
    percentage_col = f'{candidate.split(" (")[0]} %'  # Create percentage column name
    df_Republicans[percentage_col] = (
        df_Republicans[candidate] / df_Republicans['Total Number of Votes']
    ).fillna(0)  # Handle NaNs by filling with 0

# Create a list of percentage columns
percentage_cols = [f'{candidate.split(" (")[0]} %' for candidate in candidates_Rep]

# Calculate entropy using log base 3
df_Republicans['Entropy'] = -df_Republicans[percentage_cols].apply(
    lambda row: sum(row * np.log(row + 1e-9) / np.log(base_Rep)), axis=1
)


In [39]:
df_Democrats

Unnamed: 0,County Name,Precinct Name,Precinct Code,Hillary Clinton (D),"Roque ""Rocky"" De La Fuente (D)",Bernie Sanders (D)
0,Adams,BRATTON TOWNSHIP,AAA,27,1,26
1,Adams,BRUSH CREEK TOWNSHIP,AAB,36,0,28
2,Adams,LOCUST GROVE,AAD,39,0,34
3,Adams,GREEN TOWNSHIP,AAE,35,0,22
4,Adams,JEFFERSON TOWNSHIP,AAG,22,1,21
...,...,...,...,...,...,...
8882,Wyandot,RIDGE TS,ABH,17,1,8
8883,Wyandot,SALEM TS,ABI,38,1,18
8884,Wyandot,SYCAMORE VILLAGE,ABJ,31,1,28
8885,Wyandot,SYCAMORE TS,ABL,20,1,15


## Democrat

In [40]:
# Calculate total votes
candidates_Dem = column_names_Dem[3:]
base_Dem = len(column_names_Dem[3:])
df_Democrats['Total Number of Votes(Dem)'] = df_Democrats[candidates_Dem].sum(axis=1)

# Calculate the percentage for each candidate
for candidate in candidates_Dem:
    percentage_col_Dem = f'{candidate.split(" (")[0]} %'  # Create percentage column name
    df_Democrats[percentage_col_Dem] = df_Democrats[candidate] / df_Democrats['Total Number of Votes(Dem)']

# Calculate entropy using log base 3
percentage_cols_Dem = [f'{candidate.split(" (")[0]} %' for candidate in candidates_Dem]
df_Democrats['Entropy'] = -df_Democrats[percentage_cols_Dem].apply(
    lambda row: sum(row * np.log(row + 1e-9) / np.log(base_Dem)), axis=1
)


## Republican

In [42]:
# Calculate total votes
candidates_Rep = column_names_Rep[2:]
base_Rep = len(column_names_Rep[2:])
df_Republicans['Total Number of Votes(Rep)'] = df_Republicans[column_names_Rep].sum(axis=1)

# Calculate the percentage for each candidate
for candidate in candidates_Rep:
    percentage_col_Rep = f'{candidate.split(" (")[0]} %'  # Create percentage column name
    df_Republicans[percentage_col_Rep] = df_Republicans[candidate] / df_Republicans['Total Number of Votes(Rep)']

# Calculate entropy using log base 3
percentage_cols_Rep = [f'{candidate.split(" (")[0]} %' for candidate in candidates_Rep]
df_Republicans['Entropy'] = -df_Republicans[percentage_cols].apply(
    lambda row: sum(row * np.log(row + 1e-9) / np.log(base_Rep)), axis=1
)

  df_Republicans['Total Number of Votes(Rep)'] = df_Republicans[column_names_Rep].sum(axis=1)


In [None]:
df_Democrats

In [None]:
data1 = df_Democrats["Entropy"]
data2 = df_Republicans["Entropy"]

In [None]:
# Plotting Data
plt.hist(data1, bins=30, alpha=0.5, label='Democrats', color='blue')
plt.hist(data2, bins=30, alpha=0.5, label='Republicans', color='red')

# Add labels and a legend
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Entropy Histogram')
plt.legend()

# Display the plot
plt.show()

In [None]:
df_Republicans.columns

In [None]:
# Filtering out data based on who won

democrat_entropy_median = df_Democrats[
    (df_Democrats["Hillary Clinton %"] >= df_Democrats['Roque "Rocky" De La Fuente %']) & 
    (df_Democrats["Hillary Clinton %"] >= df_Democrats["Bernie Sanders %"])
]["Entropy"].median()

republican_entropy_median = df_Republicans[
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Jeb  Bush %"]) & 
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Ben Carson %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Chris  Christie %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Carly Fiorina %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Mike  Huckabee %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["John R. Kasich %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Rick Santorum %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Ted  Cruz %"]) &  
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Marco Rubio %"])
]["Entropy"].median()

In [None]:
democrat_entropy_mean = df_Democrats[
    (df_Democrats["Hillary Clinton %"] >= df_Democrats['Roque "Rocky" De La Fuente %']) & 
    (df_Democrats["Hillary Clinton %"] >= df_Democrats["Bernie Sanders %"])
]["Entropy"].mean()

republican_entropy_mean = df_Republicans[
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Jeb  Bush %"]) & 
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Ben Carson %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Chris  Christie %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Carly Fiorina %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Mike  Huckabee %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["John R. Kasich %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Rick Santorum %"]) &
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Ted  Cruz %"]) &  
    (df_Republicans["Donald J. Trump %"] >= df_Republicans["Marco Rubio %"])
]["Entropy"].mean()

In [None]:
# Print all 4 data pieces (median and mean) for both parties
print(f"Democratic Entropy Median: {democrat_entropy_median:.4f}")
print(f"Republican Entropy Median: {republican_entropy_median:.4f}")
print(f"Democratic Entropy Mean: {democrat_entropy_mean:.4f}")
print(f"Republican Entropy Mean: {republican_entropy_mean:.4f}")

In [None]:
df_Democrats