In [1]:
# import dependencies

import os
import csv
import pandas as pd

### 1. Merging of the detailed CSVs which contain individual posts, their corresponding positive, negative and neutral sentiment scores, and the candidate the sentiment refers to.

##### 2016 Dataset cleaning and preparation for merge

In [2]:
# Load the 2016 per-post sentiment file into a DataFrame
ind_post_data_2016_df = pd.read_csv("../CSV_Outputs/sent_and_cand_by_post_2016.csv")

# Preview the first five rows
ind_post_data_2016_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,token_the,token_question,token_in,token_this,token_election,...,token_illustrates,token_duh,token_lilredfrmkokomo,token_lowell,token_iloveidevices,token_minimizing,token_dependency,token_salriccobono,token_troyconway,candidate
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,Democrat
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",,,,,,...,,,,,,,,,,Democrat
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,,,,,,...,,,,,,,,,,Democrat
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",,,,,,...,,,,,,,,,,Democrat
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,,,,,,...,,,,,,,,,,Democrat


In [3]:
# Collect every column whose name begins with 'token_' and remove them all at once
columns_to_drop = [
    name for name in ind_post_data_2016_df.columns
    if name.startswith('token_')
]
ind_post_data_2016_df = ind_post_data_2016_df.drop(columns_to_drop, axis='columns')

# Preview the remaining columns
ind_post_data_2016_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,candidate
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,Democrat
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",Democrat
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,Democrat
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",Democrat
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,Democrat


In [6]:
# Attach a constant 'Year' column; parsing '2016' with format '%Y' yields the
# timestamp 2016-01-01, which is stored on every row
ind_post_data_2016_df = ind_post_data_2016_df.assign(
    Year=pd.to_datetime('2016', format='%Y')
)

# Preview the frame with the new column
ind_post_data_2016_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,Democrat,2016-01-01
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",Democrat,2016-01-01
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,Democrat,2016-01-01
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",Democrat,2016-01-01
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,Democrat,2016-01-01


In [7]:
# Capitalise the 'text' and 'candidate' column labels
ind_post_data_2016_df = ind_post_data_2016_df.rename(
    {'text': 'Text', 'candidate': 'Candidate'},
    axis='columns',
)

# Preview the renamed frame
ind_post_data_2016_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,Democrat,2016-01-01
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",Democrat,2016-01-01
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,Democrat,2016-01-01
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",Democrat,2016-01-01
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,Democrat,2016-01-01


##### 2020 Dataset cleaning and preparation for merge

In [8]:
# Load the 2020 per-post sentiment file into a DataFrame
ind_post_data_2020_df = pd.read_csv("../CSV_Outputs/sent_and_cand_by_post_2020.csv")

# Preview the first five rows
ind_post_data_2020_df.head()

Unnamed: 0,id,handle,text,time,candidate,unique_id,Compound,Positive,Negative,Neutral
0,1.316529e+18,elsollatinonews,#Elecciones2020 | En #Florida: #JoeBiden dice ...,2020-10-15 00:00:01,democrat,0,0.0,0.0,0.0,1.0
1,1.316529e+18,snarke,"#Trump: As a student I used to hear for years,...",2020-10-15 00:00:02,republican,1,0.5905,0.071,0.0,0.929
2,1.316529e+18,Ranaabtar,You get a tie! And you get a tie! #Trump ‘s ra...,2020-10-15 00:00:08,republican,2,0.0,0.0,0.0,1.0
3,1.316529e+18,FarrisFlagg,@CLady62 Her 15 minutes were over long time ag...,2020-10-15 00:00:17,republican,3,-0.4912,0.0,0.126,0.874
4,1.316529e+18,sm_gulledge,@DeeviousDenise @realDonaldTrump @nypost There...,2020-10-15 00:00:18,republican,4,-0.2617,0.056,0.078,0.866


In [9]:
# Columns to keep, in the order they should appear in the merged data
desired_columns = ['Compound', 'Positive', 'Negative', 'Neutral', 'text', 'candidate']

# Select and reorder the DataFrame columns.
# The explicit .copy() makes the result an independent frame, so adding or
# overwriting columns on it later does not raise pandas' SettingWithCopyWarning.
ind_post_data_2020_df = ind_post_data_2020_df[desired_columns].copy()

# Display the resulting DataFrame
ind_post_data_2020_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,candidate
0,0.0,0.0,0.0,1.0,#Elecciones2020 | En #Florida: #JoeBiden dice ...,democrat
1,0.5905,0.071,0.0,0.929,"#Trump: As a student I used to hear for years,...",republican
2,0.0,0.0,0.0,1.0,You get a tie! And you get a tie! #Trump ‘s ra...,republican
3,-0.4912,0.0,0.126,0.874,@CLady62 Her 15 minutes were over long time ag...,republican
4,-0.2617,0.056,0.078,0.866,@DeeviousDenise @realDonaldTrump @nypost There...,republican


In [11]:
# Add a 'Year' column holding the same timestamp on every row
# (parsing '2020' with format '%Y' yields 2020-01-01).
# .assign() returns a new frame instead of writing into the existing one, so no
# SettingWithCopyWarning is raised even when the frame was produced by selecting
# from another frame.
ind_post_data_2020_df = ind_post_data_2020_df.assign(
    Year=pd.to_datetime('2020', format='%Y')
)

# Verify the result
ind_post_data_2020_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,candidate,Year
0,0.0,0.0,0.0,1.0,#Elecciones2020 | En #Florida: #JoeBiden dice ...,democrat,2020-01-01
1,0.5905,0.071,0.0,0.929,"#Trump: As a student I used to hear for years,...",republican,2020-01-01
2,0.0,0.0,0.0,1.0,You get a tie! And you get a tie! #Trump ‘s ra...,republican,2020-01-01
3,-0.4912,0.0,0.126,0.874,@CLady62 Her 15 minutes were over long time ag...,republican,2020-01-01
4,-0.2617,0.056,0.078,0.866,@DeeviousDenise @realDonaldTrump @nypost There...,republican,2020-01-01


In [12]:
# Capitalise the 'text' and 'candidate' column labels
ind_post_data_2020_df = ind_post_data_2020_df.rename(
    {'text': 'Text', 'candidate': 'Candidate'},
    axis='columns',
)

# Preview the renamed frame
ind_post_data_2020_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.0,0.0,0.0,1.0,#Elecciones2020 | En #Florida: #JoeBiden dice ...,democrat,2020-01-01
1,0.5905,0.071,0.0,0.929,"#Trump: As a student I used to hear for years,...",republican,2020-01-01
2,0.0,0.0,0.0,1.0,You get a tie! And you get a tie! #Trump ‘s ra...,republican,2020-01-01
3,-0.4912,0.0,0.126,0.874,@CLady62 Her 15 minutes were over long time ag...,republican,2020-01-01
4,-0.2617,0.056,0.078,0.866,@DeeviousDenise @realDonaldTrump @nypost There...,republican,2020-01-01


In [13]:
# Capitalise the party labels in 'Candidate'; values not listed in the mapping are left unchanged
ind_post_data_2020_df = ind_post_data_2020_df.assign(
    Candidate=ind_post_data_2020_df['Candidate'].replace(
        {'democrat': 'Democrat', 'republican': 'Republican'}
    )
)

# Preview the relabelled frame
ind_post_data_2020_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.0,0.0,0.0,1.0,#Elecciones2020 | En #Florida: #JoeBiden dice ...,Democrat,2020-01-01
1,0.5905,0.071,0.0,0.929,"#Trump: As a student I used to hear for years,...",Republican,2020-01-01
2,0.0,0.0,0.0,1.0,You get a tie! And you get a tie! #Trump ‘s ra...,Republican,2020-01-01
3,-0.4912,0.0,0.126,0.874,@CLady62 Her 15 minutes were over long time ag...,Republican,2020-01-01
4,-0.2617,0.056,0.078,0.866,@DeeviousDenise @realDonaldTrump @nypost There...,Republican,2020-01-01


##### 2024 Dataset cleaning and preparation for merge

In [14]:
# Load the 2024 per-post sentiment file into a DataFrame
ind_post_data_2024_df = pd.read_csv("../CSV_Outputs/sent_and_cand_by_post_2024.csv")

# Preview the first five rows
ind_post_data_2024_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,token_while,token_much,token_of,token_the,token_was,...,token_north,token_carolina,token_gov,token_roy,token_rumored,token_short,token_list,token_possible,token_mates,candidate
0,0.9713,0.247,0.023,0.729,(ThyBlackMan.com) While much of the U.S. was f...,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,republican
1,-0.2732,0.047,0.112,0.841,"President Trump says under his administration,...",,,,0.0,,...,,,,,,,,,,republican
2,0.0,0.0,0.0,1.0,Gunman Thomas Matthew Crooks and veteran Bill ...,,,,0.0,,...,,,,,,,,,,
3,0.4588,0.107,0.0,0.893,"Now that Biden is out of the race, Chairperson...",,,0.0,0.0,,...,,,,,,,,,,democrat
4,0.7351,0.177,0.0,0.823,Elected officials from San Diego County reacte...,0.0,,0.0,0.0,,...,,,,,,,,,,republican


In [17]:
# Drop columns that start with 'token_'
columns_to_drop = [col for col in ind_post_data_2024_df.columns if col.startswith('token_')]
ind_post_data_2024_df = ind_post_data_2024_df.drop(columns=columns_to_drop)

# Keep only rows whose 'candidate' value is 'republican' or 'democrat'.
# The labels are still lowercase at this stage; rows with any other value,
# including a missing one, are removed.
# The explicit .copy() makes the filtered result an independent frame, so adding
# or overwriting columns on it later does not raise pandas' SettingWithCopyWarning.
valid_candidates = ['republican', 'democrat']
ind_post_data_2024_df = ind_post_data_2024_df[
    ind_post_data_2024_df['candidate'].isin(valid_candidates)
].copy()

# Verify the result
ind_post_data_2024_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,candidate,Year
0,0.9713,0.247,0.023,0.729,(ThyBlackMan.com) While much of the U.S. was f...,republican,2024-01-01
1,-0.2732,0.047,0.112,0.841,"President Trump says under his administration,...",republican,2024-01-01
3,0.4588,0.107,0.0,0.893,"Now that Biden is out of the race, Chairperson...",democrat,2024-01-01
4,0.7351,0.177,0.0,0.823,Elected officials from San Diego County reacte...,republican,2024-01-01
6,0.5267,0.121,0.039,0.84,(marketscreener.com) The dollar eased on Monda...,republican,2024-01-01


In [18]:
# Add a 'Year' column holding the same timestamp on every row
# (parsing '2024' with format '%Y' yields 2024-01-01).
# .assign() returns a new frame instead of writing into the existing one, so no
# SettingWithCopyWarning is raised even when the frame was produced by filtering
# another frame.
ind_post_data_2024_df = ind_post_data_2024_df.assign(
    Year=pd.to_datetime('2024', format='%Y')
)

# Verify the result
ind_post_data_2024_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,candidate,Year
0,0.9713,0.247,0.023,0.729,(ThyBlackMan.com) While much of the U.S. was f...,republican,2024-01-01
1,-0.2732,0.047,0.112,0.841,"President Trump says under his administration,...",republican,2024-01-01
3,0.4588,0.107,0.0,0.893,"Now that Biden is out of the race, Chairperson...",democrat,2024-01-01
4,0.7351,0.177,0.0,0.823,Elected officials from San Diego County reacte...,republican,2024-01-01
6,0.5267,0.121,0.039,0.84,(marketscreener.com) The dollar eased on Monda...,republican,2024-01-01


In [19]:
# Capitalise the 'text' and 'candidate' column labels
ind_post_data_2024_df = ind_post_data_2024_df.rename(
    {'text': 'Text', 'candidate': 'Candidate'},
    axis='columns',
)

# Preview the renamed frame
ind_post_data_2024_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.9713,0.247,0.023,0.729,(ThyBlackMan.com) While much of the U.S. was f...,republican,2024-01-01
1,-0.2732,0.047,0.112,0.841,"President Trump says under his administration,...",republican,2024-01-01
3,0.4588,0.107,0.0,0.893,"Now that Biden is out of the race, Chairperson...",democrat,2024-01-01
4,0.7351,0.177,0.0,0.823,Elected officials from San Diego County reacte...,republican,2024-01-01
6,0.5267,0.121,0.039,0.84,(marketscreener.com) The dollar eased on Monda...,republican,2024-01-01


In [20]:
# Capitalise the party labels in 'Candidate'; values not listed in the mapping are left unchanged
ind_post_data_2024_df = ind_post_data_2024_df.assign(
    Candidate=ind_post_data_2024_df['Candidate'].replace(
        {'democrat': 'Democrat', 'republican': 'Republican'}
    )
)

# Preview the relabelled frame
ind_post_data_2024_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.9713,0.247,0.023,0.729,(ThyBlackMan.com) While much of the U.S. was f...,Republican,2024-01-01
1,-0.2732,0.047,0.112,0.841,"President Trump says under his administration,...",Republican,2024-01-01
3,0.4588,0.107,0.0,0.893,"Now that Biden is out of the race, Chairperson...",Democrat,2024-01-01
4,0.7351,0.177,0.0,0.823,Elected officials from San Diego County reacte...,Republican,2024-01-01
6,0.5267,0.121,0.039,0.84,(marketscreener.com) The dollar eased on Monda...,Republican,2024-01-01


##### Concatenate the 3 dataframes into a single dataframe

In [23]:
# Stack the 2016, 2020 and 2024 frames row-wise; ignore_index=True renumbers
# the rows of the result from 0
final_ind_post_data_df = pd.concat(
    [
        ind_post_data_2016_df,
        ind_post_data_2020_df,
        ind_post_data_2024_df,
    ],
    axis=0,
    ignore_index=True,
)

# Report the combined row count, then preview the first rows
print(f"Total rows in combined DataFrame: {len(final_ind_post_data_df)}")
final_ind_post_data_df.head()

Total rows in combined DataFrame: 258793


Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,Democrat,2016-01-01
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",Democrat,2016-01-01
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,Democrat,2016-01-01
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",Democrat,2016-01-01
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,Democrat,2016-01-01


In [24]:
# Write the combined per-post DataFrame to disk; index=False leaves the row index out of the file
final_ind_post_data_df.to_csv(
    '../CSV_Outputs/final_ind_post_data_df.csv',
    index=False,
)

### 2. Merging of the summary CSVs which contain the average sentiment scores per candidate for the three years of available data.

##### 2016 Dataset cleaning and preparation for merge

In [26]:
# Load the 2016 summary file into a DataFrame
summary_data_2016_df = pd.read_csv("../CSV_Outputs/sent_info_for_db_2016.csv")

# Preview the first five rows
summary_data_2016_df.head()

Unnamed: 0,id,text,time,candidate,average_sentiment
0,780925634159796224,The question in this election: Who can put the...,2016,Democrat,0.364082
1,780916180899037184,"Last night, Donald Trump said not paying taxes...",2016,Democrat,0.364082
2,780911564857761793,Couldn't be more proud of @HillaryClinton. Her...,2016,Democrat,0.364082
3,780907038650068994,"If we stand together, there's nothing we can't...",2016,Democrat,0.364082
4,780897419462602752,Both candidates were asked about how they'd co...,2016,Democrat,0.364082


##### 2020 Dataset cleaning and preparation for merge

In [28]:
# Load the 2020 summary file into a DataFrame
summary_data_2020_df = pd.read_csv("../CSV_Outputs/sent_info_for_db_2020.csv")

# Preview the first five rows
summary_data_2020_df.head()

Unnamed: 0,id,text,time,candidate,average_sentiment
0,1.316529e+18,#Elecciones2020 | En #Florida: #JoeBiden dice ...,2020,democrat,0.00162
1,1.316529e+18,"#Trump: As a student I used to hear for years,...",2020,republican,-0.002182
2,1.316529e+18,You get a tie! And you get a tie! #Trump ‘s ra...,2020,republican,-0.002182
3,1.316529e+18,@CLady62 Her 15 minutes were over long time ag...,2020,republican,-0.002182
4,1.316529e+18,@DeeviousDenise @realDonaldTrump @nypost There...,2020,republican,-0.002182


##### 2024 Dataset cleaning and preparation for merge

In [29]:
# Load the 2024 summary file into a DataFrame
summary_data_2024_df = pd.read_csv("../CSV_Outputs/sent_info_for_db_2024.csv")

# Preview the first five rows
summary_data_2024_df.head()

Unnamed: 0,id,text,time,candidate,average_sentiment
0,0,(ThyBlackMan.com) While much of the U.S. was f...,2024,republican,0.45169
1,1,"President Trump says under his administration,...",2024,republican,0.45169
2,4,"Now that Biden is out of the race, Chairperson...",2024,democrat,0.378815
3,5,Elected officials from San Diego County reacte...,2024,republican,0.45169
4,7,(marketscreener.com) The dollar eased on Monda...,2024,republican,0.45169


In [None]:
##### Concatenate the 3 dataframes into a single dataframe