In [1]:
# import dependencies

import os
import csv
import pandas as pd

### Merging of the detailed CSVs which contain individual posts, their corresponding positive, negative and neutral sentiment scores, and the candidate the sentiment refers to.

##### 2016 Dataset cleaning and preparation for merge

In [2]:
# Load the 2016 per-post sentiment/candidate CSV into a DataFrame
csv_path_2016 = "../CSV_Outputs/sent_and_cand_by_post_2016.csv"
ind_post_data_2016_df = pd.read_csv(csv_path_2016)

# Preview the first five rows
ind_post_data_2016_df.head(5)

Unnamed: 0,Compound,Positive,Negative,Neutral,text,token_the,token_question,token_in,token_this,token_election,...,token_illustrates,token_duh,token_lilredfrmkokomo,token_lowell,token_iloveidevices,token_minimizing,token_dependency,token_salriccobono,token_troyconway,candidate
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,Democrat
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",,,,,,...,,,,,,,,,,Democrat
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,,,,,,...,,,,,,,,,,Democrat
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",,,,,,...,,,,,,,,,,Democrat
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,,,,,,...,,,,,,,,,,Democrat


In [3]:
# Collect the names of all columns carrying the 'token_' prefix ...
columns_to_drop = list(
    filter(lambda name: name.startswith('token_'), ind_post_data_2016_df.columns)
)
# ... and remove them from the 2016 frame
ind_post_data_2016_df = ind_post_data_2016_df.drop(columns_to_drop, axis=1)

# Preview the remaining columns
ind_post_data_2016_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,candidate
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,Democrat
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",Democrat
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,Democrat
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",Democrat
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,Democrat


In [6]:
# Parse '2016' as a year-only date, then stamp every row with it in a new 'Year' column
year_2016 = pd.to_datetime('2016', format='%Y')
ind_post_data_2016_df['Year'] = year_2016

# Preview to confirm the new column
ind_post_data_2016_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,Democrat,2016-01-01
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",Democrat,2016-01-01
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,Democrat,2016-01-01
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",Democrat,2016-01-01
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,Democrat,2016-01-01


In [7]:
# Capitalise the 'text' and 'candidate' column labels
column_renames_2016 = {'text': 'Text', 'candidate': 'Candidate'}
ind_post_data_2016_df = ind_post_data_2016_df.rename(columns=column_renames_2016)

# Preview the renamed frame
ind_post_data_2016_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.4404,0.139,0.0,0.861,The question in this election: Who can put the...,Democrat,2016-01-01
1,0.0,0.0,0.0,1.0,"Last night, Donald Trump said not paying taxes...",Democrat,2016-01-01
2,0.185,0.165,0.102,0.733,Couldn't be more proud of @HillaryClinton. Her...,Democrat,2016-01-01
3,0.1376,0.128,0.101,0.771,"If we stand together, there's nothing we can't...",Democrat,2016-01-01
4,-0.6597,0.0,0.278,0.722,Both candidates were asked about how they'd co...,Democrat,2016-01-01


##### 2020 Dataset cleaning and preparation for merge

In [8]:
# Load the 2020 per-post sentiment/candidate CSV into a DataFrame
csv_path_2020 = "../CSV_Outputs/sent_and_cand_by_post_2020.csv"
ind_post_data_2020_df = pd.read_csv(csv_path_2020)

# Preview the first five rows
ind_post_data_2020_df.head(5)

Unnamed: 0,id,handle,text,time,candidate,unique_id,Compound,Positive,Negative,Neutral
0,1.316529e+18,elsollatinonews,#Elecciones2020 | En #Florida: #JoeBiden dice ...,2020-10-15 00:00:01,democrat,0,0.0,0.0,0.0,1.0
1,1.316529e+18,snarke,"#Trump: As a student I used to hear for years,...",2020-10-15 00:00:02,republican,1,0.5905,0.071,0.0,0.929
2,1.316529e+18,Ranaabtar,You get a tie! And you get a tie! #Trump ‘s ra...,2020-10-15 00:00:08,republican,2,0.0,0.0,0.0,1.0
3,1.316529e+18,FarrisFlagg,@CLady62 Her 15 minutes were over long time ag...,2020-10-15 00:00:17,republican,3,-0.4912,0.0,0.126,0.874
4,1.316529e+18,sm_gulledge,@DeeviousDenise @realDonaldTrump @nypost There...,2020-10-15 00:00:18,republican,4,-0.2617,0.056,0.078,0.866


In [9]:
# Columns to keep, listed in the order they should appear
desired_columns = ['Compound', 'Positive', 'Negative', 'Neutral', 'text', 'candidate']

# Select and reorder the DataFrame columns.
# The explicit .copy() makes the result an independent frame rather than a
# selection of the originally loaded DataFrame, so the column assignments made
# in later cells cannot trigger pandas' SettingWithCopyWarning.
ind_post_data_2020_df = ind_post_data_2020_df[desired_columns].copy()

# Display the resulting DataFrame
ind_post_data_2020_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,candidate
0,0.0,0.0,0.0,1.0,#Elecciones2020 | En #Florida: #JoeBiden dice ...,democrat
1,0.5905,0.071,0.0,0.929,"#Trump: As a student I used to hear for years,...",republican
2,0.0,0.0,0.0,1.0,You get a tie! And you get a tie! #Trump ‘s ra...,republican
3,-0.4912,0.0,0.126,0.874,@CLady62 Her 15 minutes were over long time ag...,republican
4,-0.2617,0.056,0.078,0.866,@DeeviousDenise @realDonaldTrump @nypost There...,republican


In [11]:
# Parse '2020' as a year-only date, then stamp every row with it in a new 'Year' column
year_2020 = pd.to_datetime('2020', format='%Y')
ind_post_data_2020_df['Year'] = year_2020

# Preview to confirm the new column
ind_post_data_2020_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,candidate,Year
0,0.0,0.0,0.0,1.0,#Elecciones2020 | En #Florida: #JoeBiden dice ...,democrat,2020-01-01
1,0.5905,0.071,0.0,0.929,"#Trump: As a student I used to hear for years,...",republican,2020-01-01
2,0.0,0.0,0.0,1.0,You get a tie! And you get a tie! #Trump ‘s ra...,republican,2020-01-01
3,-0.4912,0.0,0.126,0.874,@CLady62 Her 15 minutes were over long time ag...,republican,2020-01-01
4,-0.2617,0.056,0.078,0.866,@DeeviousDenise @realDonaldTrump @nypost There...,republican,2020-01-01


In [12]:
# Capitalise the 'text' and 'candidate' column labels
column_renames_2020 = {'text': 'Text', 'candidate': 'Candidate'}
ind_post_data_2020_df = ind_post_data_2020_df.rename(columns=column_renames_2020)

# Preview the renamed frame
ind_post_data_2020_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.0,0.0,0.0,1.0,#Elecciones2020 | En #Florida: #JoeBiden dice ...,democrat,2020-01-01
1,0.5905,0.071,0.0,0.929,"#Trump: As a student I used to hear for years,...",republican,2020-01-01
2,0.0,0.0,0.0,1.0,You get a tie! And you get a tie! #Trump ‘s ra...,republican,2020-01-01
3,-0.4912,0.0,0.126,0.874,@CLady62 Her 15 minutes were over long time ag...,republican,2020-01-01
4,-0.2617,0.056,0.078,0.866,@DeeviousDenise @realDonaldTrump @nypost There...,republican,2020-01-01


In [13]:
# Map the lowercase labels in 'Candidate' to their capitalised forms;
# values not listed in the mapping are left unchanged by replace()
candidate_labels = {'democrat': 'Democrat', 'republican': 'Republican'}
ind_post_data_2020_df['Candidate'] = ind_post_data_2020_df['Candidate'].replace(candidate_labels)

# Preview the relabelled frame
ind_post_data_2020_df.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,Candidate,Year
0,0.0,0.0,0.0,1.0,#Elecciones2020 | En #Florida: #JoeBiden dice ...,Democrat,2020-01-01
1,0.5905,0.071,0.0,0.929,"#Trump: As a student I used to hear for years,...",Republican,2020-01-01
2,0.0,0.0,0.0,1.0,You get a tie! And you get a tie! #Trump ‘s ra...,Republican,2020-01-01
3,-0.4912,0.0,0.126,0.874,@CLady62 Her 15 minutes were over long time ag...,Republican,2020-01-01
4,-0.2617,0.056,0.078,0.866,@DeeviousDenise @realDonaldTrump @nypost There...,Republican,2020-01-01


##### 2024 Dataset cleaning and preparation for merge

In [None]:
# Load the 2024 per-post sentiment/candidate CSV into a DataFrame
csv_path_2024 = "../CSV_Outputs/sent_and_cand_by_post_2024.csv"
ind_post_data_2024_df = pd.read_csv(csv_path_2024)

# Preview the first five rows
ind_post_data_2024_df.head(5)

In [None]:
# Drop columns that start with 'token_' from the 2024 frame.
# This cell sits in the 2024 section, so it must operate on
# ind_post_data_2024_df, not on the 2016 frame.
# If no column carries the prefix, the list is empty and drop() is a no-op.
columns_to_drop = [col for col in ind_post_data_2024_df.columns if col.startswith('token_')]
ind_post_data_2024_df = ind_post_data_2024_df.drop(columns=columns_to_drop)

# Verify the result
print(f"Columns after dropping: {ind_post_data_2024_df.columns.tolist()}")
ind_post_data_2024_df.head()

In [None]:
# Add a column 'Year' with a date value for each row of the 2024 frame.
# This cell sits in the 2024 section, so it must stamp ind_post_data_2024_df
# with the year 2024, not rewrite the 2016 frame.
ind_post_data_2024_df['Year'] = pd.to_datetime('2024', format='%Y')

# Verify the result
print(f"Columns after adding 'Year': {ind_post_data_2024_df.columns.tolist()}")
ind_post_data_2024_df.head()