## Data Pre-Processing

In [1]:
import json

In [2]:
def extract_data(filename):
    """Load and return the parsed contents of a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON value.
    """
    # Pin the encoding: without it open() decodes with the platform locale
    # codec, which can mis-read or reject non-ASCII text in UTF-8 JSON files.
    with open(filename, encoding="utf-8") as f:
        return json.load(f)

In [3]:
def get_posts_titles(data):
    """Keep only the ``selftext`` and ``title`` fields of every post.

    Args:
        data: Mapping of state name -> list of raw post dicts, each holding
            at least ``"selftext"`` and ``"title"``.

    Returns:
        Mapping of state name -> list of ``{"selftext", "title"}`` dicts, in
        the same order as the input posts.
    """
    return {
        state: [
            {"selftext": raw["selftext"], "title": raw["title"]}
            for raw in raw_posts
        ]
        for state, raw_posts in data.items()
    }

In [4]:
def get_comments(data):
    """Flatten every state's comments into one list of ``{"body": ...}`` dicts.

    Args:
        data: Mapping of state name -> mapping of group key -> list of raw
            comment dicts with a ``"body"`` field (the group key is presumably
            a post id; it is not used here).

    Returns:
        Mapping of state name -> flat list of ``{"body": ...}`` dicts, groups
        concatenated in iteration order.
    """
    return {
        state: [
            {"body": raw["body"]}
            for thread in by_group.values()
            for raw in thread
        ]
        for state, by_group in data.items()
    }

In [5]:
# Load the raw post and comment dumps collected after and before the election.
# Note: the comment files are the SAMPLE subsets (see their filenames), not the
# full comment data. Paths are relative to the notebook's working directory.
raw_after_posts = extract_data('data/after_election_posts_data.json')
raw_after_comments = extract_data('data/after_election_SAMPLE_comments_data.json')
raw_before_posts = extract_data('data/before_election_posts_data.json')
raw_before_comments = extract_data('data/before_election_SAMPLE_comments_data.json')

In [6]:
# extract just the useful data from both posts and comments
after_posts = get_posts_titles(raw_after_posts)
before_posts = get_posts_titles(raw_before_posts)

In [7]:
after_comments = get_comments(raw_after_comments)
before_comments = get_comments(raw_before_comments)

## Filter political posts

In [8]:
# list of words to look for when filtering political posts
keywords = extract_data('data/keywords/keywords.json')

In [9]:
keywords

{'economy': ['prices',
  'inflation',
  'triggering',
  'stock',
  'scores',
  'expensive',
  'felt',
  'rates',
  'respondents',
  'professional',
  'economic',
  'stamps',
  'repercussions',
  'roxanne',
  'afloat',
  'aquaculture',
  'priced',
  'manageable',
  'succumbing',
  'necessities'],
 'democracy': ['siphoned',
  'mudde',
  'legalized',
  'hence',
  'capitalism',
  'rectify',
  'columns',
  'newest',
  'reich',
  'acquisitions',
  'robertreich',
  'stalled',
  'nausea',
  'misallocation',
  'inbox',
  'rigs',
  'cas',
  'eldercare',
  'summoned',
  'corporations'],
 'security': ['social',
  'shortfall',
  'finances',
  'insolvency',
  '2031',
  'footing',
  'mismanage',
  'proposing',
  'waving',
  'trustees',
  'insolvent',
  'modernize',
  'holland',
  'ss',
  '2033',
  'breck',
  'dumas',
  'angrily',
  'buckle',
  'oasi'],
 'immigration': ['immigrants',
  'policies',
  'border',
  'forgiveness',
  'country',
  'about',
  'sues',
  'migrants',
  'illegally',
  'wanted',
 

In [15]:
# Per-topic keywords to drop from `keywords` (hand-picked; filter_kws removes
# every word listed here from the matching topic).
# NOTE(review): "sections" is listed under both 'healthcare' and 'abortion'.
words_to_remove = {
    'economy': ["triggering", "felt", "afloat", "manageable", "succumbing"],
    'democracy': ["mudde", "hence", "newest", "stalled", "nausea", "rigs", "cas", "summoned"],
    'security': ["footing", "proposing", "holland", "angrily", "buckle"],
    'immigration': ["forgiveness", "country", "about", "wanted", "harris", "said", "did", "from", "trump", "on", "for", "pressed", "mass"],
    'education': ["ferial", "flopped", "oriented", "greenlight", "partially", "sparsely"],
    'healthcare': ["creators", "1tn", "concepts", "blowtorch", "replacing", "boot", "sections", "incorporated", "takers", "empower"],
    'abortion': ["exceptions", "procedure", "obtain", "care", "guaranteeing", "moderates", "sections", "callous"]
}

In [13]:
# Per-topic keyword substitutions: {topic: {original keyword: replacement}}.
# filter_kws removes the original keyword from the topic's list and appends
# the replacement.
words_to_replace = {
    'economy': {
        'roxanne': 'roxanne persaud'
    },
    'security': {
        'ss': 'ssa'
    },
    'education': {
        'betsy': 'betsy devos'
    }
}

In [18]:
def filter_kws(kws=None, to_remove=None, to_replace=None):
    """Build the cleaned per-topic keyword lists.

    For each topic, drops the words listed for that topic in ``to_remove``
    and then applies every substitution listed for it in ``to_replace``
    (the original word is removed and the replacement appended at the end).

    Args:
        kws: Mapping of topic -> list of keywords. Defaults to the notebook's
            ``keywords``.
        to_remove: Mapping of topic -> words to drop. Defaults to
            ``words_to_remove``. Topics missing from it lose nothing.
        to_replace: Mapping of topic -> {original: replacement}. Defaults to
            ``words_to_replace``.

    Returns:
        A new mapping of topic -> filtered keyword list; the inputs are not
        modified.
    """
    kws = keywords if kws is None else kws
    to_remove = words_to_remove if to_remove is None else to_remove
    to_replace = words_to_replace if to_replace is None else to_replace

    filtered_kws = dict()
    for topic, words in kws.items():
        dropped = set(to_remove.get(topic, ()))
        kept = [wd for wd in words if wd not in dropped]

        # Apply *all* substitutions for the topic, and skip any whose original
        # word is not present instead of raising ValueError.
        for og_word, new_word in to_replace.get(topic, {}).items():
            if og_word in kept:
                kept.remove(og_word)
                kept.append(new_word)

        filtered_kws[topic] = kept
    return filtered_kws

In [28]:
filtered_kws = filter_kws()
filtered_kws

{'economy': ['prices',
  'inflation',
  'stock',
  'scores',
  'expensive',
  'rates',
  'respondents',
  'professional',
  'economic',
  'stamps',
  'repercussions',
  'aquaculture',
  'priced',
  'necessities',
  'roxanne persaud'],
 'democracy': ['siphoned',
  'legalized',
  'capitalism',
  'rectify',
  'columns',
  'reich',
  'acquisitions',
  'robertreich',
  'misallocation',
  'inbox',
  'eldercare',
  'corporations'],
 'security': ['social',
  'shortfall',
  'finances',
  'insolvency',
  '2031',
  'mismanage',
  'waving',
  'trustees',
  'insolvent',
  'modernize',
  '2033',
  'breck',
  'dumas',
  'oasi',
  'ssa'],
 'immigration': ['immigrants',
  'policies',
  'border',
  'sues',
  'migrants',
  'illegally',
  'illegal'],
 'education': ['pearson',
  'indoctrination',
  'classrooms',
  'devos',
  '529',
  'futureed',
  'subsidize',
  'absenteeism',
  'charters',
  'schreiner',
  'petrilli',
  'fordham',
  'sidelined',
  'betsy devos'],
 'healthcare': ['schar',
  'outpolls',
  '

In [35]:
# group filtered political posts 
def group_political_posts(posts, kws=None):
    """Group each state's posts by the political topics they mention.

    A post belongs to a topic when any of the topic's keywords occurs as a
    substring of the lower-cased ``selftext`` or ``title``. Matching is plain
    substring matching, so short keywords also match inside longer words.

    Args:
        posts: Mapping of state -> list of ``{"selftext", "title"}`` dicts.
        kws: Mapping of topic -> list of lower-case keywords. Defaults to the
            notebook's ``filtered_kws``.

    Returns:
        Mapping of state -> topic -> list of unique ``"<selftext> <title>"``
        strings (order unspecified). Every topic is present for every state,
        including states that have no posts.
    """
    kws = filtered_kws if kws is None else kws
    grouped_posts = dict()

    for state, state_posts in posts.items():
        # Lower-case each post once rather than once per keyword.
        prepared = [
            (
                post["selftext"] + " " + post["title"],
                post["selftext"].lower(),
                post["title"].lower(),
            )
            for post in state_posts
        ]
        # Building the topic dict outside the post loop guarantees a key for
        # every topic even when the state has no posts.
        grouped_posts[state] = {
            topic: list({
                text
                for text, body, title in prepared
                if any(word in body or word in title for word in words)
            })
            for topic, words in kws.items()
        }
    return grouped_posts

In [36]:
grouped_posts_after = group_political_posts(after_posts)
grouped_posts_before = group_political_posts(before_posts)

In [37]:
# group filtered political comments 
def group_political_comments(comments, kws=None):
    """Group each state's comments by the political topics they mention.

    A comment belongs to a topic when any of the topic's keywords occurs as a
    substring of the lower-cased ``body``.

    Args:
        comments: Mapping of state -> list of ``{"body": ...}`` dicts.
        kws: Mapping of topic -> list of lower-case keywords. Defaults to the
            notebook's ``filtered_kws``.

    Returns:
        Mapping of state -> topic -> list of unique comment bodies (order
        unspecified). Every topic is present for every state, including
        states that have no comments.
    """
    kws = filtered_kws if kws is None else kws
    grouped_comments = dict()

    for state, state_comments in comments.items():
        # Lower-case each comment once rather than once per keyword.
        prepared = [(c["body"], c["body"].lower()) for c in state_comments]
        # Building the topic dict outside the comment loop guarantees a key
        # for every topic even when the state has no comments.
        grouped_comments[state] = {
            topic: list({
                body
                for body, lowered in prepared
                if any(word in lowered for word in words)
            })
            for topic, words in kws.items()
        }
    return grouped_comments

In [38]:
grouped_comments_after = group_political_comments(after_comments)
grouped_comments_before = group_political_comments(before_comments)

In [39]:
# print out size of grouped posts per state - for testing
def count_grouped_posts(grouped_posts):
    """Return how many items each state holds per topic.

    Args:
        grouped_posts: Mapping of state -> topic -> list of items.

    Returns:
        Mapping of state -> topic -> ``len`` of the corresponding list.
    """
    return {
        state: {topic: len(items) for topic, items in by_topic.items()}
        for state, by_topic in grouped_posts.items()
    }

In [40]:
count_grouped_posts(grouped_posts_after)

{'california': {'economy': 5,
  'democracy': 0,
  'security': 1,
  'immigration': 5,
  'education': 0,
  'healthcare': 0,
  'abortion': 6},
 'michigan': {'economy': 11,
  'democracy': 0,
  'security': 16,
  'immigration': 10,
  'education': 3,
  'healthcare': 2,
  'abortion': 18},
 'colorado': {'economy': 2,
  'democracy': 0,
  'security': 4,
  'immigration': 1,
  'education': 0,
  'healthcare': 0,
  'abortion': 4},
 'oregon': {'economy': 12,
  'democracy': 1,
  'security': 8,
  'immigration': 16,
  'education': 0,
  'healthcare': 0,
  'abortion': 23},
 'hawaii': {'economy': 10,
  'democracy': 0,
  'security': 5,
  'immigration': 8,
  'education': 0,
  'healthcare': 6,
  'abortion': 19},
 'oklahoma': {'economy': 3,
  'democracy': 0,
  'security': 7,
  'immigration': 7,
  'education': 2,
  'healthcare': 1,
  'abortion': 12},
 'maryland': {'economy': 14,
  'democracy': 0,
  'security': 14,
  'immigration': 11,
  'education': 1,
  'healthcare': 6,
  'abortion': 31},
 'arizona': {'economy'

In [41]:
count_grouped_posts(grouped_comments_after)

{'nevada': {'economy': 87,
  'democracy': 5,
  'security': 54,
  'immigration': 84,
  'education': 2,
  'healthcare': 11,
  'abortion': 116},
 'wyoming': {'economy': 86,
  'democracy': 2,
  'security': 54,
  'immigration': 53,
  'education': 3,
  'healthcare': 18,
  'abortion': 240}}

## Filter posts by candidate

In [None]:
# Lower-case search terms per candidate, matched as substrings of lower-cased
# post/comment text by the filter_candidate_* functions.
# NOTE(review): because matching is by substring, "donald trump" and
# "kamala harris" can never match anything "trump" / "harris" does not.
candidate_keywords = {
    "trump": ["trump", "donald", "donald trump", "republican"],
    "harris": ["harris", "kamala", "kamala harris", "democrat"]
}

In [None]:
# filter for candidate posts
def filter_candidate_posts(all_posts, cand_kws=None):
    """Collect, per state and candidate, the posts that mention the candidate.

    A post mentions a candidate when any of that candidate's keywords occurs
    as a substring of the lower-cased ``selftext`` or ``title``.

    Args:
        all_posts: Mapping of state -> list of ``{"selftext", "title"}`` dicts.
        cand_kws: Mapping of candidate -> list of lower-case keywords.
            Defaults to the notebook's ``candidate_keywords``.

    Returns:
        Mapping of state -> candidate -> list of unique
        ``"<selftext> <title>"`` strings (order unspecified).
    """
    cand_kws = candidate_keywords if cand_kws is None else cand_kws
    candidate_posts = dict()

    for state, state_posts in all_posts.items():
        candidate_posts[state] = dict()

        for candidate, words in cand_kws.items():
            # One set per candidate, shared by all of its keywords. Creating
            # it per keyword would discard every match except those of the
            # last keyword in the list.
            matched = set()
            for post in state_posts:
                body = post["selftext"].lower()
                title = post["title"].lower()
                if any(w in body or w in title for w in words):
                    matched.add(post["selftext"] + " " + post["title"])

            candidate_posts[state][candidate] = list(matched)
    return candidate_posts

In [None]:
after_candidate_posts = filter_candidate_posts(after_posts)
before_candidate_posts = filter_candidate_posts(before_posts)

In [None]:
# print out size of political posts per state - for testing
def count_candidate_posts(candidate_posts):
    """Return how many items each state holds per candidate.

    The candidates are taken from the data itself rather than from the global
    ``candidate_keywords``, so the function also works on results produced
    with a different keyword mapping or loaded from disk.

    Args:
        candidate_posts: Mapping of state -> candidate -> list of items.

    Returns:
        Mapping of state -> candidate -> ``len`` of the corresponding list.
    """
    return {
        state: {candidate: len(items) for candidate, items in by_candidate.items()}
        for state, by_candidate in candidate_posts.items()
    }

In [None]:
count_candidate_posts(after_candidate_posts)

In [None]:
count_candidate_posts(before_candidate_posts)

In [None]:
# filter for candidate comments
def filter_candidate_comments(all_comments, cand_kws=None):
    """Collect, per state and candidate, the comments that mention the candidate.

    A comment mentions a candidate when any of that candidate's keywords
    occurs as a substring of the lower-cased ``body``.

    Args:
        all_comments: Mapping of state -> list of ``{"body": ...}`` dicts.
        cand_kws: Mapping of candidate -> list of lower-case keywords.
            Defaults to the notebook's ``candidate_keywords``.

    Returns:
        Mapping of state -> candidate -> list of unique comment bodies
        (order unspecified).
    """
    cand_kws = candidate_keywords if cand_kws is None else cand_kws
    candidate_comments = dict()

    for state, state_comments in all_comments.items():
        candidate_comments[state] = dict()

        for candidate, words in cand_kws.items():
            # One set per candidate, shared by all of its keywords. Creating
            # it per keyword would discard every match except those of the
            # last keyword in the list.
            matched = set()
            for comment in state_comments:
                lowered = comment["body"].lower()
                if any(w in lowered for w in words):
                    matched.add(comment["body"])

            candidate_comments[state][candidate] = list(matched)
    return candidate_comments

In [None]:
after_candidate_comments = filter_candidate_comments(after_comments)
before_candidate_comments = filter_candidate_comments(before_comments)

In [None]:
count_candidate_posts(after_candidate_comments)

In [None]:
count_candidate_posts(before_candidate_comments)

In [None]:
# DATA FORMAT
# <after/before>_political_posts contains posts that contain at least one of our defined keywords,
# after or before the election respectively.

# <after/before>_political_posts = {
#    'texas': [
#        {
#            'selftext': "__",
#            'title': "__"
#        },
#        {
#            'selftext': "__",
#            'title': "__"}
#        ...
#    ],
#    'california': [
#        {
#            'selftext': "__",
#            'title': "__"
#        },
#        ...
#    ]
# }

# <after/before>_political_comments contains comments that contain at least one of our defined keywords,
# after or before the election respectively.

# <after/before>_political_comments = {
#    'texas': [
#        {
#            'body': "__"
#        },
#        {
#            'body': "__"
#        },
#        ...
#    ],
#    'california': [
#        {
#            'body': "__"
#        },
#        ...
#    ]
# }

# <after/before>_candidate_<posts/comments> contains posts/comments that contain words about each candidate (as defined in candidate_keywords),
# after or before the election respectively.

# <after/before>_candidate_<posts/comments> = {
#    'texas': {
#        'trump': [__, __],
#        'harris': [__, __],
#    },
#    'california': {
#        'trump': [__, __],
#        'harris': [__, __],
#    },
#    ...
# }

## Sentiment Analysis

In [44]:
# sentiment analysis
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import torch

In [None]:
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
## TEST - must use PyTorch version, the tensorflow one is not as accurate
text = "I love you!"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
print(ranking)
print(config.id2label)

for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

In [None]:
# compute highest sentiment score and label for given text
def get_sentiment_label_score(text):
    """Return the most likely sentiment label for ``text`` and its probability.

    Uses the notebook-level ``tokenizer``, ``model`` and ``config``.

    Args:
        text: The string to classify. Input is truncated to 512 tokens.

    Returns:
        ``(label, score)`` where ``label`` is looked up in ``config.id2label``
        and ``score`` is the softmax probability of that label.
    """
    encoded_input = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)

    # Inference only: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        output = model(**encoded_input)

    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].numpy())

    # ranking = label indices in decreasing order of score
    ranking = np.argsort(scores)[::-1]

    # return the highest-scoring label together with its score
    l = config.id2label[ranking[0]]
    s = scores[ranking[0]]
    return l, s

## Take sentiment score for each candidate

Get the sentiment scores for each candidate, per state. The result might look like:
```
'texas': {
    'trump': {
        'positive': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'neutral': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       }
       'negative': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'sentiment': 'positive',
       'avg_sentiment': 0.0
    },
    'harris': {
        'positive': {
            ...
       },
       'neutral': {
           ...
       }
       'negative': {
           ...
       },
       'sentiment': 'positive',
       'avg_sentiment': 0.0
    },
}

In [51]:
# helper for creating the json above
def record_sentiment(sent_data, score):
    """Append ``score`` to a sentiment bucket and bump its post counter.

    Args:
        sent_data: Bucket dict, modified in place. ``"scores"`` and
            ``"num_posts"`` are created on first use.
        score: The score to record.
    """
    sent_data.setdefault("scores", []).append(score)
    sent_data["num_posts"] = sent_data.get("num_posts", 0) + 1

In [None]:
def get_sentiment_scores_grouped_posts(grouped_posts):
    """Score every text and bucket the scores by predicted sentiment label.

    Args:
        grouped_posts: Mapping of state -> group (e.g. candidate) -> list of
            text strings.

    Returns:
        Mapping of state -> group -> ``{'positive': {...}, 'neutral': {...},
        'negative': {...}}`` where each bucket is filled by
        ``record_sentiment``. A group with no texts maps to an empty dict.
    """
    all_sentiment = dict()

    for state, by_group in grouped_posts.items():
        state_result = all_sentiment[state] = dict()

        for group, texts in by_group.items():
            buckets = {label: {} for label in ('positive', 'neutral', 'negative')}

            scored = [get_sentiment_label_score(text) for text in texts]
            for label, score in scored:
                record_sentiment(buckets[label], score)

            # Groups without any text keep an empty dict instead of three
            # empty buckets.
            state_result[group] = buckets if scored else dict()
    return all_sentiment

In [None]:
# after_cand_sentiments = get_sentiment_scores_grouped_posts(after_candidate_posts)

In [None]:
# before_cand_sentiments = get_sentiment_scores_grouped_posts(before_candidate_posts)

In [None]:
# after_cand_comment_sentiments = get_sentiment_scores_grouped_posts(after_candidate_comments)

In [None]:
# before_cand_comment_sentiments = get_sentiment_scores_grouped_posts(before_candidate_comments)

In [57]:
# class to make numpy types JSON serializable
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to Python values."""

    def default(self, obj):
        """Convert numpy values; defer everything else to the base encoder."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        # Not a numpy value: the base class raises TypeError for
        # unsupported objects.
        return super().default(obj)

def save_data(filename, data):
    """Write ``data`` to ``filename`` as indented JSON.

    numpy scalars and arrays are converted through ``NumpyEncoder``.

    Args:
        filename: Destination path; an existing file is overwritten.
        data: JSON-serializable value (numpy values allowed).

    Raises:
        TypeError: If ``data`` contains a value that cannot be serialized.
    """
    # Serialize before opening the file, so a serialization error cannot
    # truncate or half-overwrite an existing results file.
    serialized = json.dumps(data, cls=NumpyEncoder, indent=4)
    with open(filename, 'w') as f:
        f.write(serialized)

In [None]:
# save_data('after_cand_post_sentiments.json', after_cand_sentiments)
# save_data('before_cand_post_sentiments.json', before_cand_sentiments)
# save_data('after_cand_comment_sentiments.json', after_cand_comment_sentiments)
# save_data('before_cand_comment_sentiments.json', before_cand_comment_sentiments)

In [None]:
after_cand_sentiments = extract_data('after_cand_post_sentiments.json')
before_cand_sentiments = extract_data('before_cand_post_sentiments.json')
after_cand_comment_sentiments = extract_data('after_cand_comment_sentiments.json')
before_cand_comment_sentiments = extract_data('before_cand_comment_sentiments.json')

In [None]:
# get average, min, max, and dominant sentiment for each state
def get_sentiment_stats(all_sentiment):
    """Add min/max/average per sentiment bucket and pick the dominant one.

    The dominant sentiment of a candidate is the bucket with the highest
    average score; on an exact tie the later bucket wins.

    Args:
        all_sentiment: Mapping of state -> candidate -> sentiment label ->
            bucket dict with optional ``'scores'`` and ``'num_posts'``.
            Non-dict entries under a candidate (e.g. ``'sentiment'`` and
            ``'avg_sentiment'`` from an earlier run) are ignored, so the
            function can safely be applied to its own output.

    Returns:
        A new mapping with the same shape in which every bucket also has
        ``'min'``, ``'max'``, ``'average'`` and ``'num_posts'``, and every
        candidate with at least one bucket has ``'avg_sentiment'`` and
        ``'sentiment'``. Buckets without scores get ``'scores': [0]``. The
        input is not modified.
    """
    stats = dict()

    for state, by_candidate in all_sentiment.items():
        state_stats = dict()

        for cand, raw in by_candidate.items():
            candidate = raw.copy()
            best_label, best_avg = None, None

            for label, bucket in raw.items():
                if not isinstance(bucket, dict):
                    # summary field left over from a previous run
                    continue

                sent = bucket.copy()
                if not sent.get('scores'):
                    sent['scores'] = [0]

                sent['min'] = min(sent['scores'])
                sent['max'] = max(sent['scores'])
                sent['average'] = np.mean(sent['scores'])

                # Track the best bucket directly rather than through a dict
                # keyed by the (float) average.
                if best_avg is None or sent['average'] >= best_avg:
                    best_label, best_avg = label, sent['average']

                if 'num_posts' not in sent:
                    sent['num_posts'] = 0
                candidate[label] = sent

            if best_label is not None:
                candidate['avg_sentiment'] = best_avg
                candidate['sentiment'] = best_label
            state_stats[cand] = candidate
        stats[state] = state_stats
    return stats

In [None]:
after_cand_stats = get_sentiment_stats(after_cand_sentiments)
before_cand_stats = get_sentiment_stats(before_cand_sentiments)
after_cand_comment_stats = get_sentiment_stats(after_cand_comment_sentiments)
before_cand_comment_stats = get_sentiment_stats(before_cand_comment_sentiments)

In [None]:
save_data('data/candidate_sentiments/after_cand_post_sentiments.json', after_cand_stats)
save_data('data/candidate_sentiments/before_cand_post_sentiments.json', before_cand_stats)
save_data('data/candidate_sentiments/after_cand_comment_sentiments.json', after_cand_comment_stats)
save_data('data/candidate_sentiments/before_cand_comment_sentiments.json', before_cand_comment_stats)

## Political Direction Analysis
Get political direction for each topic. Might look like:
```
'texas': {
    'election': {
        'left': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'center': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       }
       'right': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'direction': 'left/center/right',
       'avg_score': 0.0
    },
    'abortion': {
        'left': {
            ...
       },
       'center': {
            ...
       }
       'right': {
            ...
       },
       'direction': 'left/center/right',
       'avg_score': 0.0
    ...
},
'california': {
    'election': {
        ...
    },
    ...
}

In [45]:
# Tokenizer and classifier used for political-direction scoring. Note that the
# tokenizer is loaded from "bert-base-cased", not from the model's own repo.
pol_dir_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
pol_dir_model = AutoModelForSequenceClassification.from_pretrained("bucketresearch/politicalBiasBERT")
# Maps the classifier's output index to a direction label (used by
# get_direction_label_score).
# NOTE(review): assumes the model's classes are ordered left/center/right —
# TODO confirm against the model card.
dir_map = {
    0: 'left',
    1: 'center',
    2: 'right'
}

2024-11-30 18:00:12.812453: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [46]:
# TEST
text = "I don't believe in climate change"
inputs = pol_dir_tokenizer(text, return_tensors="pt")
labels = torch.tensor([0])
outputs = pol_dir_model(**inputs, labels=labels)
loss, logits = outputs[:2]
print(logits.softmax(dim=-1)[0].tolist())

[0.24744857847690582, 0.17667832970619202, 0.5758731365203857]


In [47]:
# compute highest direction score and label for given text
def get_direction_label_score(text):
    """Return the most likely political direction for ``text`` and its probability.

    Uses the notebook-level ``pol_dir_tokenizer``, ``pol_dir_model`` and
    ``dir_map``.

    Args:
        text: The string to classify. Input is truncated to 512 tokens.

    Returns:
        ``(label, score)`` where ``label`` comes from ``dir_map`` and
        ``score`` is the softmax probability of that label as a Python float.
    """
    inputs = pol_dir_tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Inference only: no dummy `labels` (they merely made the model compute an
    # unused loss) and no autograd bookkeeping. Without labels the first
    # output element is the logits tensor.
    with torch.no_grad():
        logits = pol_dir_model(**inputs)[0]

    # softmax once; scores[i] is the probability of class i
    scores = logits.softmax(dim=-1)[0].tolist()
    top = int(np.argsort(scores)[::-1][0])

    # return the highest-scoring direction together with its score
    return dir_map[top], scores[top]

In [48]:
#TEST
get_direction_label_score("trans lives matter")

('left', 0.4164985120296478)

In [49]:
def get_direction_scores_grouped_posts(grouped_posts):
    """Score every text and bucket the scores by predicted political direction.

    Args:
        grouped_posts: Mapping of state -> topic -> list of text strings.

    Returns:
        Mapping of state -> topic -> ``{'left': {...}, 'center': {...},
        'right': {...}}`` where each bucket is filled by ``record_sentiment``.
        A topic with no texts maps to an empty dict.
    """
    all_directions = dict()

    for state, by_topic in grouped_posts.items():
        state_result = all_directions[state] = dict()

        for topic, texts in by_topic.items():
            buckets = {side: {} for side in ('left', 'center', 'right')}
            scored_any = False

            for text in texts:
                direction, score = get_direction_label_score(text)
                record_sentiment(buckets[direction], score)
                scored_any = True

            # Topics without any text keep an empty dict instead of three
            # empty buckets.
            state_result[topic] = buckets if scored_any else dict()
    return all_directions

In [52]:
after_cand_post_dir = get_direction_scores_grouped_posts(grouped_posts_after)

In [None]:
# before_cand_post_dir = get_direction_scores_grouped_posts(grouped_posts_before)

In [53]:
after_cand_comment_dir = get_direction_scores_grouped_posts(grouped_comments_after)

In [None]:
# before_cand_comment_dir = get_direction_scores_grouped_posts(grouped_comments_before)

In [58]:
save_data('data/topic_pol_directions/after_cand_post_directions.json', after_cand_post_dir)
save_data('data/topic_pol_directions/after_cand_comment_directions.json', after_cand_comment_dir)

In [59]:
# get average, min, max, and dominant direction for each topic in each state
def get_direction_stats(all_directions):
    """Add min/max/average per direction bucket and pick the dominant one.

    The dominant direction of a topic is the bucket with the highest average
    score; on an exact tie the later bucket wins.

    Args:
        all_directions: Mapping of state -> topic -> direction label ->
            bucket dict with optional ``'scores'`` and ``'num_posts'``.
            Non-dict entries under a topic (e.g. ``'direction'`` and
            ``'avg_score'`` from an earlier run) are ignored, so the function
            can safely be applied to its own output.

    Returns:
        A new mapping with the same shape in which every bucket also has
        ``'min'``, ``'max'``, ``'average'`` and ``'num_posts'``, and every
        topic with at least one bucket has ``'avg_score'`` and
        ``'direction'``. Buckets without scores get ``'scores': [0]``. The
        input is not modified.
    """
    stats = dict()

    for state, by_topic in all_directions.items():
        state_stats = dict()

        for topic_name, raw in by_topic.items():
            topic = raw.copy()
            best_direction, best_avg = None, None

            for direction, bucket in raw.items():
                if not isinstance(bucket, dict):
                    # summary field left over from a previous run
                    continue

                entry = bucket.copy()
                if not entry.get('scores'):
                    entry['scores'] = [0]

                entry['min'] = min(entry['scores'])
                entry['max'] = max(entry['scores'])
                entry['average'] = np.mean(entry['scores'])

                # Track the best bucket directly rather than through a dict
                # keyed by the (float) average.
                if best_avg is None or entry['average'] >= best_avg:
                    best_direction, best_avg = direction, entry['average']

                if 'num_posts' not in entry:
                    entry['num_posts'] = 0
                topic[direction] = entry

            if best_direction is not None:
                topic['avg_score'] = best_avg
                topic['direction'] = best_direction
            state_stats[topic_name] = topic
        stats[state] = state_stats
    return stats

In [60]:
after_topic_post_stats = get_direction_stats(after_cand_post_dir)
after_topic_comment_stats = get_direction_stats(after_cand_comment_dir)

In [61]:
save_data('data/topic_pol_directions/after_topic_post_stats.json', after_topic_post_stats)
save_data('data/topic_pol_directions/after_topic_comment_stats.json', after_topic_comment_stats)

# ===== OLD STUFF ===== DO NOT RUN =====

## Option 1: take sentiment of entire post, give an average rating

Get the aggregated sentiment score for each post. Might want something like:
```all_sentiments = {
    'texas': {
        'positive': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'neutral': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       }
       'negative': {
           'scores': [0, 0, ...],
           'average': 0.0,
           'max': 0.0,
           'min': 0.0,
           'num_posts': 0
       },
       'sentiment': 'positive',
       'avg_sentiment': 0.0
    },
    ...
}

In [None]:
# helper for creating the json above
def record_sentiment(sent_data, score):
    """Record one more score in a sentiment bucket (modified in place).

    Creates the ``"scores"`` list and the ``"num_posts"`` counter the first
    time the bucket is used.
    """
    scores = sent_data.setdefault("scores", [])
    scores.append(score)
    sent_data["num_posts"] = 1 + sent_data.get("num_posts", 0)

In [None]:
# iterate through each state and compute highest weight sentiment for each post
def get_sentiment_scores_posts(political_posts):
    """Score every post of every state and bucket the scores by sentiment.

    The title and body of a post are joined into one string before scoring.

    Args:
        political_posts: Mapping of state -> list of ``{"title", "selftext"}``
            dicts.

    Returns:
        Mapping of state -> ``{'positive', 'neutral', 'negative'}`` buckets
        filled by ``record_sentiment``. States without posts are left out.
    """
    all_sentiment = dict()

    for state, state_posts in political_posts.items():
        buckets = {label: {} for label in ('positive', 'neutral', 'negative')}

        for post in state_posts:
            label, score = get_sentiment_label_score(post['title'] + " " + post['selftext'])
            record_sentiment(buckets[label], score)
            # registered on the first scored post only; later calls are no-ops
            all_sentiment.setdefault(state, buckets)
    return all_sentiment

In [None]:
# this is the comments version of the above
def get_sentiment_scores_comments(political_comments):
    """Score every comment of every state and bucket the scores by sentiment.

    Args:
        political_comments: Mapping of state -> list of ``{"body": ...}``
            dicts.

    Returns:
        Mapping of state -> ``{'positive', 'neutral', 'negative'}`` buckets
        filled by ``record_sentiment``. States without comments are left out.
    """
    all_sentiment = dict()

    for state, state_comments in political_comments.items():
        buckets = {label: {} for label in ('positive', 'neutral', 'negative')}

        for comment in state_comments:
            label, score = get_sentiment_label_score(comment['body'])
            record_sentiment(buckets[label], score)
            # registered on the first scored comment only; later calls are no-ops
            all_sentiment.setdefault(state, buckets)
    return all_sentiment

In [None]:
#after_post_sentiments = get_sentiment_scores_posts(after_political_posts)

In [None]:
#before_post_sentiments = get_sentiment_scores_posts(before_political_posts)

In [None]:
#after_comment_sentiments = get_sentiment_scores_comments(after_political_comments)

In [None]:
#before_comment_sentiments = get_sentiment_scores_comments(before_political_comments)

In [None]:
# after_post_sentiments = extract_data('reddit-sentiment-analysis/sentiments/all_posts/after_post_sentiments.json')
# before_post_sentiments = extract_data('reddit-sentiment-analysis/sentiments/all_posts/before_post_sentiments.json')
# after_comment_sentiments = extract_data('reddit-sentiment-analysis/sentiments/all_posts/after_comments_sentiments.json')
# before_comment_sentiments = extract_data('reddit-sentiment-analysis/sentiments/all_posts/before_comments_sentiments.json')

In [None]:
# get average, min, max, and dominant sentiment for each state
def get_sentiment_stats(all_sentiment):
    """Summarize each state's sentiment buckets and pick the dominant one.

    NOTE: this redefines the ``get_sentiment_stats`` declared earlier in the
    notebook, with a different input shape.

    Args:
        all_sentiment: Mapping of state -> sentiment label -> bucket dict.
            Buckets are modified IN PLACE: ``'min'``, ``'max'`` and
            ``'average'`` are added, and missing ``'scores'`` / ``'num_posts'``
            are filled with ``[0]`` / ``0``.

    Returns:
        Mapping of state -> ``{'avg_sentiment', 'sentiment'}`` where
        ``'sentiment'`` is the label with the highest average score.

    Raises:
        ValueError: If a state has no buckets.
    """
    stats = dict()

    for state, state_sent in all_sentiment.items():
        label_by_avg = dict()

        for label, sent in state_sent.items():
            sent.setdefault('scores', [0])

            sent['min'] = min(sent['scores'])
            sent['max'] = max(sent['scores'])
            sent['average'] = np.mean(sent['scores'])
            label_by_avg[sent['average']] = label

            sent.setdefault('num_posts', 0)

        top_avg = max(label_by_avg)
        stats[state] = {
            'avg_sentiment': top_avg,
            'sentiment': label_by_avg[top_avg],
        }
    return stats

In [None]:
after_post_sentiments_stats = get_sentiment_stats(after_post_sentiments)
before_post_sentiments_stats = get_sentiment_stats(before_post_sentiments)

In [None]:
after_comment_sentiments_stats = get_sentiment_stats(after_comment_sentiments)
before_comment_sentiments_stats = get_sentiment_stats(before_comment_sentiments)

In [None]:
save_data('after_post_sentiments.json', after_post_sentiments)
save_data('before_post_sentiments.json', before_post_sentiments)
save_data('after_comment_sentiments.json', after_comment_sentiments)
save_data('before_comment_sentiments.json', before_comment_sentiments)

In [None]:
save_data('after_post_stats.json', after_post_sentiments_stats)
save_data('before_post_stats.json', before_post_sentiments_stats)
save_data('after_comment_stats.json', after_comment_sentiments_stats)
save_data('before_comment_stats.json', before_comment_sentiments_stats)

## Option 2: Get sentiment for each topic per state, print the same statistics as above
Further subdivide the political posts by their main topics. Data will look something like this:
```all_sentiments = {
    'texas': {
        'election': {
            'positive': {
                   'scores': [0, 0, ...],
                   'average': 0.0,
                   'max': 0.0,
                   'min': 0.0,
                   'num_posts': 0
               },
               'neutral': {
                   'scores': [0, 0, ...],
                   'average': 0.0,
                   'max': 0.0,
                   'min': 0.0,
                   'num_posts': 0
               }
               'negative': {
                   'scores': [0, 0, ...],
                   'average': 0.0,
                   'max': 0.0,
                   'min': 0.0,
                   'num_posts': 0
               },
               'sentiment': 'positive',
               'avg_sentiment': 0.0
            },
        },
        'republican': {
            'positive': {
                ...
               },
               'neutral': {
                ...
               }
               'negative': {
                ...
               },
               'sentiment': 'positive',
               'avg_sentiment': 0.0
            },
        },
        ...
    },
    ...
}

In [None]:
# group filtered political posts 
def group_political_posts(posts, topics=None):
    """Group each state's posts by the topics they mention.

    NOTE: this redefines the ``group_political_posts`` declared earlier in
    the notebook.

    A post belongs to a topic when any of the topic's keywords occurs as a
    substring of the lower-cased ``selftext`` or ``title``.

    Args:
        posts: Mapping of state -> list of ``{"selftext", "title"}`` dicts.
        topics: Mapping of topic -> list of lower-case keywords. Defaults to
            the notebook's ``topics_dict``.

    Returns:
        Mapping of state -> topic -> list of unique ``"<selftext> <title>"``
        strings (order unspecified). Every topic is present for every state.
    """
    topics = topics_dict if topics is None else topics
    grouped_posts = dict()

    for state, state_posts in posts.items():
        grouped_posts[state] = dict()

        # Topic loop on the outside: one set per topic that accumulates over
        # ALL posts. Creating the set inside the post loop would leave only
        # the last post's match in the result.
        for topic, words in topics.items():
            post_set = set()
            for post in state_posts:
                body = post["selftext"].lower()
                title = post["title"].lower()
                if any(word in body or word in title for word in words):
                    post_set.add(post["selftext"] + " " + post["title"])
            grouped_posts[state][topic] = list(post_set)
    return grouped_posts

In [None]:
# Group filtered political comments by topic, per state.
def group_political_comments(comments, topics=None):
    """Bucket each state's political comments under every topic they mention.

    A comment belongs to a topic when any of that topic's keywords occurs as
    a substring of the comment's lower-cased ``body``. Keywords are compared
    as given (they are not lower-cased here).

    Args:
        comments: dict mapping a state name to a list of comment dicts, each
            with a string ``"body"`` entry.
        topics: optional dict mapping a topic name to an iterable of keywords.
            Defaults to the notebook-level ``topics_dict``.

    Returns:
        dict ``{state: {topic: [body, ...]}}``. Every topic is present for
        every state (with an empty list when nothing matched). Duplicate
        bodies are dropped and the remaining bodies keep first-seen order.
    """
    if topics is None:
        topics = topics_dict

    grouped_comments = dict()

    for k, state_comments in comments.items():
        # One bucket per topic, created once per state so that matches from
        # all comments accumulate instead of being overwritten by the last
        # comment. A dict is used as an insertion-ordered set.
        buckets = {topic: dict() for topic in topics}

        for comment in state_comments:
            body = comment["body"]
            # Lower-case once per comment rather than once per keyword.
            lowered = body.lower()

            for topic, words in topics.items():
                if any(word in lowered for word in words):
                    buckets[topic][body] = None

        grouped_comments[k] = {topic: list(bodies) for topic, bodies in buckets.items()}
    return grouped_comments

In [None]:
# Group the political posts of each period (after / before) by topic.
grouped_posts_after = group_political_posts(after_political_posts)
grouped_posts_before = group_political_posts(before_political_posts)

In [None]:
# Spot-check: grouped posts for a single state.
grouped_posts_after['michigan']

In [None]:
# Group the political comments of each period (after / before) by topic.
grouped_comments_after = group_political_comments(after_political_comments)
grouped_comments_before = group_political_comments(before_political_comments)

In [None]:
# Spot-check: grouped comments for a single state.
grouped_comments_after['nevada']

In [None]:
# Count how many grouped texts each topic holds per state - for testing.
def count_grouped_posts(grouped_posts):
    """Return ``{state: {topic: number_of_texts}}`` for a grouped mapping.

    Args:
        grouped_posts: dict ``{state: {topic: sized collection}}``.

    Returns:
        A new dict with the same state and topic keys, where each value is
        the ``len`` of the corresponding collection.
    """
    return {
        state: {topic: len(texts) for topic, texts in by_topic.items()}
        for state, by_topic in grouped_posts.items()
    }

In [None]:
#count_grouped_posts(grouped_posts_after)

In [None]:
#count_grouped_posts(grouped_posts_before)

In [None]:
#count_grouped_posts(grouped_comments_after)

In [None]:
#count_grouped_posts(grouped_comments_before)

In [None]:
def get_sentiment_scores_grouped_posts(grouped_posts):
    """Collect sentiment records for every topic of every state.

    Each text is passed to ``get_sentiment_label_score`` (defined elsewhere in
    the notebook), whose result is unpacked into ``(label, score)``. The score
    is then handed to ``record_sentiment`` (also defined elsewhere) together
    with the bucket selected by ``label``, so ``label`` must be one of
    ``'positive'``, ``'neutral'`` or ``'negative'``.

    Args:
        grouped_posts: dict ``{state: {topic: iterable of texts}}``.

    Returns:
        dict ``{state: {topic: buckets}}`` where ``buckets`` has the three
        label keys. A topic without any texts maps to an empty dict instead.
    """
    all_sentiment = {}

    for state, by_topic in grouped_posts.items():
        state_sentiment = {}
        all_sentiment[state] = state_sentiment

        for topic, texts in by_topic.items():
            buckets = {label: {} for label in ('positive', 'neutral', 'negative')}
            # Stays an empty dict unless at least one text is scored below.
            state_sentiment[topic] = {}

            for text in texts:
                label, score = get_sentiment_label_score(text)
                record_sentiment(buckets[label], score)

                state_sentiment[topic] = buckets
    return all_sentiment

In [None]:
# Score post sentiment per topic and state for both periods.
post_sentiment_per_topic_after = get_sentiment_scores_grouped_posts(grouped_posts_after)
post_sentiment_per_topic_before = get_sentiment_scores_grouped_posts(grouped_posts_before)

In [None]:
# Comments go through the same scorer as posts; it is fed the grouped comments here.
comment_sentiment_per_topic_after = get_sentiment_scores_grouped_posts(grouped_comments_after)
comment_sentiment_per_topic_before = get_sentiment_scores_grouped_posts(grouped_comments_before)

In [None]:
# Fill in average, min and max per sentiment, and pick the dominant sentiment for each topic per state.
def get_sentiment_stats(all_sentiment):
    """Summarise per-topic sentiment buckets and pick a dominant sentiment.

    The input is updated in place: every sentiment bucket receives ``'min'``,
    ``'max'`` and ``'average'`` computed from its ``'scores'``. A bucket
    without ``'scores'`` first gets ``[0]``, and a bucket without
    ``'num_posts'`` gets ``0``.

    Args:
        all_sentiment: dict ``{state: {topic: {label: bucket}}}``.

    Returns:
        dict ``{state: {topic: summary}}``. ``summary`` holds
        ``'avg_sentiment'`` (the highest bucket average) and ``'sentiment'``
        (the label owning that average; when several labels share it, the one
        iterated last wins). A topic with no buckets gets an empty summary.
    """
    stats = {}
    for state, by_topic in all_sentiment.items():
        state_stats = stats[state] = {}

        for topic, buckets in by_topic.items():
            summary = state_stats[topic] = {}
            # average -> label; a later label replaces an earlier one with an equal average
            label_by_average = {}

            for label, bucket in buckets.items():
                bucket.setdefault('scores', [0])
                scores = bucket['scores']

                bucket['min'] = min(scores)
                bucket['max'] = max(scores)
                bucket['average'] = np.mean(scores)
                label_by_average[bucket['average']] = label

                bucket.setdefault('num_posts', 0)

            if label_by_average:
                top_average = max(label_by_average)
                summary['avg_sentiment'] = top_average
                summary['sentiment'] = label_by_average[top_average]
    return stats

In [None]:
# Summarise post sentiment per topic. get_sentiment_stats also writes
# min/max/average into the sentiment dicts it is given (in place).
post_stats_per_topic_after = get_sentiment_stats(post_sentiment_per_topic_after)
post_stats_per_topic_before = get_sentiment_stats(post_sentiment_per_topic_before)

In [None]:
# Summarise comment sentiment per topic. get_sentiment_stats also writes
# min/max/average into the sentiment dicts it is given (in place).
comment_stats_per_topic_after = get_sentiment_stats(comment_sentiment_per_topic_after)
comment_stats_per_topic_before = get_sentiment_stats(comment_sentiment_per_topic_before)

In [None]:
# Hand the per-topic sentiment dicts to save_data (defined elsewhere in the
# notebook; presumably writes each one to the named JSON file - confirm).
save_data('after_post_sentiments_per_topic.json', post_sentiment_per_topic_after)
save_data('before_post_sentiments_per_topic.json', post_sentiment_per_topic_before)
save_data('after_comment_sentiments_per_topic.json', comment_sentiment_per_topic_after)
save_data('before_comment_sentiments_per_topic.json', comment_sentiment_per_topic_before)

In [None]:
# Hand the per-topic dominant-sentiment summaries to save_data (defined
# elsewhere in the notebook; presumably writes each one to the named JSON file - confirm).
save_data('after_post_stats_per_topic.json', post_stats_per_topic_after)
save_data('before_post_stats_per_topic.json', post_stats_per_topic_before)
save_data('after_comment_stats_per_topic.json', comment_stats_per_topic_after)
save_data('before_comment_stats_per_topic.json', comment_stats_per_topic_before)

## Testing Topic Modeling for Political Data Extraction
Probably don't run the code below.

In [None]:
# NOTE(review): this import sits inside the experimental section rather than
# with the notebook's other imports, so the rest of the notebook does not
# require bertopic to be installed.
from bertopic import BERTopic
# Load a pretrained topic model by its identifier
# (presumably fetched from the Hugging Face Hub on first use - confirm).
topic_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")

# Tabular overview of the model's topics; later cells read its "Representation" column.
model_topics = topic_model.get_topic_info()

In [None]:
# Display the topic overview table.
model_topics

In [None]:
# Collect the IDs of the model topics whose representation contains one of our keywords.
# The ID is read from the "Topic" column instead of using the row position:
# the two differ whenever the table contains BERTopic's outlier topic (-1),
# and later cells compare these values against the topic IDs returned by
# topic_model.transform().
# NOTE(review): if `keywords` is still the {topic: [words]} dict loaded at the
# top of the notebook, this loop iterates over the topic names, not the
# individual words - confirm which is intended.
political_topic_nums = []
for w in keywords:
    for topic_id, representation in zip(model_topics["Topic"], model_topics["Representation"]):
        topic_id = int(topic_id)
        # Record each topic once, even when several keywords hit it.
        if w in representation and topic_id not in political_topic_nums:
            political_topic_nums.append(topic_id)

In [None]:
# Display the collected topic numbers.
political_topic_nums

In [None]:
# The topic model is given a list of documents per state, so build
# { state1: ["title1 text1", "title2 text2", ...], state2: ["title1 text1", ...] }
# where each document is the post title and selftext joined by a single space.
# NOTE(review): `political_posts` is not defined in this part of the notebook
# (the cells above work with after_political_posts / before_political_posts) -
# confirm it exists on a fresh kernel.
state_to_post = dict()

for k in political_posts.keys():
    posts = []
    for post in political_posts[k]:
        post_str = post['title'] + " " + post['selftext']
        posts.append(post_str)
    state_to_post[k] = posts

In [None]:
# Run every state's documents through the topic model and keep, per state,
# the two values returned by transform(). States without any posts are skipped,
# so they have no entry in state_to_topic.
state_to_topic = dict()

for k in state_to_post.keys():
    if len(state_to_post[k]) != 0:
        post_topics, post_probs = topic_model.transform(state_to_post[k])
        state_to_topic[k] = {
            'topics': post_topics,
            'probabilities': post_probs
        }

In [None]:
# Display the per-state topic assignments and probabilities.
state_to_topic

In [None]:
# For each state, keep the positions of the posts whose assigned topic is in
# political_topic_nums and whose probability is at least 0.5.
# The stored values are indices into that state's list of documents, not the posts themselves.
# NOTE(review): assumes state_probs[i] is a single number per post; if
# transform() returned one row of probabilities per post, the `>= 0.5`
# comparison would not yield a plain boolean - confirm.
political_posts_modeled = dict()

for k in state_to_topic.keys():
    indices = []
    state = state_to_topic[k]
    state_topics = state['topics']
    state_probs = state['probabilities']

    for i in range(len(state_topics)):
        if state_topics[i] in political_topic_nums and state_probs[i] >= 0.5:
            indices.append(i)
    political_posts_modeled[k] = indices

In [None]:
# Display the selected post indices per state.
political_posts_modeled