<a href="https://colab.research.google.com/github/bheki-maenetja/Calc50/blob/master/Models2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Development

## Imports

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json

# Load the CSV file into a DataFrame
df = pd.read_parquet("hf://datasets/Jacobvs/PoliticalTweets/formatted_data.parquet")

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Set stop words
stop_words = set(stopwords.words('english'))

# Function to clean tweets
def clean_tweet(tweet):
    tweet = tweet.lower()  # Convert to lowercase
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)  # Remove URLs
    tweet = re.sub(r'\@\w+|\#','', tweet)  # Remove mentions and hashtags
    tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)  # Remove punctuation
    tweet_tokens = word_tokenize(tweet)  # Tokenize the tweet
    filtered_words = [word for word in tweet_tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(filtered_words)

# Apply cleaning function to tweets
df['Cleaned_Tweet'] = df['text'].apply(clean_tweet)

# Combine all tweets into a single text
republican_tweets = df[df['party'] == 'Republican']['Cleaned_Tweet'].tolist()
democratic_tweets = df[df['party'] == 'Democrat']['Cleaned_Tweet'].tolist()

# Define the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))  # Use bigrams

# Fit the vectorizer on all tweets
all_tweets = republican_tweets + democratic_tweets
tfidf_vectorizer.fit(all_tweets)

# Transform tweets to get TF-IDF scores
repub_tfidf = tfidf_vectorizer.transform(republican_tweets)
dem_tfidf = tfidf_vectorizer.transform(democratic_tweets)

# Calculate average TF-IDF scores for each phrase
repub_avg_tfidf = np.array(repub_tfidf.mean(axis=0)).flatten()
dem_avg_tfidf = np.array(dem_tfidf.mean(axis=0)).flatten()

# Get the feature names (phrases)
phrases = tfidf_vectorizer.get_feature_names_out()

# Calculate the difference in average TF-IDF scores
tfidf_diff = repub_avg_tfidf - dem_avg_tfidf

# Get phrases with the largest differences
num_phrases = 50  # Number of top phrases to extract
top_repub_phrases = [phrases[i] for i in tfidf_diff.argsort()[-num_phrases:]]
top_dem_phrases = [phrases[i] for i in tfidf_diff.argsort()[:num_phrases]]

# Build the dictionary
partisan_dict = {
    "republican": top_repub_phrases,
    "democratic": top_dem_phrases
}

# # Function to save data to a JSON file
# def save_to_json(data, filename):
#     with open(filename, "w") as outfile:
#         json.dump(data, outfile, indent=4)

# # Save the dictionary to a JSON file
# save_to_json(partisan_dict, "distinguishing_partisan_dictionary.json")

# print("Partisan dictionary saved to distinguishing_partisan_dictionary.json")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
partisan_dict['democratic']

['health care',
 'american rescue',
 'rescue plan',
 'gun violence',
 'climate change',
 'voting rights',
 'child care',
 'make sure',
 'bipartisan infrastructure',
 'climate crisis',
 'across country',
 'im proud',
 'working families',
 'get done',
 'mental health',
 'clean energy',
 'right vote',
 'social security',
 'get vaccinated',
 'infrastructure law',
 'years ago',
 'back better',
 'tax credit',
 'child tax',
 'loved ones',
 'john lewis',
 'open enrollment',
 'build back',
 'im glad',
 'federal funding',
 'inflation reduction',
 'reduction act',
 'goodpaying jobs',
 'health insurance',
 'must pass',
 'dark money',
 'new mexico',
 'lower costs',
 'ill keep',
 'affordable health',
 'climate action',
 'new hampshire',
 'hate crimes',
 'town hall',
 'im working',
 'granite staters',
 'keep working',
 'paid leave',
 'house passed',
 'civil rights']

In [3]:
partisan_dict['republican']

['white house',
 'energy independence',
 'petroleum reserve',
 'energy production',
 'reckless tax',
 'illegal aliens',
 'took office',
 'border policies',
 'federal government',
 'brave men',
 'taxpayer dollars',
 'spending bill',
 'strategic petroleum',
 'democrats want',
 'vaccine mandates',
 'year high',
 'irs agents',
 'crisis border',
 'biden took',
 'open border',
 'border security',
 'tax spending',
 'god bless',
 'law enforcement',
 'big tech',
 'border patrol',
 'crisis southern',
 'democrats reckless',
 'communist party',
 'chinese communist',
 'gas prices',
 'reckless spending',
 'communist china',
 'spending spree',
 'secure border',
 'biden administrations',
 'vaccine mandate',
 'american energy',
 'biden admin',
 'illegal immigrants',
 'american people',
 'national security',
 'joe bidens',
 'men women',
 'president bidens',
 'border crisis',
 'president biden',
 'joe biden',
 'biden administration',
 'southern border']