## Imports

In [None]:
import torch
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import textstat
from sklearn.feature_extraction.text import CountVectorizer

# Report whether this PyTorch build ships the MPS backend and whether it can
# actually be used on this machine (both are expected to be True when MPS works).
print(f"MPS built: {torch.backends.mps.is_built()}")
print(f"MPS available: {torch.backends.mps.is_available()}")

# Prefer the MPS device when it is usable; otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Load the dataset from CSV (its 'post' and 'political_leaning' columns are
# used by the cells below) and preview the first rows.
political_leaning = pd.read_csv('datasets/political_leaning.csv')
political_leaning.head()

## Apply a keyword heuristic to the polluted data as a baseline model for later comparison

In [None]:
def classify_political_leaning(text):
    """Label a post as 'right', 'left' or 'center' using a keyword heuristic.

    Each keyword list scores one point per keyword that appears anywhere in
    the lower-cased text. Matching is by substring, so 'patriot' also matches
    'patriotic', and a keyword that occurs several times still scores only
    once. The side with more distinct keywords wins; a tie, including a text
    that contains no keyword at all, is labelled 'center'.

    Args:
        text: The post to classify. A value that is not a string (for example
            the float NaN that pandas uses for a missing cell, or None)
            contains no keywords and is therefore labelled 'center' instead
            of raising AttributeError on ``.lower()``.

    Returns:
        One of the strings 'right', 'left' or 'center'.
    """
    conservative_keywords = ('freedom', 'economy', 'taxes', 'patriot', 'security')
    liberal_keywords = ('equality', 'rights', 'climate', 'justice', 'diversity')

    # Missing or non-text cells carry no keywords: treat them as a tie.
    if not isinstance(text, str):
        return 'center'

    # Lower-case once so the substring tests below are case-insensitive.
    text = text.lower()

    # Number of distinct keywords from each list that are present in the text.
    conservative_count = sum(word in text for word in conservative_keywords)
    liberal_count = sum(word in text for word in liberal_keywords)

    if conservative_count > liberal_count:
        return 'right'
    if liberal_count > conservative_count:
        return 'left'
    return 'center'

In [None]:
# Apply the heuristic model to every post and store the predicted label in a
# new 'heuristic_political_leaning' column.
# NOTE: 'post' must be the column that holds the text; change it if the
# dataset names that column differently.
political_leaning['heuristic_political_leaning'] = political_leaning['post'].apply(classify_political_leaning)

In [None]:
from sklearn.metrics import classification_report

# Print the classification report of the heuristic labels against the
# ground-truth 'political_leaning' column.
print(
    classification_report(
        y_true=political_leaning['political_leaning'],
        y_pred=political_leaning['heuristic_political_leaning'],
    )
)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Ground-truth labels and the labels predicted by the keyword heuristic.
y_true = political_leaning['political_leaning']
y_pred = political_leaning['heuristic_political_leaning']

# Print the classification report first, then the overall accuracy rounded
# to two decimals.
print("Classification Report:")
print(classification_report(y_true=y_true, y_pred=y_pred))
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")


## Apply the same keyword heuristic to the clean data as a baseline model for later comparison

In [None]:
import pandas as pd
# Load the clean variant of the dataset from CSV and preview the first rows.
political_leaning_clean = pd.read_csv('datasets/political_leaning_clean.csv')
political_leaning_clean.head()

In [None]:
# Apply the heuristic model to every post of the clean dataset and store the
# predicted label in a new 'heuristic_political_leaning' column.
political_leaning_clean['heuristic_political_leaning'] = political_leaning_clean['post'].apply(classify_political_leaning)

In [None]:
from sklearn.metrics import classification_report

# Print the classification report of the heuristic labels against the
# ground-truth 'political_leaning' column of the clean dataset.
print(
    classification_report(
        y_true=political_leaning_clean['political_leaning'],
        y_pred=political_leaning_clean['heuristic_political_leaning'],
    )
)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Ground-truth labels and the heuristic's predictions for the clean dataset.
y_true = political_leaning_clean['political_leaning']
y_pred = political_leaning_clean['heuristic_political_leaning']

# Print the classification report first, then the overall accuracy rounded
# to two decimals.
print("Classification Report:")
print(classification_report(y_true=y_true, y_pred=y_pred))
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")
