In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import torch
from scipy.stats import pointbiserialr
from sklearn.preprocessing import LabelEncoder
import os

In [2]:
political_leaning = pd.read_csv('datasets/political_leaning.csv')

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model and tokenizer for aggression detection
model_name = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Function to compute aggression score
def detect_aggression(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    scores = torch.softmax(outputs.logits, dim=1)
    return scores[0][1].item()  # Aggression (toxic) score

tqdm.pandas()

political_leaning["aggression"] = political_leaning["post"].progress_apply(detect_aggression)

100%|██████████████████████████████████████████████████████████████████████████| 57231/57231 [8:51:10<00:00,  1.80it/s]


In [4]:
output_file1 = os.path.join("datasets", "political_leaning_with_aggression2.csv")
political_leaning.to_csv(output_file1, index=False)
print(political_leaning.head())

     auhtor_ID                                               post  \
0  t2_7ramzeng  You can "buy" the show and stream it through t...   
1  t2_7ramzeng  me want to play Q*bert Holy shit, based Alex J...   
2  t2_7ramzeng  Shouldn't rely on any external services or per...   
3  t2_7ramzeng  PR to a specific person. Usually that just mea...   
4  t2_7ramzeng  This article's intention is clear that they wa...   

  political_leaning  aggression  
0             right    0.054291  
1             right    0.002662  
2             right    0.063582  
3             right    0.061321  
4             right    0.031205  


In [6]:
display(political_leaning)

Unnamed: 0,auhtor_ID,post,political_leaning,aggression
0,t2_7ramzeng,"You can ""buy"" the show and stream it through t...",right,0.054291
1,t2_7ramzeng,"me want to play Q*bert Holy shit, based Alex J...",right,0.002662
2,t2_7ramzeng,Shouldn't rely on any external services or per...,right,0.063582
3,t2_7ramzeng,PR to a specific person. Usually that just mea...,right,0.061321
4,t2_7ramzeng,This article's intention is clear that they wa...,right,0.031205
...,...,...,...,...
57226,t2_4ngvl16j,a good one? That's odd. I remember it as being...,center,-100.000000
57227,t2_4ngvl16j,"boring shit in the fucking world. ""History doe...",center,-100.000000
57228,t2_4ngvl16j,you see no contradiction there? Why or why not...,center,-100.000000
57229,t2_4ngvl16j,is only created by an incommensurate worldview...,center,-100.000000


In [None]:
#display(political_leaning)

# run for like 30 different aspects of the text: aggression, sentiment, spelling, emoji
# obtain a value for each one
# then using something equivalent to linear regression identify 5-10 most relevant aspects
# use them for prediction, further refine model
# outcomes:
#  1: prediction of political leaning/age/whatever variable we decide
#  2: do this through investigating:
#  2.1: patterns in aggression, spelling, ..., across different population demographics
#  2.2: patterns in the interaction between different aspects of writing, via interaction terms
#  2.3: some findings on ease of predicting gender, age and political leaning


# 1.2.3. Sentiment (positive, negative, neutral)
# 4. Aggression
# 5.6.7.8. Emotion Analysis (e.g., joy, anger, sadness, fear), 
# 9. Polarity
# 10. Readability (e.g., Flesch Reading Ease)
# 11. Topic Modeling

# use a combination of sentiment models, for example use 3: one binary, one non binary, choose ones that work on a different basis 
# use ensemble learning

# after training model


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

# Load model and tokenizer for sentiment analysis
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Define a maximum length for input sequences
MAX_LENGTH = 512  # Typical maximum length for most transformer models

# Function to compute sentiment score
def detect_sentiment(text):
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        padding="max_length", 
        max_length=MAX_LENGTH
    )
    outputs = model(**inputs)
    scores = torch.softmax(outputs.logits, dim=1)
    sentiment = torch.argmax(scores).item()  # 0: Negative, 1: Neutral, 2: Positive
    return sentiment

tqdm.pandas()

# Apply sentiment detection
political_leaning["sentiment"] = political_leaning["post"].progress_apply(detect_sentiment)


100%|██████████████████████████████████████████████████████████████████████████| 57231/57231 [9:25:43<00:00,  1.69it/s]


In [5]:
output_file2 = os.path.join("datasets", "political_leaning_with_aggression3.csv")
political_leaning.to_csv(output_file2, index=False)

     auhtor_ID                                               post  \
0  t2_7ramzeng  You can "buy" the show and stream it through t...   
1  t2_7ramzeng  me want to play Q*bert Holy shit, based Alex J...   
2  t2_7ramzeng  Shouldn't rely on any external services or per...   
3  t2_7ramzeng  PR to a specific person. Usually that just mea...   
4  t2_7ramzeng  This article's intention is clear that they wa...   

  political_leaning  sentiment  
0             right          1  
1             right          0  
2             right          1  
3             right          1  
4             right          0  


In [4]:
kaggle_df = pd.read_csv('datasets/Extra dataset.csv', nrows=500)

In [2]:
import pandas as pd

kaggle_df2 = pd.read_csv('datasets/Extra dataset.csv', nrows=500)

# Check for NaNs in the dataframe
nan_check = kaggle_df2.isna().sum()
print(nan_check[nan_check > 0])

Series([], dtype: int64)


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

#Running for the second alternative dataset

model_name = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Function to compute aggression score
def detect_aggression(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    scores = torch.softmax(outputs.logits, dim=1)
    return scores[0][1].item()  # Aggression (toxic) score

tqdm.pandas()

kaggle_df["aggression"] = kaggle_df["tweet"].progress_apply(detect_aggression)

output_file = os.path.join("datasets", "KaggleAggresion.csv")
kaggle_df.to_csv(output_file, index=False)
display(kaggle_df.head())

Unnamed: 0,political_leaning,tweet,aggression
0,Conservative,Won’t happen. This judges loathes Trump.,0.002125
1,Conservative,#Trump2024,0.054055
2,Liberal,Folks don't transition because cramps hurt...\...,0.022347
3,Conservative,Title is misleading. Makes it sound like 82% v...,0.076696
4,Liberal,This is facts,0.06863


In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

# Load model and tokenizer for sentiment analysis
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Define a maximum length for input sequences
MAX_LENGTH = 512  # Typical maximum length for most transformer models

# Function to compute sentiment score
def detect_sentiment(text):
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        padding="max_length", 
        max_length=MAX_LENGTH
    )
    outputs = model(**inputs)
    scores = torch.softmax(outputs.logits, dim=1)
    sentiment = torch.argmax(scores).item()  # 0: Negative, 1: Neutral, 2: Positive
    return sentiment

tqdm.pandas()

# Apply sentiment detection
kaggle_df["sentiment"] = kaggle_df["tweet"].progress_apply(detect_sentiment)
output_file = os.path.join("datasets", "KaggleAggresion.csv")
kaggle_df.to_csv(output_file, index=False)
display(kaggle_df.head())

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [05:39<00:00,  1.47it/s]


Unnamed: 0,political_leaning,tweet,aggression,sentiment
0,Conservative,Won’t happen. This judges loathes Trump.,0.002125,0
1,Conservative,#Trump2024,0.054055,1
2,Liberal,Folks don't transition because cramps hurt...\...,0.022347,0
3,Conservative,Title is misleading. Makes it sound like 82% v...,0.076696,0
4,Liberal,This is facts,0.06863,1


None