In [None]:
# Step 1: Install Required Libraries (if needed)
!pip install scikit-learn pandas numpy gdown

# Step 2: Clone the Project Repository or Upload Data
!unzip /content/CIS_5300_Final_Project-main.zip -d /content/

# Step 3: Navigate to the Project Directory
%cd /content/CIS_5300_Final_Project-main/

Archive:  /content/CIS_5300_Final_Project-main.zip
4c01898bf78ffdc539f20b411693d7fc1ab8d11b
   creating: /content/CIS_5300_Final_Project-main/
  inflating: /content/CIS_5300_Final_Project-main/README.md  
  inflating: /content/CIS_5300_Final_Project-main/Simple_Baseline.ipynb  
  inflating: /content/CIS_5300_Final_Project-main/data.ipynb  
  inflating: /content/CIS_5300_Final_Project-main/data.md  
   creating: /content/CIS_5300_Final_Project-main/data/
   creating: /content/CIS_5300_Final_Project-main/data/2020_elections_results/
  inflating: /content/CIS_5300_Final_Project-main/data/2020_elections_results/president_county_candidate.csv  
   creating: /content/CIS_5300_Final_Project-main/data/2020_tweets/
  inflating: /content/CIS_5300_Final_Project-main/data/2020_tweets/.gitignore  
   creating: /content/CIS_5300_Final_Project-main/data/factoid_reddit/
  inflating: /content/CIS_5300_Final_Project-main/data/factoid_reddit/.gitignore  
   creating: /content/CIS_5300_Final_Project-main/

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [None]:
# Setup Directories and Download Files
# Create required directories if not already present
# (all paths are relative to the current working directory)
os.makedirs('raw_data/labeled_tweets_georgetown/', exist_ok=True)
os.makedirs('data/labeled_tweets_georgetown/', exist_ok=True)
os.makedirs('raw_data/2020_tweets/', exist_ok=True)
os.makedirs('raw_data/factoid_reddit/', exist_ok=True)
# NOTE(review): nothing in this cell places files in raw_data/labeled_tweets_georgetown/;
# presumably the raw Georgetown CSVs are provided some other way — confirm.

# Download necessary files
# Each gdown call is given a file ID; the trailing comment names the file it yields.
!gdown 1qrrznvcHkyUPoxq4GbauPxcT3mMGILVw  # archive.zip
!gdown 1m_0tZXsqQSxaogvby83B5CRzgJcspFvU  # reddit_corpus_unbalanced_filtered.gzip

# Move files to the correct directories
# (gdown saves into the working directory, so the files are relocated afterwards)
!mv archive.zip raw_data/2020_tweets/
!mv reddit_corpus_unbalanced_filtered.gzip raw_data/factoid_reddit/

Downloading...
From (original): https://drive.google.com/uc?id=1qrrznvcHkyUPoxq4GbauPxcT3mMGILVw
From (redirected): https://drive.google.com/uc?id=1qrrznvcHkyUPoxq4GbauPxcT3mMGILVw&confirm=t&uuid=dba3a5b1-9b65-4455-a115-eb58e4becae9
To: /content/CIS_5300_Final_Project-main/archive.zip
100% 370M/370M [00:23<00:00, 15.8MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1m_0tZXsqQSxaogvby83B5CRzgJcspFvU
From (redirected): https://drive.google.com/uc?id=1m_0tZXsqQSxaogvby83B5CRzgJcspFvU&confirm=t&uuid=4140ec9f-cd50-44cb-9e70-bce533ae4b86
To: /content/CIS_5300_Final_Project-main/reddit_corpus_unbalanced_filtered.gzip
100% 369M/369M [00:07<00:00, 49.2MB/s]


In [None]:
# Preprocess Georgetown Labeled Tweets Dataset
# raw_data_path is the directory the raw CSVs are read from and output_path is
# where the converted CSVs are written. Both names are reassigned by the later
# preprocessing cells, so they always refer to the most recently run dataset.
print("\nProcessing Georgetown Labeled Tweets Dataset...")
raw_data_path = 'raw_data/labeled_tweets_georgetown/'
output_path = 'data/labeled_tweets_georgetown/'

# Define stance determination function
def determine_stance(row):
    """
    Map a row's (label, candidate) pair to a numeric stance.

    Args:
    - row: Mapping-like object (e.g. a DataFrame row) with a 'label' entry and,
      unless the label is "NONE", a 'candidate' entry.

    Returns:
    - 0 when the label is "NONE" (the candidate is not consulted).
    - 1 for FAVOR/Trump or AGAINST/Biden.
    - -1 for FAVOR/Biden or AGAINST/Trump.
    - None for any other combination.
    """
    label = row['label']
    if label == "NONE":
        return 0

    # Looked up only after the NONE check, so NONE rows never need a candidate.
    candidate = row['candidate']
    stance_by_pair = {
        ("FAVOR", "Trump"): 1,
        ("AGAINST", "Biden"): 1,
        ("FAVOR", "Biden"): -1,
        ("AGAINST", "Trump"): -1,
    }
    # Unrecognised pairs fall through to None.
    return stance_by_pair.get((label, candidate))

# Convert every CSV found in raw_data_path and write the result, under the same
# file name, to output_path
for csv_file in os.listdir(raw_data_path):
    # Entries that are not CSV files are left alone
    if not csv_file.endswith(".csv"):
        continue

    file_path = os.path.join(raw_data_path, csv_file)
    output_file_path = os.path.join(output_path, csv_file)

    df = pd.read_csv(file_path)
    # Row-wise stance; rows that determine_stance does not recognise get a missing value
    df['stance'] = df.apply(determine_stance, axis=1)
    # The source columns are dropped once the stance has been derived from them
    df = df.drop(columns=['tweet_id', 'label', 'candidate'])
    df.to_csv(output_file_path, index=False)
print("Georgetown dataset processing complete.\n")


Processing Georgetown Labeled Tweets Dataset...
Georgetown dataset processing complete.



In [None]:
# Preprocess FACTOID Reddit Dataset
# Turns the pickled corpus into one (text, stance) row per document entry and
# writes train/dev/test CSVs to output_path.
print("Processing FACTOID Reddit Dataset...")
raw_data_path = 'raw_data/factoid_reddit/'
output_path = 'data/factoid_reddit/'
os.makedirs(output_path, exist_ok=True)

# SECURITY NOTE(review): read_pickle unpickles a downloaded file, and unpickling
# can execute arbitrary code — only run this on a file from a trusted source.
df_raw = pd.read_pickle(raw_data_path + 'reddit_corpus_unbalanced_filtered.gzip', compression='gzip')

# Keep only necessary columns
columns_to_keep = ['documents', 'pb_factor']
df_raw = df_raw[columns_to_keep]

# Transform into text and stance columns
# - text: element [1] of every entry in 'documents' (presumably the post text
#   inside a tuple — TODO confirm the tuple layout)
# - stance: pb_factor below -0.5 -> -1, above 0.5 -> 1, anything else -> 0
df = pd.DataFrame({
    "text": df_raw["documents"].apply(lambda x: [tup[1] for tup in x]),
    "stance": df_raw["pb_factor"].apply(lambda x: -1 if x < -0.5 else (1 if x > 0.5 else 0))
})
# One output row per list element; the stance value is repeated for each of them
df = df.explode("text").reset_index(drop=True)

# Split data into train, dev, and test sets
# 20% is held out, then halved into dev and test (80/10/10 overall).
# NOTE(review): the split runs after explode, so texts that came from the same
# source row can end up in different splits — confirm this is intended.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save processed datasets
train_df.to_csv(output_path + 'train.csv', index=False)
dev_df.to_csv(output_path + 'dev.csv', index=False)
test_df.to_csv(output_path + 'test.csv', index=False)
print("FACTOID dataset processing complete.\n")

Processing FACTOID Reddit Dataset...
FACTOID dataset processing complete.



In [None]:
# Preprocess 2020 Tweets Dataset
print("Processing 2020 Tweets Dataset...")
!unzip raw_data/2020_tweets/archive.zip -d raw_data/2020_tweets/
raw_data_path = 'raw_data/2020_tweets/'
output_path = 'data/2020_tweets/'

# Ensure the output directory exists
os.makedirs(output_path, exist_ok=True)

# Load data
df_biden = pd.read_csv(raw_data_path + 'hashtag_joebiden.csv', lineterminator='\n')
df_trump = pd.read_csv(raw_data_path + 'hashtag_donaldtrump.csv', lineterminator='\n')

# Remove duplicates and sort by date
df_biden['created_at'] = pd.to_datetime(df_biden['created_at'])
df_biden = df_biden.sort_values(by='created_at', ascending=False).drop_duplicates(subset='tweet_id')

df_trump['created_at'] = pd.to_datetime(df_trump['created_at'])
df_trump = df_trump.sort_values(by='created_at', ascending=False).drop_duplicates(subset='tweet_id')

# Add candidate labels
df_biden['contains'] = "Biden"
df_trump['contains'] = "Trump"

# Merge datasets and clean
df = pd.concat([df_biden, df_trump])
df['contains'] = df.groupby('tweet_id')['contains'].transform(
    lambda x: 'Both' if len(set(x)) > 1 else x
)
df = df.drop_duplicates(subset='tweet_id').reset_index(drop=True)

# Split data into train, dev, and test sets
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save processed datasets
train_df.to_csv(output_path + 'train.csv', index=False)
dev_df.to_csv(output_path + 'dev.csv', index=False)
test_df.to_csv(output_path + 'test.csv', index=False)
print("2020 Tweets dataset processing complete.\n")

Processing 2020 Tweets Dataset...
Archive:  raw_data/2020_tweets/archive.zip
replace raw_data/2020_tweets/hashtag_donaldtrump.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: raw_data/2020_tweets/hashtag_donaldtrump.csv  
replace raw_data/2020_tweets/hashtag_joebiden.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: raw_data/2020_tweets/hashtag_joebiden.csv  
2020 Tweets dataset processing complete.



In [None]:
# Verify Processed Data: list the contents of each processed-data directory
print("Processed files:")
for processed_dir in (
    'data/labeled_tweets_georgetown/',
    'data/factoid_reddit/',
    'data/2020_tweets/',
):
    print(os.listdir(processed_dir))

Processed files:
['train.csv', 'dev.csv', '.gitignore', 'test.csv']
['train.csv', 'dev.csv', '.gitignore', 'test.csv']
['.gitignore']


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Load datasets
# on_bad_lines='skip' silently drops lines the parser rejects, so a loaded frame
# can contain fewer rows than its CSV file; engine='python' selects pandas'
# Python parsing engine.
factoid_train = pd.read_csv('data/factoid_reddit/train.csv', on_bad_lines='skip', engine='python')
factoid_dev = pd.read_csv('data/factoid_reddit/dev.csv', on_bad_lines='skip', engine='python')
factoid_test = pd.read_csv('data/factoid_reddit/test.csv', on_bad_lines='skip', engine='python')

georgetown_train = pd.read_csv('data/labeled_tweets_georgetown/train.csv', on_bad_lines='skip', engine='python')
georgetown_dev = pd.read_csv('data/labeled_tweets_georgetown/dev.csv', on_bad_lines='skip', engine='python')
georgetown_test = pd.read_csv('data/labeled_tweets_georgetown/test.csv', on_bad_lines='skip', engine='python')

# Check for NaNs and fill them with empty strings
# (each frame's 'text' column is replaced in place)
for dataset in [factoid_train, factoid_dev, factoid_test, georgetown_train, georgetown_dev, georgetown_test]:
    dataset['text'] = dataset['text'].fillna("")

In [None]:
def train_baseline_model(train_data, dev_data, test_data, vectorizer=None):
    """
    Trains a TF-IDF + Logistic Regression baseline and predicts on the test split.

    Args:
    - train_data, test_data: DataFrames with 'text' and 'stance' columns.
    - dev_data: Kept in the signature so existing callers keep working; it is
      not used here, because no dev-set result is returned.
    - vectorizer: Optional TfidfVectorizer that has already been fitted. It is
      only applied, never re-fitted. When None, a new TfidfVectorizer (unigrams,
      at most 2000 features) is fitted on the training text.

    Returns:
    - (model, vectorizer, y_test, y_test_pred): the fitted LogisticRegression,
      the vectorizer that was used, the true test labels, and the predicted
      test labels.
    """
    # Split into inputs and labels
    X_train, y_train = train_data['text'], train_data['stance']
    X_test, y_test = test_data['text'], test_data['stance']

    # Vectorize text
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 1))
        X_train_vec = vectorizer.fit_transform(X_train)
    else:
        X_train_vec = vectorizer.transform(X_train)

    X_test_vec = vectorizer.transform(X_test)

    # Train a Logistic Regression baseline
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train_vec, y_train)

    # Predict on the test set. The previous dev-set transform and prediction
    # were computed and then discarded, so they are no longer performed.
    y_test_pred = model.predict(X_test_vec)

    return model, vectorizer, y_test, y_test_pred

In [None]:
# Discard rows with a missing stance label from every FACTOID split
factoid_train, factoid_dev, factoid_test = [
    frame.dropna(subset=['stance'])
    for frame in (factoid_train, factoid_dev, factoid_test)
]

In [None]:
# Print the number of missing stance labels for the train, dev and test splits
for frame in (factoid_train, factoid_dev, factoid_test):
    print(frame['stance'].isna().sum())

0
0
0


In [None]:
# Train the baseline on FACTOID. No vectorizer is passed, so train_baseline_model
# fits a new one; the returned true and predicted test labels are kept for evaluation.
factoid_model, factoid_vectorizer, factoid_y_test, factoid_y_test_pred = train_baseline_model(
    factoid_train, factoid_dev, factoid_test
)

In [None]:
import torch

# Report whether CUDA is usable and, if so, the name of device 0
if not torch.cuda.is_available():
    print("GPU is not available. Check your environment.")
else:
    print("GPU is available!")
    print(f"Device Name: {torch.cuda.get_device_name(0)}")

GPU is available!
Device Name: Tesla T4


In [None]:
# Discard rows with a missing stance label from every Georgetown split
georgetown_train, georgetown_dev, georgetown_test = [
    frame.dropna(subset=['stance'])
    for frame in (georgetown_train, georgetown_dev, georgetown_test)
]

In [None]:
# Print the number of missing stance labels for the train, dev and test splits
for frame in (georgetown_train, georgetown_dev, georgetown_test):
    print(frame['stance'].isna().sum())

0
0
0


In [None]:
# Train the baseline on Georgetown. No vectorizer is passed, so train_baseline_model
# fits a new one; the returned true and predicted test labels are kept for evaluation.
georgetown_model, georgetown_vectorizer, georgetown_y_test, georgetown_y_test_pred = train_baseline_model(
    georgetown_train, georgetown_dev, georgetown_test
)

In [None]:
from evaluate import evaluate_model, display_evaluation_results


def _evaluate_and_report(y_true, y_pred, output_file, heading):
    """
    Evaluate one set of predictions, write the report, then print it.

    Args:
    - y_true, y_pred: True and predicted labels for the test split.
    - output_file: Path passed to display_evaluation_results and read back here.
    - heading: Line printed before the report text.

    Returns:
    - (labels, results): the sorted distinct labels found in y_true and the
      value returned by evaluate_model.
    """
    labels = sorted(set(y_true))  # Dynamically get labels from y_true
    results = evaluate_model(y_true, y_pred, labels)
    display_evaluation_results(results, labels, output_file)

    # Read the report back from disk and display it
    print(heading)
    with open(output_file, 'r') as report:
        print(report.read())
    return labels, results


# FACTOID Evaluation
factoid_output_file = "factoid_evaluation_results.txt"
factoid_labels, factoid_results = _evaluate_and_report(
    factoid_y_test, factoid_y_test_pred, factoid_output_file,
    "FACTOID Dataset Evaluation:",
)

# Georgetown Evaluation
georgetown_output_file = "georgetown_evaluation_results.txt"
georgetown_labels, georgetown_results = _evaluate_and_report(
    georgetown_y_test, georgetown_y_test_pred, georgetown_output_file,
    "\nGeorgetown Dataset Evaluation:",
)

FACTOID Dataset Evaluation:
Model Evaluation Metrics
-------------------------
Accuracy: 0.5751
Macro Precision: 0.4966
Macro Recall: 0.3589
Macro F1-Score: 0.3106

Per-Class Metrics:
Class -1.0: Precision=0.5831, Recall=0.9529, F1-Score=0.7235
Class 0.0: Precision=0.4868, Recall=0.1062, F1-Score=0.1743
Class 1.0: Precision=0.4197, Recall=0.0177, F1-Score=0.0340

Confusion Matrix:
        -1.0    0.0   1.0
-1.0  179939   8606   291
 0.0  104606  12473   421
 1.0   24053   4541   515


Georgetown Dataset Evaluation:
Model Evaluation Metrics
-------------------------
Accuracy: 0.6788
Macro Precision: 0.6919
Macro Recall: 0.5711
Macro F1-Score: 0.6009

Per-Class Metrics:
Class -1.0: Precision=0.7368, Recall=0.4118, F1-Score=0.5283
Class 0.0: Precision=0.6721, Recall=0.8913, F1-Score=0.7664
Class 1.0: Precision=0.6667, Recall=0.4103, F1-Score=0.5079

Confusion Matrix:
      -1.0   0.0   1.0
-1.0    14    18     2
 0.0     4    82     6
 1.0     1    22    16

