In [190]:
########################################
# STEP 0: import libraries
########################################
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer 
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
import sklearn.tree
import sklearn.ensemble
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder
import nltk
import pandas as pd
import sklearn.datasets
import sklearn.decomposition
import sklearn.discriminant_analysis
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neural_network
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.preprocessing
import sklearn.random_projection
import sklearn.tree
import sklearn.svm
nltk.download('vader_lexicon') # Should print 3.9.1 or another recent version




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/avagrey/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [191]:
###reading in initial###
df = pd.read_csv(("~/Downloads/train_data.csv"))
df.head(1)

Unnamed: 0,X,fullText,candidate_mentioned,day,July,August,September,October,November,likes,retweets,views,comments,engagement_rate,id,Candidate,direction,Sentiment
0,6232,i think what people aren’t taking about enough...,Trump,14,0,1,0,0,0,2.5e-05,1e-05,2.3e-05,1.9e-05,0.020858,1,trump,indirect,negative


In [192]:
########################################
# SUB-STEP: Prepare the dataset
########################################
analyzer = SentimentIntensityAnalyzer()

def get_vader_score(text):
    return analyzer.polarity_scores(text)['compound']

df['vader_score'] = df['fullText'].apply(get_vader_score)


def sentiment_match(vader_score, label):
    if vader_score > 0.05:
        vader_label = "positive"
    elif vader_score < -0.05:
        vader_label = "negative"
    else:
        vader_label = "neutral"  # Ensure consistency with manual labels

    return vader_label == label

df['vader_match'] = df.apply(lambda row: sentiment_match(row['vader_score'], row['Sentiment']), axis=1)

match_rate = df['vader_match'].mean()
print(f"VADER matches manual labels {match_rate*100:.2f}% of the time.")


df['Sentiment'] = df['Sentiment'].str.strip().str.lower()  # Remove spaces & lowercase
candidate_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}
df['Sentiment'] = df['Sentiment'].map(candidate_mapping)
df['Sentiment'] = df['Sentiment'].fillna(1).astype(int)
df = pd.get_dummies(df, columns=['Candidate'], prefix='Candidate')

VADER matches manual labels 45.80% of the time.


In [193]:
########################################
# STEP 1: Process the dataset
########################################
import numpy as np
import pandas as pd
from textblob import TextBlob

# 1. Linguistic & Lexical Features
df['avg_word_length'] = df['fullText'].apply(lambda x: np.mean([len(w) for w in x.split()]) if x.split() else 0)
df['n_unique_words'] = df['fullText'].apply(lambda x: len(set(x.split())))
df['text_length'] = df['fullText'].apply(len)
df['word_count'] = df['fullText'].apply(lambda x: len(x.split()))
df['n_negations'] = df['fullText'].str.count(r'\b(not|no|never|don\'t|doesn\'t|can\'t|won\'t|isn\'t|ain\'t)\b')

# 2. Tone & Emotion Features
df['textblob_polarity'] = df['fullText'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['n_emojis'] = df['fullText'].str.count(r'[^\w\s,]')

# 3. Political Lexicon Features
support_words = ['vote', 'support', 'endorse', 'leader', 'elect', '💙']
attack_words = ['fraud', 'corrupt', 'criminal', 'lie', 'scandal', 'rigged', 'communist']

df['n_support_words'] = df['fullText'].apply(lambda x: sum(word in x.lower() for word in support_words))
df['n_attack_words'] = df['fullText'].apply(lambda x: sum(word in x.lower() for word in attack_words))

# 4. Engagement Tier (optional — if engagement_rate exists)
if 'engagement_rate' in df.columns:
    df['engagement_tier'] = pd.qcut(df['engagement_rate'], q=4, labels=False, duplicates='drop')  # 0 = low, 3 = viral

# 5. Optional: Direction-Sentiment Conflict Feature
# (Only useful if you have 'direction' labels and want to model indirect/direct)

# Done! Check shape and sample output
print("Enhanced df shape:", df.shape)
df.head(3)


df_model = df.drop(columns = ['vader_match', 'X', 'candidate_mentioned'])

print(f"df_model.shape={df_model.shape}")

df_model.head(5)


Enhanced df shape: (500, 32)
df_model.shape=(500, 29)


Unnamed: 0,fullText,day,July,August,September,October,November,likes,retweets,views,...,avg_word_length,n_unique_words,text_length,word_count,n_negations,textblob_polarity,n_emojis,n_support_words,n_attack_words,engagement_tier
0,i think what people aren’t taking about enough...,14,0,1,0,0,0,2.5e-05,1e-05,2.3e-05,...,4.454545,31,179,33,1,-0.216667,2,1,0,0
1,i can’t believe trump is really launching a pr...,13,0,0,0,1,0,0.000208,5.2e-05,0.000139,...,4.777778,9,51,9,0,0.2,1,0,1,0
2,trump on the radio show sid & friends in the m...,7,0,0,0,1,0,2.4e-05,6.2e-05,5.4e-05,...,4.207547,48,275,53,1,0.389286,11,1,0,0
3,most of these trump supporting men are lonely ...,21,0,0,1,0,0,2.1e-05,1e-05,3e-06,...,4.451613,30,169,31,0,-0.091071,0,1,0,2
4,i think weve all confused jerry brown with wil...,9,0,1,0,0,0,0.000178,0.000146,7.1e-05,...,4.357143,36,224,42,0,-0.277778,2,0,1,1


In [194]:
########################################
# STEP 2: Apply "non-learned" data transformations
########################################
vectorizer = TfidfVectorizer(max_features=500)
X_model_tfidf = vectorizer.fit_transform(df_model['fullText'])

# Step 3: Convert to DataFrame
tfidf_train_df = pd.DataFrame(X_model_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Step 4: Remove 'fullText' and 'sentiment' before merging
df_model_features = df_model.drop(columns=['fullText', 'Sentiment', 'direction', 'id'])

# Step 5: Ensure alignment by resetting index
x_train = pd.concat([tfidf_train_df.reset_index(drop=True), df_model_features.reset_index(drop=True)], axis=1)

# Step 6: Define Target Variable
y_train = df_model['Sentiment'].reset_index(drop=True)

# Step 7: Verify Final Data
print("Final x_train shape:", x_train.shape)  # Should match (90, 1018) if 18 + 1000 features
print("Final y_train shape:", y_train.shape)
print(x_train.head())  # or print(df_train.head()) to check your dataset
print(y_train.head())

Final x_train shape: (500, 525)
Final y_train shape: (500,)
    10      2016  2020  2024  2025  abortion     about  absolutely  actually  \
0  0.0  0.269745   0.0   0.0   0.0       0.0  0.137932         0.0       0.0   
1  0.0  0.000000   0.0   0.0   0.0       0.0  0.000000         0.0       0.0   
2  0.0  0.000000   0.0   0.0   0.0       0.0  0.000000         0.0       0.0   
3  0.0  0.000000   0.0   0.0   0.0       0.0  0.157115         0.0       0.0   
4  0.0  0.000000   0.0   0.0   0.0       0.0  0.000000         0.0       0.0   

   administration  ...  avg_word_length  n_unique_words  text_length  \
0             0.0  ...         4.454545              31          179   
1             0.0  ...         4.777778               9           51   
2             0.0  ...         4.207547              48          275   
3             0.0  ...         4.451613              30          169   
4             0.0  ...         4.357143              36          224   

   word_count  n_negations

In [195]:
########################################
# STEP 3: Create train/test sets
########################################
train_ratio = 0.70
validation_ratio = 0.15
test_ratio = 0.15

X_train, X_test, Y_train, Y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=43, stratify=y_train  # Maintains class balance
)

# Further split training into train/validation
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.15, random_state=43, stratify=Y_train
)

# Print new distributions
print("New Training Class Distribution:\n", Y_train.value_counts(normalize=True))
print("\nNew Validation Class Distribution:\n", Y_val.value_counts(normalize=True))
print("\nNew Test Class Distribution:\n", Y_test.value_counts(normalize=True))

# Print final shapes
print(f"Training set: X_train={X_train.shape}, Y_train={Y_train.shape}")
print(f"Validation set: X_val={X_val.shape}, Y_val={Y_val.shape}")
print(f"Test set: X_test={X_test.shape}, Y_test={Y_test.shape}")


New Training Class Distribution:
 Sentiment
-1    0.582353
 1    0.244118
 0    0.173529
Name: proportion, dtype: float64

New Validation Class Distribution:
 Sentiment
-1    0.583333
 1    0.250000
 0    0.166667
Name: proportion, dtype: float64

New Test Class Distribution:
 Sentiment
-1    0.59
 1    0.24
 0    0.17
Name: proportion, dtype: float64
Training set: X_train=(340, 525), Y_train=(340,)
Validation set: X_val=(60, 525), Y_val=(60,)
Test set: X_test=(100, 525), Y_test=(100,)


In [196]:
standardize = sklearn.preprocessing.StandardScaler(
    with_mean=True,
    with_std=True,
    )
standardize.fit(x_train) # all of these feature transformations have; => "learned"
x_train = standardize.transform(x_train)
X_train = standardize.transform(X_train)
X_test = standardize.transform(X_test)
X_val = standardize.transform(X_val)
print(f"x_train.shape={x_train.shape}")

# scale the data to a finite range
scaler = sklearn.preprocessing.MaxAbsScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)
print(f"x_train.shape={x_train.shape}")

x_train.shape=(500, 525)
x_train.shape=(500, 525)


In [197]:
########################################
# STEP 5: Train a model
########################################

model1 = sklearn.tree.DecisionTreeClassifier(
   criterion='entropy',
   max_depth=2,
   min_samples_split=30,
   min_samples_leaf=3,
   max_features=None,
   max_leaf_nodes=5,
   random_state=42)
 
sentiment_model = sklearn.ensemble.AdaBoostClassifier(
   estimator=model1,
   n_estimators=55)


sentiment_model.fit(X_train, Y_train)

# most of our discussions in class about "error"
# accuracy is just 1 - error

# Report accuracy scores
validation_accuracy = sentiment_model.score(X_val, Y_val)
print(f"Validation Accuracy: {validation_accuracy:.4f}")

train_accuracy = sentiment_model.score(X_train, Y_train)
print(f"Train Accuracy: {train_accuracy:.4f}")

y_train_pred = sentiment_model.predict(X_train)
print("\nTrain Set Metrics:")
print(classification_report(Y_train, y_train_pred))

y_val_pred = sentiment_model.predict(X_val)
print("\nValidation Set Metrics:")
print(classification_report(Y_val, y_val_pred))

Validation Accuracy: 0.8000
Train Accuracy: 0.9853

Train Set Metrics:
              precision    recall  f1-score   support

          -1       0.99      0.98      0.99       198
           0       1.00      1.00      1.00        59
           1       0.95      0.99      0.97        83

    accuracy                           0.99       340
   macro avg       0.98      0.99      0.99       340
weighted avg       0.99      0.99      0.99       340


Validation Set Metrics:
              precision    recall  f1-score   support

          -1       0.81      0.86      0.83        35
           0       1.00      1.00      1.00        10
           1       0.62      0.53      0.57        15

    accuracy                           0.80        60
   macro avg       0.81      0.80      0.80        60
weighted avg       0.79      0.80      0.80        60





In [198]:
########################################
# STEP 6: Evaluate on test set
########################################

# WARNING:
# this code should be run only once;
# after the hyperparameters have been decided based on the validation performance,
# then the False can be changed to True to run this code
if True:
    sentiment_model.fit(X_train, Y_train)
    y_test_pred = sentiment_model.predict(X_test)

    test_accuracy = sentiment_model.score(X_test, Y_test)
    print(f"Test Accuracy: {test_accuracy:.4f}")

    print("Test Report:")
    print(classification_report(Y_test, y_test_pred))
    print("Unique predictions in y_test_pred:", np.unique(y_test_pred))






Test Accuracy: 0.8100
Test Report:
              precision    recall  f1-score   support

          -1       0.82      0.86      0.84        59
           0       1.00      1.00      1.00        17
           1       0.62      0.54      0.58        24

    accuracy                           0.81       100
   macro avg       0.81      0.80      0.81       100
weighted avg       0.80      0.81      0.81       100

Unique predictions in y_test_pred: [-1  0  1]
