In [221]:
########################################
# STEP 0: import libraries
########################################import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer 
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
import sklearn.tree

In [222]:
###reading in initial###
df = pd.read_csv(("C:\\Users\\agrey25\\Downloads\\train_data.csv"))
df.head(1)

Unnamed: 0,X,fullText,day,Harris,Trump,July,August,September,October,November,likes,retweets,views,comments,engagement_rate,id,Candidate,direction,Sentiment
0,6232,i think what people aren’t taking about enough...,14,0,1,0,1,0,0,0,2.5e-05,1e-05,2.3e-05,1.9e-05,0.020858,1,trump,indirect,negative


In [223]:
########################################
# SUB-STEP: Prepare the dataset
########################################
analyzer = SentimentIntensityAnalyzer()

def get_vader_score(text):
    return analyzer.polarity_scores(text)['compound']

df['vader_score'] = df['fullText'].apply(get_vader_score)


def sentiment_match(vader_score, label):
    if vader_score > 0.05:
        vader_label = "positive"
    elif vader_score < -0.05:
        vader_label = "negative"
    else:
        vader_label = "neutral"  # Ensure consistency with manual labels

    return vader_label == label

df['vader_match'] = df.apply(lambda row: sentiment_match(row['vader_score'], row['Sentiment']), axis=1)

match_rate = df['vader_match'].mean()
print(f"VADER matches manual labels {match_rate*100:.2f}% of the time.")


df['Candidate'] = df['Candidate'].str.strip().str.lower()  # Remove spaces & lowercase
candidate_mapping = {'trump': 1, 'harris': 2, 'neither': 3}
df['Candidate'] = df['Candidate'].map(candidate_mapping)
df['Candidate'] = df['Candidate'].fillna(1).astype(int)
df.head(5)

VADER matches manual labels 45.80% of the time.


Unnamed: 0,X,fullText,day,Harris,Trump,July,August,September,October,November,...,retweets,views,comments,engagement_rate,id,Candidate,direction,Sentiment,vader_score,vader_match
0,6232,i think what people aren’t taking about enough...,14,0,1,0,1,0,0,0,...,1e-05,2.3e-05,1.9e-05,0.020858,1,1,indirect,negative,-0.4404,True
1,21769,i can’t believe trump is really launching a pr...,13,0,1,0,0,0,1,0,...,5.2e-05,0.000139,0.000889,0.035637,2,3,neutral,neutral,0.0,True
2,20243,trump on the radio show sid & friends in the m...,7,0,1,0,0,0,1,0,...,6.2e-05,5.4e-05,5.7e-05,0.01193,3,1,indirect,negative,0.9423,False
3,16181,most of these trump supporting men are lonely ...,21,0,1,0,0,1,0,0,...,1e-05,3e-06,1.9e-05,0.117333,4,1,indirect,negative,-0.7003,True
4,4854,i think weve all confused jerry brown with wil...,9,0,1,0,1,0,0,0,...,0.000146,7.1e-05,0.000227,0.053209,5,3,neutral,neutral,0.0516,False


In [224]:
########################################
# STEP 1: Process the dataset
########################################

df_model = df.drop(columns = ['vader_match', 'X', 'direction', 'Sentiment', 'Harris', 'Trump'])

print(f"df_model.shape={df_model.shape}") 

df_model.shape=(500, 15)


In [225]:
########################################
# STEP 2: Apply "non-learned" data transformations
########################################
vectorizer = TfidfVectorizer(max_features=500)
X_model_tfidf = vectorizer.fit_transform(df_model['fullText'])

# Step 3: Convert to DataFrame
tfidf_train_df = pd.DataFrame(X_model_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Step 4: Remove 'fullText' and 'sentiment' before merging
df_model_features = df_model.drop(columns=['fullText', 'Candidate'])

# Step 5: Ensure alignment by resetting index
x_train = pd.concat([tfidf_train_df.reset_index(drop=True), df_model_features.reset_index(drop=True)], axis=1)

# Step 6: Define Target Variable
y_train = df_model['Candidate'].reset_index(drop=True)

# Step 7: Verify Final Data
print("Final x_train shape:", x_train.shape)  # Should match (90, 1018) if 18 + 1000 features
print("Final y_train shape:", y_train.shape)
print(x_train.head())  # or print(df_train.head()) to check your dataset
print(y_train.head())

Final x_train shape: (500, 513)
Final y_train shape: (500,)
    10      2016  2020  2024  2025  abortion     about  absolute  absolutely  \
0  0.0  0.269745   0.0   0.0   0.0       0.0  0.137932       0.0         0.0   
1  0.0  0.000000   0.0   0.0   0.0       0.0  0.000000       0.0         0.0   
2  0.0  0.000000   0.0   0.0   0.0       0.0  0.000000       0.0         0.0   
3  0.0  0.000000   0.0   0.0   0.0       0.0  0.157115       0.0         0.0   
4  0.0  0.000000   0.0   0.0   0.0       0.0  0.000000       0.0         0.0   

   actually  ...  September  October  November     likes  retweets     views  \
0       0.0  ...          0        0         0  0.000025  0.000010  0.000023   
1       0.0  ...          0        1         0  0.000208  0.000052  0.000139   
2       0.0  ...          0        1         0  0.000024  0.000062  0.000054   
3       0.0  ...          1        0         0  0.000021  0.000010  0.000003   
4       0.0  ...          0        0         0  0.000178  0

In [226]:
########################################
# STEP 3: Create train/test sets
########################################

validation_ratio = 0.25  # 25% of data for validation

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=validation_ratio, random_state=0
)

# Check shapes
print(f"Training Set Size: {X_train.shape}, {y_train.shape}")
print(f"Validation Set Size: {X_val.shape}, {y_val.shape}")

Training Set Size: (375, 513), (375,)
Validation Set Size: (125, 513), (125,)


In [227]:
########################################
# STEP 4: Apply "learned" data transformations
########################################

In [228]:
########################################
# STEP 5: Train a model
########################################

model = sklearn.tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=2,
    min_samples_leaf=10,
    max_features=None,
    max_leaf_nodes=25,
    random_state=42,
    )
model = sklearn.ensemble.AdaBoostClassifier(
    estimator=model,
    n_estimators=50,
    )

model.fit(X_train, y_train)

# most of our discussions in class about "error"
# accuracy is just 1 - error

# report validation accuracy
validation_accuracy = model.score(X_val, y_val)
print(f"validation_accuracy={validation_accuracy:0.4f}")
train_accuracy = model.score(X_train, y_train)
print(f"train_accuracy={train_accuracy:0.4f}")

validation_accuracy=0.7520
train_accuracy=0.8987
