In [29]:
# libraries
import pandas as pd
import os
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from tokenizers import Tokenizer

import sys
sys.path.append("../scripts")
import functions as f

Load the .csv file that has all manual labels applied.

In [30]:
# Location of the manually labelled sample. This file is both the input of
# this cell and the file the cleaned frame is written back to at the end.
labelled_csv_path = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/master/reddit_labelled-sample.csv"

reddit = pd.read_csv(labelled_csv_path)
# Keep only rows that carry a manual label
reddit = reddit.dropna(subset=['label'])
# Project helper from the `functions` module (../scripts);
# presumably casts columns to their intended dtypes — confirm there.
reddit = f.reddit_dtypes(reddit)

# Remove the word 'deleted' from the 'text' column.
# The word boundaries stop the replacement from cutting 'deleted' out of the
# middle of longer words (e.g. 'undeleted'), which a plain substring replace did.
reddit['text'] = reddit['text'].str.replace(r'\bdeleted\b', '', regex=True)

# Save the dataframe as a pickle object
pickle_folder = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/pickle"
pickle_path_reddit = os.path.join(pickle_folder, "reddit_labelled-sample.pkl")
joblib.dump(reddit, pickle_path_reddit)

# NOTE(review): this overwrites the file that was read above, so the rows
# without a label and the original text are gone from disk after the first run.
# Both paths are absolute and machine-specific.
reddit.to_csv(labelled_csv_path, index=False)

In [31]:
# target: the manual labels; features: the post text run through the project's
# token_and_lemmatize_nb helper (one call per row)
y = reddit["label"]
X = reddit["text"].apply(f.token_and_lemmatize_nb)

# sanity check: both must have the same number of rows
print(f"Length of X: {len(X)}, Length of y: {len(y)}")

Length of X: 1598, Length of y: 1598


In [32]:
# vectorize: TF-IDF weights over the processed text, vocabulary capped at 1000 terms
# NOTE(review): the vectorizer is fit on every row before the train/test split
# that follows, so held-out documents contribute to the vocabulary and IDF
# weights. Consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=1000)
vectorizer.fit(X)
X_vectorized = vectorizer.transform(X)

print(f"X shape: {X_vectorized.shape}")

X shape: (1598, 1000)


In [33]:
# Persist the fitted TF-IDF vectorizer next to the matrix: its vocabulary and
# IDF weights are needed to transform any new text into the same feature space
# as X_vectorized, and a later cell calls fit_transform on `vectorizer` again.
vectorizer_pickle_path = os.path.join(pickle_folder, "tfidf_vectorizer.pkl")
joblib.dump(vectorizer, vectorizer_pickle_path)

# Export X_vectorized as a pickle object (kept as the last expression so the
# cell still displays the path of the written matrix file)
pickle_path = os.path.join(pickle_folder, "X_vectorized.pkl")
joblib.dump(X_vectorized, pickle_path)

['/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/pickle/X_vectorized.pkl']

In [34]:
# encode target: map the string labels to integer class ids
encoder = LabelEncoder()
# Encode from the source column rather than from `y` itself so the cell can be
# re-run safely: re-encoding an already-encoded `y` would fit the encoder on
# integers and replace the string names in encoder.classes_.
y = encoder.fit_transform(reddit["label"])
print(f"Classes: {encoder.classes_}")

Classes: ['negative' 'neutral' 'positive']


In [35]:
# split data: 80/20 hold-out, stratified on the encoded label so both parts
# keep the same class proportions; fixed seed for reproducibility
split_parts = train_test_split(
    X_vectorized,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts
print(f"Train shape: {X_train.shape}", f"Test shape: {X_test.shape}")

# show the share of each encoded class in both parts
for heading, labels in (
    ("Training set class distribution:", y_train),
    ("Test set class distribution:", y_test),
):
    print(heading)
    print(pd.Series(labels).value_counts(normalize=True))

Train shape: (1278, 1000) Test shape: (320, 1000)
Training set class distribution:
1    0.418623
2    0.404538
0    0.176839
Name: proportion, dtype: float64
Test set class distribution:
1    0.41875
2    0.40625
0    0.17500
Name: proportion, dtype: float64


In [36]:
# candidate values for the MultinomialNB smoothing parameter `alpha`
param_grid = {
    "alpha": [0.001, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0]
}

# base estimator to tune
nb = MultinomialNB()

# exhaustive search over the grid, scored by accuracy with 5-fold cross-validation
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)

# report the winning setting and its mean cross-validated score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'alpha': 0.5}
Best score: 0.6392830882352941


In [37]:
# Model with the best parameters. The grid search was created with the default
# refit=True, so best_estimator_ has already been refit on all of X_train with
# the best alpha; a second .fit() call would only repeat that work.
custom_nb = grid_search.best_estimator_

# predict on the held-out test split
y_pred = custom_nb.predict(X_test)

# decode integer class ids back to the original label strings
custom_nb_pred = encoder.inverse_transform(y_pred)
y_test_decoded = encoder.inverse_transform(y_test)

accuracy = accuracy_score(y_test_decoded, custom_nb_pred)
print(f"Test set accuracy: {accuracy}")

# Detailed performance metrics (zero_division=0 reports 0 instead of warning
# when a class receives no predictions)
print("Classification Report:")
print(classification_report(y_test_decoded, custom_nb_pred, 
                            target_names=encoder.classes_,
                            zero_division=0))

Test set accuracy: 0.621875
Classification Report:
              precision    recall  f1-score   support

    negative       0.50      0.09      0.15        56
     neutral       0.68      0.70      0.69       134
    positive       0.58      0.77      0.66       130

    accuracy                           0.62       320
   macro avg       0.59      0.52      0.50       320
weighted avg       0.61      0.62      0.58       320



Test whether stratified sampling (drawing the same number of rows from each label) creates a stronger model.

In [38]:
# Stratified sampling to keep an even distribution of levels in the 'label' column:
# every label is down-sampled to the size of the rarest label.
min_count = reddit["label"].value_counts().min()
# Sample each label group on its own with the same seed and stack the results.
# This draws the same rows as the former groupby(...).apply(...) call, but keeps
# 'label' as an ordinary column and avoids applying a function over the grouping
# column (the call that the warning in this cell's old output pointed at).
reddit_stratified = pd.concat(
    [
        label_rows.sample(min_count, random_state=42)
        for _, label_rows in reddit.groupby("label", observed=True)
    ],
    ignore_index=True,
)
print(reddit_stratified["label"].value_counts())

# features and target
X_strat = reddit_stratified["text"].apply(f.token_and_lemmatize_nb)
# This experiment gets its own encoder and vectorizer. Refitting the shared
# `encoder` / `vectorizer` here would silently replace the fitted objects that
# belong to `custom_nb`, and `encoder` is saved to disk later in the notebook.
strat_encoder = LabelEncoder()
y_strat = strat_encoder.fit_transform(reddit_stratified["label"])

print(f"Length of X: {len(X_strat)}, Length of y: {len(y_strat)}")

# vectorize (same settings as the main vectorizer)
strat_vectorizer = TfidfVectorizer(max_features=1000)
X_strat_vectorized = strat_vectorizer.fit_transform(X_strat)

print(f"X shape: {X_strat_vectorized.shape}")

# split data
X_strat_train, X_strat_test, y_strat_train, y_strat_test = train_test_split(
    X_strat_vectorized, y_strat, 
    test_size=0.2,
    random_state=572, 
    stratify=y_strat
)

# grid search w/ 5-fold cross-validation (GridSearchCV works on a clone of
# `nb`, so the shared estimator object itself is not fitted here)
grid_search_strat = GridSearchCV(
    estimator=nb, 
    param_grid=param_grid, 
    cv=5, 
    scoring="accuracy"
)

# fit grid search
grid_search_strat.fit(X_strat_train, y_strat_train)

# best parameters and score
print(f"Best parameters: {grid_search_strat.best_params_}")
print(f"Best score: {grid_search_strat.best_score_}")

# Model with the best parameters: already refit on the whole training split by
# the grid search (default refit=True), so no extra .fit() call is needed.
strat_nb = grid_search_strat.best_estimator_

# predict
y_strat_pred = strat_nb.predict(X_strat_test)

# decode
strat_pred = strat_encoder.inverse_transform(y_strat_pred)

y_strat_test_decoded = strat_encoder.inverse_transform(y_strat_test)

accuracy = accuracy_score(y_strat_test_decoded, strat_pred)
print(f"Stratified set accuracy: {accuracy}")

# Detailed performance metrics
print("Classification Report:")
print(
    classification_report(
        y_strat_test_decoded, strat_pred, 
        target_names=strat_encoder.classes_, 
        zero_division=0
    )
)

label
negative    282
neutral     282
positive    282
Name: count, dtype: int64


  reddit_stratified = reddit.groupby("label", observed=True).apply(


Length of X: 846, Length of y: 846
X shape: (846, 1000)
Best parameters: {'alpha': 2.0}
Best score: 0.578355119825708
Stratified set accuracy: 0.6
Classification Report:
              precision    recall  f1-score   support

    negative       0.53      0.62      0.57        56
     neutral       0.72      0.58      0.64        57
    positive       0.59      0.60      0.59        57

    accuracy                           0.60       170
   macro avg       0.61      0.60      0.60       170
weighted avg       0.61      0.60      0.60       170



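Because stratifying reduces the sample size so much (from 1,598 to 846 rows), training on a more balanced target does not strengthen the model overall: test accuracy goes from 0.62 to 0.60, although recall on the negative class rises from 0.09 to 0.62.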

In [39]:
# Apply VADER through the project helper f.vader_analysis, for comparison with
# the naive Bayes model.
# Note: the whole labelled frame `reddit` is passed here, not the held-out test
# split, even though the report printed by this cell is headed "Test set accuracy".
# NOTE(review): the return value is presumably `reddit` with the VADER results
# attached — confirm in the functions module.
reddit_with_vader = f.vader_analysis(reddit)

Test set accuracy: 0.36
Classification Report:
              precision    recall  f1-score   support

    negative       0.20      0.47      0.28       282
     neutral       0.41      0.12      0.18       669
    positive       0.49      0.57      0.53       647

    accuracy                           0.36      1598
   macro avg       0.37      0.39      0.33      1598
weighted avg       0.40      0.36      0.34      1598



VADER reaches only 0.36 accuracy on the labelled data, so I am moving forward with my customized Naive Bayes model!

In [40]:
# Destination folder and file names for the persisted artifacts
models_folder = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models"
model_path = os.path.join(models_folder, "custom_nb_model.pkl")
encoder_path = os.path.join(models_folder, "nb_label_encoder.pkl")

# Write the tuned naive Bayes model first, then the label encoder; the encoder
# dump stays last so the cell displays the path it returns
joblib.dump(custom_nb, model_path)
joblib.dump(encoder, encoder_path)

['../models/nb_label_encoder.pkl']