In [1]:
# libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler

import sys
sys.path.append('../scripts')
import functions as f

About 400 Reddit posts were labelled manually.

In [2]:
# CSV holding the Reddit posts and their label columns
# NOTE(review): absolute local path — only resolves on the original author's machine
labeled_csv = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/train/reddit_labeled.csv"
reddit = pd.read_csv(labeled_csv)

In [3]:
# number of rows carrying each value of `label`
label_counts = reddit['label'].value_counts()
print(label_counts)

label
neutral     222
positive    158
negative     47
Name: count, dtype: int64


In [4]:
# row-normalised cross-tabulation of `label` against each of the other label columns
for other_label in ("label_h", "label_k"):
    print(pd.crosstab(reddit["label"], reddit[other_label], normalize='index'))

label_h          0         1         2
label                                 
negative  0.978723  0.021277  0.000000
neutral   0.837838  0.157658  0.004505
positive  0.955696  0.031646  0.012658
label_k          0         1         2
label                                 
negative  0.595745  0.042553  0.361702
neutral   0.310811  0.256757  0.432432
positive  0.588608  0.044304  0.367089


In [5]:
# apply the project helper that sets the column dtypes
# (presumably casts the label columns to categoricals — confirm in scripts/functions.py)
reddit = f.reddit_dtypes(reddit)

# inspect the resulting dtype of every column
column_dtypes = reddit.dtypes
print(column_dtypes)

subreddit    category
post_id        object
text           object
score           int64
year         category
month        category
day          category
label_h      category
label_k      category
label        category
dtype: object


Subsample to balance the classes, and apply cross-validation to get a more reliable estimate of performance. Random forests can work with small datasets, but if the sample is too small to adequately capture larger patterns, the model can underperform.

In [6]:
# rows that have a `label` form the training set; rows without one are the set to predict
feature_cols = ["label_h", "label_k"]
has_label = reddit["label"].notna()

train = reddit[has_label]
test = reddit[~has_label]

# features are the two other label columns; the target is `label`
X_train, y_train = train[feature_cols], train["label"]
X_test = test[feature_cols]

In [7]:
# encode the label values as integers for the classifier.
# The encoder is fitted on train["label"] rather than on y_train itself so the
# cell is idempotent: re-running it must not fit the encoder on the
# already-encoded integers, which would make encoder.inverse_transform return
# integer codes instead of the original label values later on.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train["label"])

In [8]:
# under-sample to reduce the majority classes (seed fixed for reproducibility)
rus = RandomUnderSampler(random_state=42)
resampled = rus.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled

In [9]:
# build the random forest and score it with 5-fold cross-validation on the resampled data
clf = RandomForestClassifier(random_state=42)
cv_scores = cross_val_score(
    clf,
    X_resampled,
    y_resampled,
    scoring='accuracy',
    cv=5,
)

mean_accuracy = np.mean(cv_scores)
print(f"Mean CV accuracy: {mean_accuracy}")

Mean CV accuracy: 0.3830049261083744


In [10]:
# fit the forest on the whole resampled training set
clf.fit(X_resampled, y_resampled)

# predict the rows without a label and map the integer codes back to label values
predicted = clf.predict(X_test)
predicted_labels = encoder.inverse_transform(predicted)

# store the predictions in `label_rf`, only on the rows where `label` is missing
unlabeled_rows = reddit["label"].isna()
reddit.loc[unlabeled_rows, "label_rf"] = predicted_labels


In [11]:
# where `label_rf` is still missing, fall back to the value in `label`
manual_labels = reddit['label']
reddit['label_rf'] = reddit['label_rf'].fillna(manual_labels)

In [12]:
# write the frame (now including `label_rf`) to disk without the index column
# NOTE(review): absolute local path — only resolves on the original author's machine
predicted_csv = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/train/reddit_predicted.csv"
reddit.to_csv(predicted_csv, index=False)