In [1]:
import pandas as pd
import numpy as np
from scipy.special import softmax
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Remove every pandas display limit so wide frames and long text cells render in full.
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

In [5]:
# Read the Reddit table first, then the three chunk tables (train / dev / test),
# all from CSV files in the current working directory.
df_red = pd.read_csv('08_reddit.csv')
df_chunks_train, df_chunks_dev, df_chunks_test = map(
    pd.read_csv,
    ('df_chunks_train.csv', 'df_chunks_dev.csv', 'df_chunks_test.csv'),
)


In [6]:
# Show one row to inspect the columns of the training chunk table.
df_chunks_train.head(1)

Unnamed: 0,text,label,path,chunk_pos,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10
0,"ike we could go on forever, but I know we've got the both of us have something coming up in 15 minutes. "", ""What's what's you",1,/www.buzzsprout.com/ie/httpswww.buzzsprout.com1636013710918c19inriindiemediasexpandingroleinmainstreamnewscoveragewithconvergencerisrichardasinof.mp3MERGED,257,news,daily,politics,commentary,,,,,,


In [7]:
# Pool the three chunk tables into a single frame (original row labels are kept).
chunk_splits = [df_chunks_train, df_chunks_dev, df_chunks_test]
df_chunks = pd.concat(chunk_splits)


In [8]:
# Keep a seeded 15% random subsample of the pooled chunks, then renumber rows from 0.
# Note: this rebinds `df_chunks`, so re-running the cell samples the sample again.
sampled_chunks = df_chunks.sample(frac=0.15, random_state=1)
df_chunks = sampled_chunks.reset_index(drop=True)


In [9]:
# (rows, columns) of the subsampled chunk frame.
df_chunks.shape

(26255, 14)

In [10]:
# Stack the Reddit rows underneath the sampled chunk rows.
frames_to_stack = (df_chunks, df_red)
df_chunks = pd.concat(frames_to_stack)

In [12]:
# (rows, columns) after the Reddit rows were appended.
df_chunks.shape

(39109, 23)

In [13]:
# Label-0 rows: draw a seeded sample of exactly 16000 and renumber from 0.
is_label_zero = df_chunks['label'] == 0
df_left = (
    df_chunks[is_label_zero]
    .sample(n=16000, random_state=1)
    .reset_index(drop=True)
)


In [15]:
# Label-1 rows are all kept (no down-sampling) and renumbered from 0.
is_label_one = df_chunks['label'] == 1
df_right = df_chunks[is_label_one].reset_index(drop=True)


In [17]:
# Rebuild the working frame from the sampled label-0 rows and all label-1 rows.
class_frames = (df_left, df_right)
df_chunks = pd.concat(class_frames)

In [18]:
# (rows, columns) of the frame made of the sampled label-0 and all label-1 rows.
df_chunks.shape

(27370, 23)

In [20]:
# 70/30 train/test split with a fixed seed. `train_test_split` is already imported
# in the notebook's first (imports) cell, so it is not re-imported here.
# NOTE(review): the split is not stratified on `label` — confirm that is intended.
df_train, df_test = train_test_split(df_chunks, test_size=0.3, random_state=1)

In [21]:
# Pull the raw texts and their labels out of each split as plain Python lists.
X_train, y_train = df_train['text'].tolist(), df_train['label'].tolist()
X_test, y_test = df_test['text'].tolist(), df_test['label'].tolist()

In [22]:
# TF-IDF features over unigrams and bigrams, with a minimum document frequency of 5.
vectorizer = TfidfVectorizer(
    min_df=5,
    ngram_range=(1, 2),
)


In [23]:
# Fit the vectorizer on the training texts only, then apply that same fitted
# vectorizer to the test texts (the order of these two lines matters).
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [34]:
# Logistic-regression classifier: balanced class weights, C=3.0, up to 500
# solver iterations, fixed seed.
clf = LogisticRegression(
    C=3.0,
    class_weight="balanced",
    max_iter=500,
    random_state=1,
)


In [35]:
# Train the classifier on the vectorized training texts and their labels.
clf.fit(X_train_vectorized, y_train)

In [36]:

# --- F1 on both splits (binary average, i.e. scored on the positive class) ---
y_pred_train = clf.predict(X_train_vectorized)
print("F1 score on the train set:", f1_score(y_train, y_pred_train, average='binary'))

y_pred_test = clf.predict(X_test_vectorized)
print("F1 score on the test set:", f1_score(y_test, y_pred_test, average='binary'))

# --- Confusion-matrix breakdown on the test split ---
# Pin the label order explicitly: the matrix is then always 2x2 and laid out as
# [[tn, fp], [fn, tp]], so the four-way unpack cannot fail or silently change
# meaning when one class is absent from both y_test and y_pred_test.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()

tpr = tp / (tp + fn)  # True Positive Rate (Recall)
fpr = fp / (fp + tn)  # False Positive Rate
tnr = tn / (tn + fp)  # True Negative Rate (Specificity)
fnr = fn / (fn + tp)  # False Negative Rate

print(f"True Positives: {tp}")
print(f"False Positives: {fp}")
print(f"True Negatives: {tn}")
print(f"False Negatives: {fn}")
print(f"True Positive Rate: {tpr:.4f}")
print(f"False Positive Rate: {fpr:.4f}")
print(f"True Negative Rate: {tnr:.4f}")
print(f"False Negative Rate: {fnr:.4f}")

F1 score on the train set: 0.8599520383693047
F1 score on the test set: 0.5771889400921659
True Positives: 2004
False Positives: 1601
True Negatives: 3271
False Negatives: 1335
True Positive Rate: 0.6002
False Positive Rate: 0.3286
True Negative Rate: 0.6714
False Negative Rate: 0.3998


In [43]:
# Per-feature column sums of the vectorized test matrix, flattened to 1-D.
# (Despite the name, these are sums of the vectorizer's weights, not raw counts.)
term_counts = np.asarray(X_test_vectorized.sum(axis=0)).squeeze()

# Scale each coefficient in the first row of `clf.coef_` by log(total + 1).
coefficients = clf.coef_[0]
impact_scores = coefficients * np.log(term_counts + 1)

# Label every score with its feature name.
feature_names = vectorizer.get_feature_names_out()
impact_scores_series = pd.Series(data=impact_scores, index=feature_names)

# Sort once in descending order and slice the 100 entries at each end.
ranked_impact = impact_scores_series.sort_values(ascending=False)
highest_impact = ranked_impact.head(100)
lowest_impact = ranked_impact.tail(100)

In [None]:
# Take the counts in the headings from the series themselves so the labels always
# match the number of terms actually printed (they used to read "30" and "25").
print(f"{len(highest_impact)} Highest Impact Terms:\n", highest_impact)
print(f"\n{len(lowest_impact)} Lowest Impact Terms:\n", lowest_impact)