In [27]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

In [28]:
data = pd.read_csv('Untitled.ipynb.csv');

In [29]:
sentences = data['text']

In [30]:
data.count()

text     9414
label    9414
dtype: int64

In [31]:
labels = data['label']

In [32]:
label[0]

np.int64(0)

In [33]:
# Split
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2)

# Vectorize
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [34]:

# Train model
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1667
           1       0.96      0.39      0.56       216

    accuracy                           0.93      1883
   macro avg       0.94      0.70      0.76      1883
weighted avg       0.93      0.93      0.91      1883



In [35]:
clf = LogisticRegression(class_weight="balanced", max_iter=200)
clf.fit(X_train_tfidf, y_train)
# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95      1667
           1       0.59      0.79      0.67       216

    accuracy                           0.91      1883
   macro avg       0.78      0.86      0.81      1883
weighted avg       0.93      0.91      0.92      1883



In [37]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

clf = LogisticRegression(max_iter=200)
clf.fit(X_resampled, y_resampled)
# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.94      0.96      1667
           1       0.64      0.77      0.70       216

    accuracy                           0.92      1883
   macro avg       0.80      0.86      0.83      1883
weighted avg       0.93      0.92      0.93      1883



In [38]:
y_probs = clf.predict_proba(X_test_tfidf)[:,1]
y_pred_adjusted = (y_probs >= 0.3).astype(int)  # threshold = 0.3
print(classification_report(y_test, y_pred_adjusted))

              precision    recall  f1-score   support

           0       0.98      0.87      0.92      1667
           1       0.46      0.86      0.60       216

    accuracy                           0.87      1883
   macro avg       0.72      0.86      0.76      1883
weighted avg       0.92      0.87      0.89      1883



In [39]:
#using n grams

In [40]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=50000)

In [41]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [42]:
#logistic regression + ngrams
# Train model
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1667
           1       0.93      0.35      0.51       216

    accuracy                           0.92      1883
   macro avg       0.92      0.67      0.73      1883
weighted avg       0.92      0.92      0.91      1883



In [44]:
#balanced logistic regression + ngrams"