In [1]:
import os
import pandas as pd

# Directory holding the training CSVs. The POSTMODERATION_DATA_DIR environment
# variable overrides it so the notebook can run on another machine; when the
# variable is unset the original local path is used.
folder_path = os.environ.get(
    "POSTMODERATION_DATA_DIR",
    r"C:\Users\raned\Documents\Github\POSTMODERATION\TrainingData",
)

# os.listdir returns entries in arbitrary order, so sort (case-insensitively)
# to keep the printed listing stable between runs and platforms.
print("Files in TrainingData folder:")
print(sorted(os.listdir(folder_path), key=str.lower))

# Load the dataset that the later cells train on.
file_path = os.path.join(folder_path, "combined_dataset_emoji.csv")
training_data_df = pd.read_csv(file_path)




Files in TrainingData folder:
['combined_dataset.csv', 'combined_dataset_emoji.csv', 'combined_dataset_old.csv', 'HateSpeechDatasetBalanced.csv', 'labeled_data.csv']


In [2]:
# 1. Imports
import re

import emoji
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# Seed shared by the train/test split and the classifier so reruns are reproducible.
RANDOM_STATE = 42

# Replace missing CleanContent values with empty strings before vectorizing.
training_data_df['CleanContent'] = training_data_df['CleanContent'].fillna('')

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    training_data_df['CleanContent'],
    training_data_df['Label'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# TF-IDF + logistic regression as ONE estimator. Keeping the vectorizer inside
# the pipeline means every cross-validation fold fits its vocabulary and IDF
# weights on that fold's training portion only; fitting it on all of X_train
# up front would let validation-fold text influence model selection.
# (Trade-off: the vectorizer is refit for every CV fit, so the search is slower.)
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)),
])

# Hyper-parameters to search, expressed on the classifier itself.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'class_weight': [None, 'balanced'],
    'solver': ['liblinear'],
}
# Pipelines address step parameters as "<step>__<param>".
pipeline_param_grid = {f'clf__{name}': values for name, values in param_grid.items()}

grid = GridSearchCV(text_clf, pipeline_param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

# GridSearchCV refits the best pipeline on all of X_train. Expose its two steps
# under the names the rest of the notebook uses.
vectorizer = grid.best_estimator_.named_steps['tfidf']
best_log_reg = grid.best_estimator_.named_steps['clf']
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Evaluate on the held-out split.
y_pred = best_log_reg.predict(X_test_tfidf)

# Strip the "clf__" step prefix so the report shows plain parameter names.
best_params = {name.split('__', 1)[1]: value for name, value in grid.best_params_.items()}

print("Best Params:", best_params)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['Not Malicious', 'Malicious']))



Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Params: {'C': 1, 'class_weight': None, 'solver': 'liblinear'}
Accuracy: 0.8000669792364367
Classification Report:
                precision    recall  f1-score   support

Not Malicious       0.80      0.79      0.79      2909
    Malicious       0.80      0.81      0.81      3063

     accuracy                           0.80      5972
    macro avg       0.80      0.80      0.80      5972
 weighted avg       0.80      0.80      0.80      5972



In [3]:
# Display names for the two classes; looked up by the predicted label value.
labels = ['Not Malicious', 'Malicious']

# Hand-written probe sentences for a quick sanity check of the trained model.
new_texts = [
    "Fuck this place.", 
    "What a beautiful day, feeling grateful!",
    "I hate it here.",
    "You're a piece of shit",
    "Dumb ass bitch",
    "you’re such a dumbass 🤡 nobody wants you around 💩",  # malicious examples containing emojis
    "go back to your country 🖕",
    "You're such a clown 🤡 lol",
    "I hate you"
]

# NOTE(review): the model was trained on the 'CleanContent' column, while these
# raw strings go straight into the vectorizer. Confirm whether they should get
# the same cleaning first.

# Vectorize with the already-fitted TF-IDF vectorizer, then classify.
new_texts_tfidf = vectorizer.transform(new_texts)
predictions = best_log_reg.predict(new_texts_tfidf)

# Print each sentence next to its predicted class name.
for idx, text in enumerate(new_texts):
    pred = predictions[idx]
    print(f"Text: {text}\nPrediction: {labels[pred]}\n")


Text: Fuck this place.
Prediction: Malicious

Text: What a beautiful day, feeling grateful!
Prediction: Not Malicious

Text: I hate it here.
Prediction: Not Malicious

Text: You're a piece of shit
Prediction: Malicious

Text: Dumb ass bitch
Prediction: Malicious

Text: you’re such a dumbass 🤡 nobody wants you around 💩
Prediction: Malicious

Text: go back to your country 🖕
Prediction: Malicious

Text: You're such a clown 🤡 lol
Prediction: Malicious

Text: I hate you
Prediction: Malicious

