In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Conv1D, MaxPooling1D, Flatten
import tensorflow as tf
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

olid_data = pd.read_csv("/content/drive/MyDrive/PG_Project_JU /Datasets/OLID.csv")
cyber_troll = pd.read_csv("/content/drive/MyDrive/PG_Project_JU /Datasets/Cyber-Troll.csv")


In [11]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


In [15]:
olid_data['cleaned_text'] = olid_data['Text'].apply(preprocess_text)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Prepare X and y
X = olid_data['cleaned_text']
y = olid_data['Label']

# ========================
# Feature Extraction
# ========================

# BoW
bow_vectorizer = CountVectorizer(max_features=10000)
X_bow = bow_vectorizer.fit_transform(X)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = tfidf_vectorizer.fit_transform(X)

# BoW + TF-IDF (TF-IDF transformed BoW)
tfidf_transformer = TfidfTransformer()
X_tfidf_bow = tfidf_transformer.fit_transform(X_bow)

# ========================
# Train-Test Split
# ========================

X_bow_train, X_bow_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)
X_tfidf_train, X_tfidf_test, _, _ = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_tfidf_bow_train, X_tfidf_bow_test, _, _ = train_test_split(X_tfidf_bow, y, test_size=0.2, random_state=42)

# ========================
# Define Models
# ========================

models = {
    "SVM + linear": SVC(kernel='linear', probability=True, random_state=42),
    "SVM + rbf": SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Naive Bayes": MultinomialNB()
}

# ========================
# Train and Evaluate Models
# ========================

def train_evaluate_model(X_train, X_test, y_train, y_test, feature_type):
    for model_name, model in models.items():
        print(f"\n==== {model_name} with {feature_type} ====")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred, digits=2))
        acc = accuracy_score(y_test, y_pred)
        print(f"Test Accuracy: {acc}")

# ========================
# Run Experiments
# ========================

# 1. Using BoW
train_evaluate_model(X_bow_train, X_bow_test, y_train, y_test, "BoW")

# 2. Using TF-IDF
train_evaluate_model(X_tfidf_train, X_tfidf_test, y_train, y_test, "TF-IDF")

# 3. Using TF-IDF transformed BoW
train_evaluate_model(X_tfidf_bow_train, X_tfidf_bow_test, y_train, y_test, "TF-IDF on BoW")



==== SVM + linear with BoW ====
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      1733
           1       0.66      0.57      0.61       915

    accuracy                           0.75      2648
   macro avg       0.73      0.71      0.71      2648
weighted avg       0.74      0.75      0.75      2648

Test Accuracy: 0.7507552870090635

==== SVM + rbf with BoW ====
              precision    recall  f1-score   support

           0       0.71      0.97      0.82      1733
           1       0.84      0.26      0.40       915

    accuracy                           0.73      2648
   macro avg       0.78      0.62      0.61      2648
weighted avg       0.76      0.73      0.68      2648

Test Accuracy: 0.7280966767371602

==== Logistic Regression with BoW ====
              precision    recall  f1-score   support

           0       0.78      0.89      0.83      1733
           1       0.72      0.53      0.61       915

    accurac