In [12]:
import pandas as pd
import joblib
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import os

In [13]:
# Fetch the NLTK corpora needed for stop-word filtering and lemmatization.
download_ok = False
for corpus_name in ('stopwords', 'wordnet'):
    download_ok = nltk.download(corpus_name)
download_ok  # result of the last download, shown as the cell output

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jahag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jahag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
# Locations of the training and validation CSV files
# (adjust the names if your copies are called something else).
train_file_path, val_file_path = "Racial_train.csv", "Racial_val.csv"

In [15]:
# Read the training and validation CSVs into DataFrames.
df_train, df_val = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, val_file_path)
)

In [16]:
# Report per-column missing-value counts for both splits, then their column names.
for frame in (df_train, df_val):
    print(frame.isnull().sum())

for frame in (df_train, df_val):
    print(frame.columns)

clean_text    0
labels        0
dtype: int64
clean_text    0
labels        0
dtype: int64
Index(['clean_text', 'labels'], dtype='object')
Index(['clean_text', 'labels'], dtype='object')


In [17]:
# Preprocessing function
# The helpers below are built once when the cell runs instead of on every call
# (the stop-word list used to be re-read from the corpus for every single word).
# re.escape is required: inside a character class the raw "\]" of
# string.punctuation is read as an escaped bracket, so "\" itself was never removed.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGITS_RE = re.compile(r"\d+")
_STOP_WORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one document for TF-IDF vectorization.

    Steps, in order: lowercase, strip ASCII punctuation, strip digit runs,
    split on whitespace, drop English stop words, lemmatize the remaining words.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned words joined by single spaces (``""`` if nothing is left).
    """
    text = _PUNCTUATION_RE.sub("", text.lower())
    text = _DIGITS_RE.sub("", text)
    return " ".join(
        _LEMMATIZER.lemmatize(word)
        for word in text.split()
        if word not in _STOP_WORDS  # O(1) set lookup
    )

In [18]:
# Clean the text column of both splits with preprocess_text.
for split_frame in (df_train, df_val):
    split_frame["clean_text"] = split_frame["clean_text"].apply(preprocess_text)

In [19]:
# TF-IDF features: the vectorizer is fitted on the training split only and then
# applied unchanged to the validation split.
y_train, y_val = df_train["labels"], df_val["labels"]

vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),
)
X_train = vectorizer.fit_transform(df_train["clean_text"])
X_val = vectorizer.transform(df_val["clean_text"])

In [36]:
# Persist the fitted vectorizer to disk; the inference cells load this file.
joblib.dump(value=vectorizer, filename="tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [21]:
# Address class imbalance by oversampling the training data with SMOTE.
# NOTE(review): resampling before cross-validation means synthetic samples can
# end up in different folds than the points they were derived from — consider
# moving SMOTE into an imblearn Pipeline inside the grid search; confirm.
smote = SMOTE(random_state=42)
resampled_features, resampled_labels = smote.fit_resample(X_train, y_train)
X_train, y_train = resampled_features, resampled_labels



In [22]:
# Ensure directory for saving models exists
# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("models", exist_ok=True)

In [28]:
# Define models and parameters for grid search
# random_state is pinned on the stochastic estimators (same seed as SMOTE) so
# that repeated runs of the notebook give comparable scores.
models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced"),
    "SVM": SVC(class_weight="balanced"),
    "Random Forest": RandomForestClassifier(class_weight="balanced", random_state=42),
    "MLP": MLPClassifier(random_state=42)
}

In [29]:
# Hyper-parameter candidates for the grid search, keyed by the same display
# names that are used for the estimators.
param_grid = dict([
    ("Logistic Regression", dict(C=[0.1, 1, 10])),
    ("SVM", dict(C=[0.1, 1, 10], kernel=["linear", "rbf"])),
    ("Random Forest", dict(n_estimators=[100, 200], max_depth=[10, 20])),
    ("MLP", dict(hidden_layer_sizes=[(50,), (100,)], alpha=[0.0001, 0.001])),
])

In [30]:
# Trackers for the best model and the best F1 score, initialised empty.
best_model, best_f1 = None, 0

In [31]:
# Tune, persist and evaluate every candidate model, remembering the one with
# the highest macro-F1 on the validation split.
# The trackers are reset here so that re-running the cell starts from scratch.
best_model, best_f1, best_model_name = None, 0, None

for name, model in models.items():
    print(f"Training {name}...")
    # 5-fold stratified grid search, scored by macro-F1 on the training data
    grid = GridSearchCV(model, param_grid[name], scoring="f1_macro", cv=StratifiedKFold(n_splits=5), n_jobs=-1)
    grid.fit(X_train, y_train)
    best_estimator = grid.best_estimator_

    # Save best model using joblib
    joblib.dump(best_estimator, f"models/{name.replace(' ', '_').lower()}_model.pkl")

    # Predict on validation data
    y_pred = best_estimator.predict(X_val)

    # Evaluate model
    f1 = f1_score(y_val, y_pred, average="macro")
    acc = accuracy_score(y_val, y_pred)

    print(f"{name} - F1 Score: {f1:.4f}, Accuracy: {acc:.4f}")

    # Print Classification Report
    print(f"Classification Report for {name}:\n", classification_report(y_val, y_pred))

    # Record the winner; previously best_model/best_f1 were never updated
    if f1 > best_f1:
        best_model, best_f1, best_model_name = best_estimator, f1, name

print(f"Best model: {best_model_name} (F1 Score: {best_f1:.4f})")


Training Logistic Regression...
Logistic Regression - F1 Score: 0.6592, Accuracy: 0.7581
Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84       774
           1       0.50      0.45      0.48       247

    accuracy                           0.76      1021
   macro avg       0.67      0.65      0.66      1021
weighted avg       0.75      0.76      0.75      1021

Training SVM...
SVM - F1 Score: 0.6335, Accuracy: 0.7395
Classification Report for SVM:
               precision    recall  f1-score   support

           0       0.82      0.84      0.83       774
           1       0.46      0.42      0.44       247

    accuracy                           0.74      1021
   macro avg       0.64      0.63      0.63      1021
weighted avg       0.73      0.74      0.74      1021

Training Random Forest...
Random Forest - F1 Score: 0.5840, Accuracy: 0.7669
Classification Report for Random Forest:
  

In [32]:
# --- Inference on the unlabeled test set ---

In [37]:
import pandas as pd
import joblib

In [38]:
# Read the unlabeled test split into a DataFrame.
test_file_path = "Racial_test_without_labels.csv"
df_test = pd.read_csv(filepath_or_buffer=test_file_path)

In [39]:
# Restore the TF-IDF vectorizer from the file written by joblib.dump.
vectorizer = joblib.load(filename="tfidf_vectorizer.pkl")

In [40]:

# Inference-time preprocessing. The TF-IDF vectorizer was fitted on text that
# had stop words removed and words lemmatized, so the test text has to go
# through the same steps; the previous version only lowercased and stripped
# punctuation/digits, which fed the vectorizer differently shaped text.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGITS_RE = re.compile(r"\d+")
_STOP_WORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one document before it is passed to the TF-IDF vectorizer.

    Steps, in order: lowercase, strip ASCII punctuation, strip digit runs,
    split on whitespace, drop English stop words, lemmatize the remaining words.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned words joined by single spaces (``""`` if nothing is left).
    """
    text = _PUNCTUATION_RE.sub("", text.lower())  # Remove punctuation
    text = _DIGITS_RE.sub("", text)  # Remove numbers
    return " ".join(
        _LEMMATIZER.lemmatize(word)
        for word in text.split()
        if word not in _STOP_WORDS
    )

In [41]:
# Clean the test text column with preprocess_text.
df_test = df_test.assign(clean_text=df_test["clean_text"].apply(preprocess_text))

In [42]:
# Convert test text to numerical features
# Kept as a sparse matrix, like X_train / X_val: the estimators accept sparse
# input directly, and densifying only costs memory.
X_test = vectorizer.transform(df_test["clean_text"])

In [51]:
# Names of the trained models to run inference with; each is "<stem>_model".
model_names = [
    f"{stem}_model"
    for stem in ("Logistic_Regression", "svm", "random_forest", "mlp")
]

In [52]:
for model_name in model_names:
    # Load the model from where the training loop writes it: inside "models/",
    # under a lower-cased file name. Loading "<model_name>.pkl" from the working
    # directory only worked when stale files from an older run were lying around.
    # NOTE: joblib.load unpickles the file — only load artifacts you produced yourself.
    model_path = os.path.join("models", f"{model_name.lower()}.pkl")
    model = joblib.load(model_path)

    # Predict class labels
    y_pred = model.predict(X_test)

    # Save predictions in a separate CSV file
    output_file = f"{model_name}_predictions.csv"
    df_output = pd.DataFrame({"clean_text": df_test["clean_text"], "Predicted_Label": y_pred})
    df_output.to_csv(output_file, index=False)

    print(f"Predictions saved for {model_name} in {output_file}")

Predictions saved for Logistic_Regression_model in Logistic_Regression_model_predictions.csv
Predictions saved for svm_model in svm_model_predictions.csv
Predictions saved for random_forest_model in random_forest_model_predictions.csv
Predictions saved for mlp_model in mlp_model_predictions.csv
