In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Load the dataset from processed_data.csv and turn the text column into
# bag-of-words count features.
df = pd.read_csv('processed_data.csv')

# read_csv parses empty fields as NaN by default, and CountVectorizer refuses
# NaN documents. Mapping missing text to '' lets such rows become all-zero
# count vectors instead of aborting the fit.
X = df['text'].fillna('')
y = df['target']

vectorizer = CountVectorizer()

# Learns the vocabulary from every document in X and returns the sparse
# document-term count matrix.
X_vectorized = vectorizer.fit_transform(X)

print("Vectorized data shape: ", X_vectorized.shape)
print("Number of classes: ", len(y.unique()))

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, keyed by the display name used when reporting metrics.
# Every estimator that accepts a seed is pinned to random_state=42 so repeated
# runs of the notebook build identical models.
models = {
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(max_depth=50, min_samples_split=4, criterion='gini', random_state=42),
    # random_state added for consistency with the other seeded estimators;
    # without it the forest's bootstrap samples and feature draws differ on
    # every fit.
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=100, min_samples_split=4, criterion='entropy', random_state=42),
    "Logistic Regression": LogisticRegression(C=0.1, random_state=42, max_iter=1000),
    "SVM Linear": SVC(kernel='linear', C=1.0, random_state=42),
    "SVM Non-linear": SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
}

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Repeated random hold-out evaluation of every estimator in `models`.
# Each run draws a fresh 80/20 split, refits every model on the training part
# and scores it on the held-out part. Per-run scores are collected in
# `metrics` and averaged into `mean_metrics`.
#
# NOTE(review): `X_vectorized` is produced by a CountVectorizer fitted on the
# full corpus before splitting, so its vocabulary also covers the test
# documents. Consider fitting the vectorizer on the training part of each
# split instead.
N_RUNS = 20

metrics = {model_name: {'accuracy': [], 'precision': [],
                        'recall': [], 'f1': []} for model_name in models.keys()}

for run_seed in range(N_RUNS):
    # Seeding each split with its run index keeps the splits different from
    # one another but identical across kernel restarts.
    X_train, X_test, y_train, y_test = train_test_split(
        X_vectorized, y, test_size=0.2, shuffle=True, random_state=run_seed)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        # predict() returns labels taken from the estimator's own classes_, so
        # it works for any label encoding. Thresholding predict_proba and
        # casting to int would only be correct for labels that are exactly 0/1.
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        # zero_division=0 scores a run as 0 (instead of warning) when a model
        # predicts no positives at all.
        precision = precision_score(
            y_test, y_pred, average='binary', zero_division=0)
        recall = recall_score(
            y_test, y_pred, average='binary', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='binary', zero_division=0)

        metrics[model_name]['accuracy'].append(accuracy)
        metrics[model_name]['precision'].append(precision)
        metrics[model_name]['recall'].append(recall)
        metrics[model_name]['f1'].append(f1)

# Average each metric over all runs, per model.
mean_metrics = {model_name: {metric: np.mean(values) for metric, values in model_metrics.items()}
                for model_name, model_metrics in metrics.items()}

for model_name, mean in mean_metrics.items():
    print(f"{model_name} Mean Metrics:")
    print(f"  Accuracy: {mean['accuracy']:.4f}")
    print(f"  Precision: {mean['precision']:.4f}")
    print(f"  Recall: {mean['recall']:.4f}")
    print(f"  F1 Score: {mean['f1']:.4f}\n")