In [1]:
import pandas as pd

# Load the preprocessed Reddit comment dataset into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on a machine where that exact file exists.
processed_csv = r"C:\Users\anime\OneDrive\Desktop\Data Science Projects\Youtube Comment Analysis\youtube_comment_analyser\data\processed\reddit_preprocessing.csv"
df = pd.read_csv(processed_csv)
df

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1
...,...,...
36788,jesus,0
36789,kya bhai pure saal chutiya banaya modi aur jab...,1
36790,downvote karna tha par upvote hogaya,0
36791,haha nice,1


In [2]:
# Keep only the rows that actually have cleaned comment text, then show the
# resulting (rows, columns) shape.
has_comment = df["clean_comment"].notna()
df = df[has_comment]
df.shape

(36661, 2)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn
import optuna
import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Point MLflow at the tracking server named by the MLFLOW_TRACKING_URI
# environment variable (os.getenv yields None when the variable is unset),
# then select the experiment that all following runs are recorded under.
tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("ML Algos with Revised HP Tuning")

<Experiment: artifact_location='s3://datascienceanimesh/516785590309530379', creation_time=1743657286448, experiment_id='516785590309530379', last_update_time=1743657286448, lifecycle_stage='active', name='ML Algos with Revised HP Tuning', tags={}>

In [5]:
import mlflow
import optuna
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Step 1: keep only the rows that carry a label
df = df.dropna(subset=['category'])

# Step 2: stratified 80/20 hold-out split on the raw text, done BEFORE any
# vectorization so the test comments never influence the fitted vocabulary
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

# Step 3: TF-IDF settings and vectorization
ngram_range = (1, 2)  # unigrams and bigrams
max_features = 1000  # cap on the vocabulary size

vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
X_train = vectorizer.fit_transform(X_train_raw)  # vocabulary/IDF learned from the training text only
X_test = vectorizer.transform(X_test_raw)  # test text mapped with the already-fitted vocabulary

# Step 4: randomly undersample the vectorized training set to balance it;
# the test split is left untouched
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    """Train ``model``, score it on the test split and record it in one MLflow run.

    Everything happens inside a single ``mlflow.start_run()`` context: tags,
    the algorithm name, the estimator's hyperparameters, accuracy, every
    per-label entry of the classification report, and the fitted model.

    Parameters
    ----------
    model_name : str
        Short algorithm label; used in the run name, the ``algo_name`` param
        and the artifact path of the logged model.
    model : scikit-learn style estimator
        Must provide ``fit``, ``predict`` and ``get_params``. It is fitted
        here, so callers do not need to fit it beforehand.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Hold-out features and labels used for the reported metrics.

    Returns
    -------
    None
    """
    with mlflow.start_run():
        # Identify the run
        mlflow.set_tag("mlflow.runName", f"{model_name}_Undersampling_TFIDF_Bigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Log the estimator's own hyperparameters (e.g. the tuned C / kernel)
        # so the run shows which configuration produced the metrics. Only
        # plain scalar values are logged; nested objects are skipped, and
        # "algo_name" is excluded so it cannot clash with the param above.
        hyperparams = {
            name: value
            for name, value in model.get_params(deep=False).items()
            if name != "algo_name"
            and (value is None or isinstance(value, (bool, int, float, str)))
        }
        if hyperparams:
            mlflow.log_params(hyperparams)

        # Train and predict
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

        # Log every "<label>_<metric>" pair of the classification report
        report = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in report.items():
            if not isinstance(metrics, dict):
                continue  # non-dict entries (the overall accuracy, logged above) are skipped
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

        # Log the fitted model
        mlflow.sklearn.log_model(model, f"{model_name}_model")


# Step 6: Optuna objective function for SVM using cross-validation
def objective_svm(trial):
    """Optuna objective: mean 5-fold CV accuracy of an SVC on the resampled training set.

    Samples ``C`` (log-uniform in [1e-4, 10]) and ``kernel`` (linear / rbf /
    poly) from ``trial``, builds the matching ``SVC`` and returns the mean of
    its cross-validated accuracy scores on ``X_train_resampled`` /
    ``y_train_resampled``.
    """
    # Sample the search space; the names 'C' and 'kernel' are the keys Optuna
    # reports back in study.best_params.
    params = {
        'C': trial.suggest_float('C', 1e-4, 10.0, log=True),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
    }
    candidate = SVC(random_state=42, **params)

    # Score the candidate with 5-fold cross-validation on the resampled training data
    fold_accuracies = cross_val_score(
        candidate, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy'
    )
    return np.mean(fold_accuracies)


# Step 7: Run Optuna for SVM, log the best model only
def run_optuna_experiment(n_trials=30, seed=42):
    """Tune the SVM with Optuna and log only the best configuration to MLflow.

    Parameters
    ----------
    n_trials : int, default 30
        Number of Optuna trials evaluated with ``objective_svm``.
    seed : int or None, default 42
        Seed for the TPE sampler so repeated runs propose the same sequence
        of hyperparameters; pass ``None`` for an unseeded search.

    Returns
    -------
    None
    """
    # TPE is Optuna's default single-objective sampler; seeding it makes the
    # search reproducible without changing the algorithm.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective_svm, n_trials=n_trials)

    # Rebuild an unfitted estimator from the best parameters found
    best_params = study.best_params
    best_model = SVC(C=best_params['C'], kernel=best_params['kernel'], random_state=42)

    # log_mlflow fits the estimator itself inside the MLflow run, so it is not
    # fitted here as well; that would only train the same model twice.
    log_mlflow("SVM", best_model, X_train_resampled, X_test, y_train_resampled, y_test)

# Run the experiment for SVM: performs the Optuna search and logs the best
# model to MLflow. Long-running: in the recorded output below, individual
# trials took several minutes each.
run_optuna_experiment()


[I 2025-04-03 11:39:26,707] A new study created in memory with name: no-name-079f14f2-6287-4ef7-a189-88cd897e48f7
[I 2025-04-03 11:44:46,384] Trial 0 finished with value: 0.7567951492694 and parameters: {'C': 6.102577205639687, 'kernel': 'rbf'}. Best is trial 0 with value: 0.7567951492694.
[I 2025-04-03 11:46:43,047] Trial 1 finished with value: 0.7658380155053165 and parameters: {'C': 7.044013958924162, 'kernel': 'linear'}. Best is trial 1 with value: 0.7658380155053165.
[I 2025-04-03 11:49:25,234] Trial 2 finished with value: 0.3525349843475206 and parameters: {'C': 0.00027444133372347724, 'kernel': 'linear'}. Best is trial 1 with value: 0.7658380155053165.
[I 2025-04-03 11:51:04,627] Trial 3 finished with value: 0.7693240633113976 and parameters: {'C': 2.7392919527762563, 'kernel': 'linear'}. Best is trial 3 with value: 0.7693240633113976.
[I 2025-04-03 11:57:21,113] Trial 4 finished with value: 0.7583612395931467 and parameters: {'C': 4.241297486039209, 'kernel': 'rbf'}. Best is tr

🏃 View run SVM_Undersampling_TFIDF_Bigrams at: http://ec2-16-171-237-43.eu-north-1.compute.amazonaws.com:5000/#/experiments/516785590309530379/runs/6429de792c664da38033449f9e6949d3
🧪 View experiment at: http://ec2-16-171-237-43.eu-north-1.compute.amazonaws.com:5000/#/experiments/516785590309530379
