In [1]:
import os

import pandas as pd

# Location of the preprocessed Reddit comments CSV.
# The default is the original author's absolute local path; set the
# REDDIT_PREPROCESSED_CSV environment variable to run this notebook on another
# machine without editing the cell.
DATA_PATH = os.environ.get(
    "REDDIT_PREPROCESSED_CSV",
    r"C:\Users\anime\OneDrive\Desktop\Data Science Projects\Youtube Comment Analysis\youtube_comment_analyser\data\processed\reddit_preprocessing.csv",
)

df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1
...,...,...
36788,jesus,0
36789,kya bhai pure saal chutiya banaya modi aur jab...,1
36790,downvote karna tha par upvote hogaya,0
36791,haha nice,1


In [2]:
# Discard rows whose 'clean_comment' is NaN; rows with text are kept unchanged.
# Note: this rebinds the notebook-level `df`, so later cells see the filtered frame.
df =  df.dropna(subset=["clean_comment"])
# Show (rows, columns) after filtering as the cell output.
df.shape

(36661, 2)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn
import os
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Point MLflow at the tracking server named by the MLFLOW_TRACKING_URI
# environment variable.
# NOTE(review): os.getenv returns None when the variable is unset -- confirm how
# the installed MLflow version treats a None URI (presumably it falls back to
# its default store instead of the remote server).
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
# Select the experiment all following runs are logged under. The cell output
# shows MLflow creating the experiment when no experiment with this name exists.
mlflow.set_experiment("ML Algos with Revised HP Tuning")

2025/04/03 10:44:35 INFO mlflow.tracking.fluent: Experiment with name 'ML Algos with Revised HP Tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://datascienceanimesh/516785590309530379', creation_time=1743657286448, experiment_id='516785590309530379', last_update_time=1743657286448, lifecycle_stage='active', name='ML Algos with Revised HP Tuning', tags={}>

In [5]:
import mlflow
import optuna
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.under_sampling import RandomUnderSampler

# Step 1: Remove rows where 'category' is NaN (rebinds the notebook-level `df`)
df = df.dropna(subset=['category'])

# Step 2: Train-test split BEFORE TF-IDF vectorization, so the vectorizer is
# never fitted on held-out text. 20% of the rows go to the test split, and the
# split is stratified on 'category' with a fixed random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category']
)

# Step 3: TF-IDF vectorization -- the vectorizer is FITTED on the training text
# only and then applied to both splits
ngram_range = (1, 2)  # Unigrams and bigrams
max_features = 1000  # Value passed to TfidfVectorizer(max_features=...)

vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train = vectorizer.fit_transform(X_train_raw)  # Fit only on training data
X_test = vectorizer.transform(X_test_raw)  # Transform test data with the train-fitted vectorizer

# Step 4: Apply RandomUnderSampler to the TRAINING split only; X_test / y_test
# are left untouched and are not resampled
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, evaluate it on the test split and record everything in one MLflow run.

    Inside a new MLflow run this function:
      * tags the run with a name derived from ``model_name`` and an
        ``experiment_type`` tag,
      * logs ``model_name`` as the ``algo_name`` parameter,
      * logs the estimator's own hyperparameters (from ``model.get_params``),
        so the tuned configuration is visible next to its metrics,
      * fits ``model`` on ``(X_train, y_train)`` and predicts ``X_test``,
      * logs the accuracy and every per-label / averaged entry of the
        classification report as ``"<label>_<metric>"`` metrics,
      * logs the fitted model as an artifact named ``"<model_name>_model"``.

    Args:
        model_name: Short algorithm name used in the run name, the
            ``algo_name`` parameter and the model artifact path.
        model: Estimator exposing ``fit`` and ``predict``. It is (re)fitted
            here, so it does not need to be fitted beforehand.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: True test labels used for the metrics.

    Returns:
        None. All results are written to the active MLflow tracking backend.
    """
    with mlflow.start_run():
        # Identify the run in the MLflow UI
        mlflow.set_tag("mlflow.runName", f"{model_name}_RandomUnderSampler_TFIDF_Bigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Log the estimator's hyperparameters so the run can be reproduced from
        # the tracking UI. Values are stringified and truncated defensively
        # because MLflow limits parameter value length (limit is version
        # dependent); "algo_name" is skipped to avoid clashing with the
        # parameter logged above.
        if hasattr(model, "get_params"):
            hyperparams = {
                key: str(value)[:250]
                for key, value in model.get_params(deep=False).items()
                if key != "algo_name"
            }
            if hyperparams:
                mlflow.log_params(hyperparams)

        # Train model and predict the held-out split
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report. Only dict-valued entries are expanded;
        # scalar entries of the report are skipped by the isinstance check.
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the fitted model as a run artifact
        mlflow.sklearn.log_model(model, f"{model_name}_model")

# Step 6: Optuna objective function for KNN using cross-validation
def objective_knn(trial):
    """Optuna objective: score one KNN configuration by cross-validated accuracy.

    Samples ``n_neighbors`` (integer in [3, 30]) and ``p`` (1 or 2, the
    Minkowski power: 1 for Manhattan, 2 for Euclidean) from ``trial``, then
    evaluates a ``KNeighborsClassifier`` with those settings using
    ``cross_val_score`` (``cv=5``, accuracy scoring) on the resampled training
    data defined at notebook level.

    Returns:
        The mean of the cross-validation accuracy scores.
    """
    # Dict entries are evaluated top to bottom, so the suggestions are drawn
    # in the order n_neighbors, then p.
    knn_settings = {
        "n_neighbors": trial.suggest_int('n_neighbors', 3, 30),
        "p": trial.suggest_categorical('p', [1, 2]),
    }
    candidate = KNeighborsClassifier(**knn_settings)

    # Only the (resampled) training data is used here; the test split is not touched
    fold_accuracies = cross_val_score(
        candidate,
        X_train_resampled,
        y_train_resampled,
        cv=5,
        scoring='accuracy',
    )
    return np.mean(fold_accuracies)

# Step 7: Run Optuna for KNN, log the best model only
def run_optuna_experiment(n_trials=30, seed=42):
    """Tune KNN with Optuna and log only the best configuration to MLflow.

    Creates a maximizing Optuna study driven by ``objective_knn``, then builds
    a ``KNeighborsClassifier`` from the best ``n_neighbors`` / ``p`` found and
    hands it to ``log_mlflow`` together with the resampled training data and
    the untouched test split.

    Args:
        n_trials: Number of Optuna trials to run (default 30, the value the
            notebook originally hard-coded).
        seed: Seed for the Optuna sampler so repeated runs explore the same
            sequence of configurations; 42 matches the ``random_state`` used
            for the split and the undersampler.

    Returns:
        None. Results are recorded through ``log_mlflow``.
    """
    # Seed the sampler explicitly: without it, every re-run of the notebook
    # would try a different sequence of hyperparameters.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective_knn, n_trials=n_trials)

    # Get the best parameters and log only the best model
    best_params = study.best_params
    best_model = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'], p=best_params['p'])

    # No explicit fit here: log_mlflow fits the model on the data it is given
    # before evaluating it, so fitting beforehand would only repeat the work.
    log_mlflow("KNN", best_model, X_train_resampled, X_test, y_train_resampled, y_test)

# Run the experiment for KNN: tune the hyperparameters with Optuna, then log
# the best configuration to MLflow
run_optuna_experiment()


[I 2025-04-03 10:44:43,077] A new study created in memory with name: no-name-c1eb4fb4-1934-4f4e-90f1-facb1aeada79
[I 2025-04-03 10:44:47,947] Trial 0 finished with value: 0.4061316084612095 and parameters: {'n_neighbors': 4, 'p': 2}. Best is trial 0 with value: 0.4061316084612095.
[I 2025-04-03 10:44:52,696] Trial 1 finished with value: 0.39016870880032206 and parameters: {'n_neighbors': 18, 'p': 2}. Best is trial 0 with value: 0.4061316084612095.
[I 2025-04-03 10:44:55,694] Trial 2 finished with value: 0.4263411054771743 and parameters: {'n_neighbors': 6, 'p': 1}. Best is trial 2 with value: 0.4263411054771743.
[I 2025-04-03 10:45:01,159] Trial 3 finished with value: 0.39213896711122254 and parameters: {'n_neighbors': 17, 'p': 2}. Best is trial 2 with value: 0.4263411054771743.
[I 2025-04-03 10:45:05,558] Trial 4 finished with value: 0.36692944520649445 and parameters: {'n_neighbors': 24, 'p': 1}. Best is trial 2 with value: 0.4263411054771743.
[I 2025-04-03 10:45:09,994] Trial 5 fini

🏃 View run KNN_RandomUnderSampler_TFIDF_Bigrams at: http://ec2-16-171-237-43.eu-north-1.compute.amazonaws.com:5000/#/experiments/516785590309530379/runs/f5adf21c57d64087bba95b5bcf566a2a
🧪 View experiment at: http://ec2-16-171-237-43.eu-north-1.compute.amazonaws.com:5000/#/experiments/516785590309530379
