In [1]:
import pandas as pd

import os

# The CSV location is machine-specific, so it can be overridden through the
# REDDIT_PREPROCESSED_CSV environment variable. The original absolute path is
# kept as the default so existing runs on the author's machine are unaffected.
processed_csv_path = os.getenv(
    "REDDIT_PREPROCESSED_CSV",
    r"C:\Users\anime\OneDrive\Desktop\Data Science Projects\Youtube Comment Analysis\youtube_comment_analyser\data\processed\reddit_preprocessing.csv",
)

# Load the preprocessed comments; the bare last expression shows a truncated preview.
df = pd.read_csv(processed_csv_path)
df

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1
...,...,...
36788,jesus,0
36789,kya bhai pure saal chutiya banaya modi aur jab...,1
36790,downvote karna tha par upvote hogaya,0
36791,haha nice,1


In [2]:
# Keep only the rows whose cleaned comment text is present; rows with a
# missing `clean_comment` cannot be vectorised later on.
has_comment = df["clean_comment"].notna()
df = df[has_comment]
df.shape

(36661, 2)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [None]:
import mlflow

# Read the tracking server address from the environment so that no URI is
# hard-coded in the notebook.
tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(tracking_uri)

# Select the experiment that all following runs are recorded under.
mlflow.set_experiment("Tfidf Bigram Max Features")


2025/03/16 13:42:58 INFO mlflow.tracking.fluent: Experiment with name 'Tfidf Bigram Max Features' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://datascienceanimesh/601218744098978510', creation_time=1742112778121, experiment_id='601218744098978510', last_update_time=1742112778121, lifecycle_stage='active', name='Tfidf Bigram Max Features', tags={}>

In [6]:
def run_experiment_tfidf_max_features(max_features):
    """Train and log one Random Forest run on TF-IDF unigram+bigram features.

    The module-level ``df`` is split 80/20 (stratified on ``category``), the
    ``clean_comment`` text is vectorised with a TF-IDF vocabulary capped at
    ``max_features`` terms (fitted on the training split only), and a
    ``RandomForestClassifier`` is trained on the result. Parameters, accuracy,
    the per-label classification-report metrics, a confusion-matrix figure and
    the fitted model are logged to a new MLflow run.

    Args:
        max_features: Upper bound on the TF-IDF vocabulary size, forwarded to
            ``TfidfVectorizer(max_features=...)``.

    Returns:
        None. All results are recorded as an MLflow run.
    """
    ngram_range = (1, 2)  # unigrams + bigrams
    n_estimators = 200
    max_depth = 15

    # Split the raw text first so the vectorizer never sees the test comments.
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        df['clean_comment'],
        df['category'],
        test_size=0.2,
        random_state=42,
        stratify=df['category'],
    )

    # Vectorise with TF-IDF; the vocabulary is learned from the training split only.
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    X_train = vectorizer.fit_transform(X_train_text)
    X_test = vectorizer.transform(X_test_text)

    with mlflow.start_run():
        # Run name matches the bigram setting actually used (it previously
        # said "Trigrams", which contradicted ngram_range and the description).
        mlflow.set_tag("mlflow.runName", f"TFIDF_Bigrams_max_features_{max_features}")
        mlflow.set_tag("experiment_type", "feature_engineering")
        mlflow.set_tag("model_type", "RandomForestClassifier")
        mlflow.set_tag("description", f"RandomForest with TF-IDF Bigrams, max_features={max_features}")

        # Vectorizer parameters
        mlflow.log_param("vectorizer_type", "TF-IDF")
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", max_features)

        # Random Forest parameters
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        # Train the model and predict on the held-out split.
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

        # The report dict maps each label / average to a dict of metrics; the
        # scalar "accuracy" entry is skipped here because it is logged above.
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Confusion matrix with the real class labels on both axes, so the
        # cells are not read against positional indices 0..n-1.
        class_labels = list(model.classes_)
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(
                conf_matrix,
                annot=True,
                fmt="d",
                cmap="Blues",
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax,
            )
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title(f"Confusion Matrix: TF-IDF Bigrams, max_features={max_features}")
            # Log the figure straight to MLflow under a portable artifact name.
            # The previous file name contained ':' and ',' (not valid on
            # Windows) and left a PNG behind in the working directory.
            mlflow.log_figure(fig, f"confusion_matrix_tfidf_bigrams_max_features_{max_features}.png")
        finally:
            # Always release the figure; this function is called in a loop.
            plt.close(fig)

        # Log the fitted model
        mlflow.sklearn.log_model(model, f"random_forest_model_tfidf_bigrams_{max_features}")

# Sweep the TF-IDF vocabulary cap from 1,000 to 10,000 in steps of 1,000,
# recording one MLflow run per setting.
max_features_values = list(range(1000, 10001, 1000))

for max_features in max_features_values:
    run_experiment_tfidf_max_features(max_features)



🏃 View run TFIDF_Trigrams_max_features_1000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/727971ff1d7748ae87065c4c75df8a76
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510




🏃 View run TFIDF_Trigrams_max_features_2000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/63431bf712f64c32b518df8c59e9e731
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510




🏃 View run TFIDF_Trigrams_max_features_3000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/28f10aa3b9fb43f2ac2360581fdd8817
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510




🏃 View run TFIDF_Trigrams_max_features_4000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/a2d7ba81d54e4ac5aab5913bb05392d4
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510




🏃 View run TFIDF_Trigrams_max_features_5000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/639b8a25ab9243fdb41d484c7fb3a12d
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510




🏃 View run TFIDF_Trigrams_max_features_6000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/4211191bcb2046179846f13fbfdfc7d5
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510




🏃 View run TFIDF_Trigrams_max_features_7000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/6fc68287b56f4d50bf4c780486aa198e
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510




🏃 View run TFIDF_Trigrams_max_features_8000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/120d3e271f3c426ab9ab977cbede06c7
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510




🏃 View run TFIDF_Trigrams_max_features_9000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/60d31f75e3d2489d8c5aa53e32dcd09e
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510




🏃 View run TFIDF_Trigrams_max_features_10000 at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510/runs/4c1c4a8b81114fa89818a8edb4953295
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/601218744098978510
