In [5]:
import pandas as pd

# Load the preprocessed comment dataset from disk and display it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\anime\OneDrive\Desktop\Data Science Projects\Youtube Comment Analysis\youtube_comment_analyser\data\processed\reddit_preprocessing.csv"
)
df

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1
...,...,...
36788,jesus,0
36789,kya bhai pure saal chutiya banaya modi aur jab...,1
36790,downvote karna tha par upvote hogaya,0
36791,haha nice,1


In [6]:
# Keep only the rows whose `clean_comment` value is present, then show the
# resulting (rows, columns) shape.
df = df[df["clean_comment"].notna()]
df.shape

(36661, 2)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [None]:
import mlflow 

# Take the tracking server address from the MLFLOW_TRACKING_URI environment
# variable (None is passed through if it is unset), then select the experiment
# that the runs below are recorded under.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI"))
mlflow.set_experiment("Bow vs Tfidf")

2025/03/16 13:27:33 INFO mlflow.tracking.fluent: Experiment with name 'Bow vs Tfidf' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://datascienceanimesh/826535339336548848', creation_time=1742111853100, experiment_id='826535339336548848', last_update_time=1742111853100, lifecycle_stage='active', name='Bow vs Tfidf', tags={}>

In [8]:
def run_experiment(vectorizer_type,ngram_range,vectorizer_max_features,vectorizer_name):
    """Train a RandomForest on vectorized comments and log the run to MLflow.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
    makes a stratified 80/20 train/test split, fits the chosen vectorizer on
    the training text, trains the classifier, and records parameters, metrics,
    a confusion-matrix image and the fitted model in a new MLflow run.

    Args:
        vectorizer_type: Value logged as the ``vectorizer_type`` run parameter.
            It is only logged; it does not select the vectorizer.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to the vectorizer.
        vectorizer_max_features: Vocabulary size cap forwarded to the vectorizer.
        vectorizer_name: ``"BoW"`` selects ``CountVectorizer``; any other value
            selects ``TfidfVectorizer``. Also used in the run name, the
            description tag and the artifact names.

    Returns:
        None. All results are recorded in the MLflow run.
    """
    if vectorizer_name == "BoW":
        vectorizer = CountVectorizer(ngram_range=ngram_range,max_features=vectorizer_max_features)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range,max_features=vectorizer_max_features)

    X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category'])

    # The vocabulary is fitted on the training split only; the test split is
    # merely transformed with it.
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    with mlflow.start_run():
        mlflow.set_tag("mlflow.runName", f"{vectorizer_name}_{ngram_range}_RandomForest")
        mlflow.set_tag("experiment_type", "feature_engineering")
        mlflow.set_tag("model_type", "RandomForestClassifier")

        mlflow.set_tag("description", f"RandomForest with {vectorizer_name}, ngram_range={ngram_range}, max_features={vectorizer_max_features}")

        # Log vectorizer parameters
        mlflow.log_param("vectorizer_type", vectorizer_type)
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", vectorizer_max_features)

        n_estimators = 200
        max_depth = 15

        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        # Initialize and train the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions and log metrics
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log the per-class and averaged entries of the classification report.
        # The scalar "accuracy" entry is not a dict and is skipped here (it was
        # already logged above).
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log confusion matrix. Pin the row/column order to the classes the
        # model was trained on and show those labels on the axes, so the ticks
        # read as the real class values instead of positional indices.
        class_labels = list(model.classes_)
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(
                conf_matrix,
                annot=True,
                fmt="d",
                cmap="Blues",
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax,
            )
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title(f"Confusion Matrix: {vectorizer_name}, {ngram_range}")
            # Hand the figure straight to MLflow under a portable artifact name:
            # the previous name contained ':' (reserved in Windows file names)
            # and the image was also left behind in the working directory.
            ngram_label = "-".join(str(n) for n in ngram_range)
            mlflow.log_figure(fig, f"confusion_matrix_{vectorizer_name}_{ngram_label}.png")
        finally:
            # Release the figure even if rendering or the upload fails.
            plt.close(fig)

        mlflow.sklearn.log_model(model, f"random_forest_model_{vectorizer_name}_{ngram_range}")

# Sweep every n-gram range with both vectorizers; each combination is logged
# as its own MLflow run.
ngram_ranges = [(1, 1), (1, 2), (1, 3)]  # unigrams; uni+bigrams; uni+bi+trigrams
max_features = 5000  # vocabulary cap shared by every run

for ngram_range in ngram_ranges:
    # BoW runs first, then TF-IDF, for each n-gram range.
    for vectorizer_label in ("BoW", "TF-IDF"):
        run_experiment(vectorizer_label, ngram_range, max_features, vectorizer_name=vectorizer_label)
    



🏃 View run BoW_(1, 1)_RandomForest at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848/runs/4b6826961a854dc69c77910d57623c07
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848




🏃 View run TF-IDF_(1, 1)_RandomForest at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848/runs/a9deb4baf67d41809d8c460d78fd9a35
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848




🏃 View run BoW_(1, 2)_RandomForest at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848/runs/a3f7b8e2acda4814b88bba7fa575d06e
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848




🏃 View run TF-IDF_(1, 2)_RandomForest at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848/runs/51580451f7fa401eae1938cdf294708e
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848




🏃 View run BoW_(1, 3)_RandomForest at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848/runs/155a8c4c7134426cace22ba5c1a6582b
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848




🏃 View run TF-IDF_(1, 3)_RandomForest at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848/runs/465b2bb9b8dc46a9a1a92b9b12a68698
🧪 View experiment at: http://ec2-13-49-64-251.eu-north-1.compute.amazonaws.com:5000/#/experiments/826535339336548848
