### Experiment 2: Tuning `max_features` for TF-IDF with n-grams up to trigrams

In [1]:
import dagshub
import mlflow
# Send MLflow tracking calls to the MLflow endpoint of the DagsHub repository.
mlflow.set_tracking_uri("https://dagshub.com/aryan0147/Capstone-Project-2.mlflow")

# Initialise the DagsHub client for the same repository with MLflow integration enabled.
# NOTE(review): presumably this also configures credentials (and possibly the
# tracking URI itself) — confirm against the dagshub documentation.
dagshub.init(repo_owner='aryan0147', repo_name='Capstone-Project-2', mlflow=True)

In [3]:
# Select the experiment that later runs are recorded under. The log output of
# this cell shows MLflow creating it because the name did not exist yet.
mlflow.set_experiment("Exp 2-TfIdf Trigram max_features")

2025/02/10 19:15:50 INFO mlflow.tracking.fluent: Experiment with name 'Exp 2-TfIdf Trigram max_features' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/a0a48281a0664d59b86076b6ab46d49a', creation_time=1739195152338, experiment_id='9', last_update_time=1739195152338, lifecycle_stage='active', name='Exp 2-TfIdf Trigram max_features', tags={}>

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [5]:
# Load the preprocessed comments and drop rows whose cleaned text is missing.
df = (
    pd.read_csv('reddit_preprocessing.csv')
    .dropna(subset=['clean_comment'])
)
df.shape

(36662, 2)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import mlflow
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Step 1: Function to log metrics and artifacts
def log_metrics_and_artifacts(y_test, y_pred, max_features, model, vectorizer):
    """Log metrics, a confusion-matrix plot and the model through MLflow.

    Args:
        y_test: True labels of the evaluation split.
        y_pred: Predicted labels for the same samples.
        max_features: TF-IDF vocabulary size of this run; used in the plot
            title and in the artifact/model names.
        model: Fitted estimator handed to ``mlflow.sklearn.log_model``.
        vectorizer: Not used by this function; kept so existing callers
            keep working.
    """
    import tempfile

    # Log accuracy
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

    # Log classification report
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in classification_rep.items():
        # Only dict-valued entries are expanded into "<label>_<metric>"
        # metrics; scalar entries of the report are skipped.
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

    # Log confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    fig = plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix: TF-IDF Trigrams, max_features={max_features}")

        # Write the image to a throwaway directory so repeated runs do not
        # leave PNG files behind in the working directory. The file name is
        # unchanged, so the logged artifact keeps the same name.
        with tempfile.TemporaryDirectory() as tmp_dir:
            confusion_matrix_path = os.path.join(
                tmp_dir, f"confusion_matrix_{max_features}.png"
            )
            fig.savefig(confusion_matrix_path)
            mlflow.log_artifact(confusion_matrix_path)
    finally:
        # Release the figure even when saving or uploading fails.
        plt.close(fig)

    # Log the model
    mlflow.sklearn.log_model(model, f"random_forest_model_tfidf_trigrams_{max_features}")

# Step 2: Function to run the experiment
def run_experiment_tfidf_max_features(max_features):
    """Train and log one Random Forest run for a given TF-IDF vocabulary size.

    Reads the module-level ``df`` (columns ``clean_comment`` and
    ``category``), makes a stratified 80/20 split, fits a
    TF-IDF + RandomForest pipeline inside a new MLflow run and logs the
    run's tags, parameters, metrics and artifacts.

    Args:
        max_features: Value passed to ``TfidfVectorizer(max_features=...)``.
    """
    ngram_range = (1, 3)  # unigrams through trigrams
    n_estimators = 200
    max_depth = 15

    # Step 3: Stratified hold-out split with a fixed seed
    texts = df['clean_comment']
    targets = df['category']
    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        targets,
        test_size=0.2,
        random_state=42,
        stratify=targets,
    )

    # Tags (including the run name and description), in logging order
    run_tags = {
        "mlflow.runName": f"TFIDF_Trigrams_max_features_{max_features}",
        "experiment_type": "feature_engineering",
        "model_type": "RandomForestClassifier",
        "description": f"RandomForest with TF-IDF Trigrams, max_features={max_features}",
    }

    # Vectorizer parameters first, then Random Forest parameters
    run_params = {
        "vectorizer_type": "TF-IDF",
        "ngram_range": ngram_range,
        "vectorizer_max_features": max_features,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
    }

    # Step 4: Train a Random Forest pipeline inside its own MLflow run
    with mlflow.start_run():
        for tag_name, tag_value in run_tags.items():
            mlflow.set_tag(tag_name, tag_value)

        for param_name, param_value in run_params.items():
            mlflow.log_param(param_name, param_value)

        tfidf_step = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
        forest_step = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
        )
        pipeline = Pipeline([('tfidf', tfidf_step), ('model', forest_step)])

        # Fit on the training split, then predict the held-out split
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Log metrics and artifacts
        log_metrics_and_artifacts(
            y_test,
            y_pred,
            max_features,
            pipeline,
            pipeline.named_steps['tfidf'],
        )

# Step 5: Sweep max_features from 1000 to 10000 in steps of 1000
max_features_values = list(range(1000, 10001, 1000))

for max_features in max_features_values:
    run_experiment_tfidf_max_features(max_features)



🏃 View run TFIDF_Trigrams_max_features_1000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/4005c63b707c4402bcb4c98fc853e83f
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9




🏃 View run TFIDF_Trigrams_max_features_2000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/94114990f4374688984f0d42e3d195ad
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9




🏃 View run TFIDF_Trigrams_max_features_3000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/31f7b73a51b148dd98fb6b80d1b8b883
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9




🏃 View run TFIDF_Trigrams_max_features_4000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/3ba5929904e74b639a16909f633bf859
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9




🏃 View run TFIDF_Trigrams_max_features_5000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/b0255c2075c54040adb1c2f7793fa51c
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9




🏃 View run TFIDF_Trigrams_max_features_6000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/be1e7527d95748aba19965efee65ba39
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9




🏃 View run TFIDF_Trigrams_max_features_7000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/82e6a7e86b0b45d6bf04535164366694
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9




🏃 View run TFIDF_Trigrams_max_features_8000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/f4ea4fd0dc1d42ab848efee64c1819e7
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9




🏃 View run TFIDF_Trigrams_max_features_9000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/1c8c1b614845473cbe10cb25e218fdb9
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9




🏃 View run TFIDF_Trigrams_max_features_10000 at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9/runs/ac88fc7220584b1aa1a8fc8a574c7348
🧪 View experiment at: https://dagshub.com/aryan0147/Capstone-Project-2.mlflow/#/experiments/9
