# Data Ingestion

In [227]:
import pandas as pd
import numpy as np

In [228]:
# Source of the raw Reddit comments dataset (fetched over HTTP on every run).
REDDIT_CSV_URL = "https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv"

df = pd.read_csv(REDDIT_CSV_URL)
# Preview the first five rows (the default for head()).
df.head(5)

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [229]:
# Column dtypes and non-null counts for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [230]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

clean_comment    100
category           0
dtype: int64

In [231]:
# Discard every row that has at least one missing value
# (dropna defaults: axis=0, how="any").
df = df.dropna()

In [232]:
# Re-inspect the frame after the rows with missing values were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37149 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37149 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 870.7+ KB


In [233]:
# Confirm that no missing values remain in any column.
df.isnull().sum()

clean_comment    0
category         0
dtype: int64

In [234]:
# Features: everything except the label column. Target: the label column itself.
X = df.drop(columns=["category"])
y = df.loc[:, "category"]

In [235]:
# Normalise the text: force str dtype, trim surrounding whitespace, lower-case.
normalized_comments = X["clean_comment"].astype(str).str.strip().str.lower()
X["clean_comment"] = normalized_comments

# Drop comments that are empty after trimming, then re-align the labels
# to the surviving rows via the shared index.
has_text = X["clean_comment"].ne("")
X = X[has_text]
y = y.loc[X.index]

In [236]:
# Sanity check: after cleaning, features and labels must have the same number
# of rows. This checks X / y (the names defined so far); the train/test split
# is only created further down, so X_train / y_train do not exist yet on a
# fresh kernel.
assert X.shape[0] == y.shape[0]

In [237]:
# return X, y

# Experimenting

In [271]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [239]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [240]:
# TF-IDF features from the comment text. The column is selected with a plain
# string (not a list) so the vectorizer receives a 1-D sequence of documents.
tfidf_step = ("comments", TfidfVectorizer(), "clean_comment")
preprocessing = ColumnTransformer(transformers=[tfidf_step])

In [241]:
# Registry of fitted pipelines, keyed by a short model name.
Models = dict()

In [242]:
# KNC Model: TF-IDF features fed into a k-nearest-neighbours classifier
# (scikit-learn default hyper-parameters).

knc_steps = [
    ("preprocessing", preprocessing),
    ("model", KNeighborsClassifier()),
]
KNC = Pipeline(steps=knc_steps)
# fit() returns the pipeline itself, so the fitted object is what gets stored.
Models["KNC"] = KNC.fit(X_train, y_train)

In [243]:
# NBC Model: TF-IDF features fed into a multinomial naive Bayes classifier
# (scikit-learn default hyper-parameters).

NBC = Pipeline(steps=[("preprocessing", preprocessing), ("model", MultinomialNB())])
NBC.fit(X_train, y_train)
Models.update(NBC=NBC)

In [244]:
# RFC Model: TF-IDF features fed into a random forest classifier.
# The forest is stochastic (bootstrap samples and feature sub-sampling), so it
# is seeded with the same value as the train/test split to make the fitted
# model and the metrics logged for it reproducible across runs.

RFC = Pipeline([
    ("preprocessing", preprocessing),
    ("model", RandomForestClassifier(random_state=42))
])
RFC.fit(X_train, y_train)
Models["RFC"] = RFC

In [245]:
# Persist the NaN-free frame (row index included, pandas' default) to the
# working directory; the MLflow logging cell refers to this "reddit.csv" path.
df.to_csv("reddit.csv")

In [246]:
import mlflow
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [247]:
# Point the client at the tracking server *before* selecting the experiment.
# set_experiment() resolves (or creates) the experiment against whatever
# tracking URI is active at call time, so the URI must be set first.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("E2E_DVC")

# One MLflow run per fitted pipeline, named after its key in `Models`.
for model_name, model in Models.items():
    with mlflow.start_run(run_name = model_name):
        # log params: every (nested) parameter of the pipeline
        model_params = model.get_params()
        mlflow.log_params(model_params)

        # log metrics: macro averaging weights the three sentiment classes equally
        pred_vals = model.predict(X_test)
        mlflow.log_metrics({
            "accuracy": accuracy_score(y_test, pred_vals),
            "precision": precision_score(y_test, pred_vals, average = "macro"),
            "recall": recall_score(y_test, pred_vals, average = "macro")
        })

        # log model: the artifact path equals the run name, which the model
        # evaluation section relies on when it builds the "runs:/<id>/<name>" URI
        mlflow.sklearn.log_model(model, artifact_path = model_name)

        # log the dataset snapshot; log_artifact() handles a single file, whereas
        # log_artifacts() expects a directory to walk
        mlflow.log_artifact("reddit.csv")



🏃 View run KNC at: http://127.0.0.1:5000/#/experiments/720447753069128798/runs/2a6892e4c9f34368a02be8b12c3e28b8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/720447753069128798




🏃 View run NBC at: http://127.0.0.1:5000/#/experiments/720447753069128798/runs/6eba743bb38c40c1b951c689f9512a5e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/720447753069128798




🏃 View run RFC at: http://127.0.0.1:5000/#/experiments/720447753069128798/runs/214ab7f6feaa47868614efb85221518f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/720447753069128798


# Model Evaluation

In [248]:
import mlflow

In [249]:
# Client handle used by the experiment, run-search and registry calls below.
client = mlflow.MlflowClient()

In [254]:
# Look up the experiment by name and keep its id and artifact location at hand.
exp = client.get_experiment_by_name("E2E_DVC")
exp_id, artifact_loc = exp.experiment_id, exp.artifact_location

In [267]:
# Ask the server for the single run with the highest logged accuracy.
top_runs = client.search_runs(
    experiment_ids = [exp_id],
    order_by = ["metrics.accuracy DESC"],
    max_results = 1,
)
best_run = top_runs[0]

# Build the model URI for that run; it combines the run id with the run name.
run_info = best_run.info
run_id, run_name = run_info.run_id, run_info.run_name
run_uri = f"runs:/{run_id}/{run_name}"

In [269]:
model_name = "E2E_DVC"

# Alias names are plain identifiers; the "@" is only URI syntax
# (e.g. "models:/E2E_DVC@champion") and is not part of the alias itself.
CHAMPION_ALIAS = "champion"
CHALLENGER_ALIAS = "challenger"

# Register the best run's model as a new version of the registered model.
result = mlflow.register_model(run_uri, model_name)
new_version = result.version
new_accuracy = best_run.data.metrics.get("accuracy", 0.0)

# Only the alias lookup is guarded: a failure here is treated as "no champion
# yet". Errors raised by the promotion calls below are left to propagate rather
# than being mistaken for a missing champion.
# NOTE(review): a server/connection failure also surfaces as MlflowException;
# in that case the set-alias call below fails too and reports the real problem.
try:
    current_champion = client.get_model_version_by_alias(model_name, CHAMPION_ALIAS)
except mlflow.exceptions.MlflowException:
    current_champion = None

if current_champion is None:
    # No current champion exists—promote new version
    client.set_registered_model_alias(model_name, CHAMPION_ALIAS, new_version)
    print(f"First champion set: v{new_version} (↑ {new_accuracy:.4f})")
else:
    old_version = current_champion.version

    # Compare against the accuracy logged on the run behind the current champion.
    old_run = client.get_run(current_champion.run_id)
    old_accuracy = old_run.data.metrics.get("accuracy", 0.0)

    if new_accuracy > old_accuracy:
        # Promote new version to champion, demote the previous one to challenger
        client.set_registered_model_alias(model_name, CHAMPION_ALIAS, new_version)
        client.set_registered_model_alias(model_name, CHALLENGER_ALIAS, old_version)
        print(f"Promoted v{new_version} to @champion (↑ {new_accuracy:.4f})")
    else:
        # Keep current champion, register new as challenger
        client.set_registered_model_alias(model_name, CHALLENGER_ALIAS, new_version)
        print(f"Retained v{old_version} as @champion (↑ {old_accuracy:.4f}), new v{new_version} as @challenger")


MlflowException: API request to http://127.0.0.1:5000/api/2.0/mlflow/registered-models/create failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/registered-models/create (Caused by ResponseError('too many 500 error responses'))

In [274]:
# Display the training-feature frame (pandas truncates the middle rows).
X_train

Unnamed: 0,clean_comment
34954,let keep politics shit out this sub shall
20021,beat civil servants instead
1883,fukkin money
35227,really wonder what the loopholes are gonna don...
8555,discount diplomatic correspondence its finest
...,...
16943,nov begins antifa more like nov began and ain ...
6300,sextape actress will leak and wil amazing quality
11342,the team has been given the free encyclopedia ...
866,this comment has been overwritten open source ...
