In [None]:
import mlflow

# Point MLflow at the remote tracking server (replace the placeholder with the real EC2 URI).
mlflow.set_tracking_uri("ADD YOUR MLFLOW TRACKING URI FROM EC2")

# Smoke-test run: logs a single dummy parameter and a single dummy metric.
with mlflow.start_run():
  mlflow.log_param("param1", 15)
  mlflow.log_metric("metric1", 0.89)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the raw Reddit comment dataset (columns: clean_comment, category).
REDDIT_CSV_URL = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'

df = pd.read_csv(REDDIT_CSV_URL)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [None]:
# Discard every row that has a missing value in any column (mutates df in place).
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Keep only the first occurrence of each fully identical row (mutates df in place).
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Drop comments that are empty or whitespace-only. The explicit .copy() makes df an
# independent frame, so assigning to its columns later does not trigger
# pandas' SettingWithCopyWarning.
df = df[df['clean_comment'].str.strip() != ''].copy()

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK resources used by the preprocessing step:
# the stop-word lists and the WordNet data behind WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Define the preprocessing function

# Stop words that carry negation/contrast and must therefore survive stop-word removal.
_SENTIMENT_KEEP_WORDS = frozenset({'not', 'but', 'however', 'no', 'yet'})

# Lazily-filled cache so the stop-word set and the lemmatizer are built once,
# not once per comment.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
  """Return the (stop_words, lemmatizer) pair, building it on first use."""
  if not _PREPROCESS_CACHE:
    _PREPROCESS_CACHE['stop_words'] = set(stopwords.words('english')) - _SENTIMENT_KEEP_WORDS
    _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
  return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_comment(comment):
  """Normalise a single comment for the bag-of-words sentiment model.

  Steps, in order: lower-case, trim surrounding whitespace, replace newlines
  with spaces, strip every character that is not a letter, digit, whitespace
  or basic punctuation (! ? . ,), remove English stop words (keeping the
  negation/contrast words 'not', 'but', 'however', 'no', 'yet'), and
  lemmatize the remaining tokens.

  Args:
    comment: Raw comment text (str).

  Returns:
    The cleaned comment as a single space-separated string.
  """
  stop_words, lemmatizer = _preprocess_resources()

  # Convert to lower case and remove the trailing/leading whitespace
  comment = comment.lower().strip()

  # Replace newline characters with a space
  comment = re.sub(r'\n', ' ', comment)

  # Remove non-alphanumeric characters, except whitespace and basic punctuation.
  # (The previous pattern closed the character class too early, so it matched
  # "<non-alnum><optional space><any char>," sequences and deleted real text.)
  comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)

  # Drop stop words, then bring each surviving token to its root form
  return " ".join(
    lemmatizer.lemmatize(word) for word in comment.split() if word not in stop_words
  )

In [None]:
# Run every comment through preprocess_comment and overwrite the column with the result.
cleaned_comments = df['clean_comment'].apply(preprocess_comment)
df['clean_comment'] = cleaned_comments
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [None]:
# !pip install mlflow

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,cross_val_predict,StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# 1.> Bag-of-words (BoW) vectorizer capped at a 10000-term vocabulary
BOW_MAX_FEATURES = 10000
vectorizer = CountVectorizer(max_features=BOW_MAX_FEATURES)

In [None]:
# 2.> build the feature matrix (X) and the target vector (y)

# Keep the bag-of-words matrix sparse: a dense 36793 x 10000 int64 array needs ~3 GB,
# while train_test_split and RandomForestClassifier both accept SciPy sparse input.
# NOTE(review): the vocabulary is fitted on the full dataset, i.e. before the train/test split.
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

In [None]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
X.shape

(36793, 10000)

In [None]:
y

Unnamed: 0,category
0,1
1,1
2,-1
3,0
4,1
...,...
37244,0
37245,1
37246,0
37247,1


In [None]:
y.shape

(36793,)

In [None]:
# Tracking server that receives the runs (replace the placeholder with the real URI)

TRACKING_URI = " YOUR URI"
mlflow.set_tracking_uri(TRACKING_URI)

In [None]:
# Select the experiment under which the following runs are recorded
EXPERIMENT_NAME = "RF Baseline"
mlflow.set_experiment(EXPERIMENT_NAME)


In [None]:
# Go to AWS and create a new IAM user:
# IAM -> Users -> Create user -> name it "interview-user-2" -> attach policy -> grant admin access -> create user
# Then create and copy the user's credentials: the access key ID and the secret access key.

In [None]:
# to configure to AWS

!pip intall boto3
!pip install awscli

In [None]:
# Interactive AWS CLI setup; enter the access key ID and secret access key when prompted.
!aws configure

In [None]:
# 1.> split data into train & test (stratified, so both splits keep the class proportions)

X_train , X_test , y_train, y_test = train_test_split(X , y ,test_size = 0.2 ,random_state=42 , stratify=y)

# 2.> define and train RF baseline

# Files written to the working directory and then uploaded as run artifacts.
# The same relative path is used for saving and logging so the cell does not
# depend on a particular working directory (e.g. Colab's /content).
conf_matrix_path = "confusion_matrix.png"
dataset_path = "dataset.csv"

with mlflow.start_run() as run:
  # Logging description for run
  mlflow.set_tag("mlflow.runName","RandomForest_Baseline_TrainTestSplit")
  mlflow.set_tag("experiment_type","Baseline")
  mlflow.set_tag("model_type","RandomForestClassifier")

  # ADD description
  mlflow.set_tag("description", "Baseline RandomForest model for sentiment analysis using Bag of Words (BoW) with a simple train-test split")

  # Log params for the vectorizer
  mlflow.log_param("vectorizer_type","CountVectorizer")
  mlflow.log_param("vectorizer_max_feature",vectorizer.max_features)

  # Log RF params
  n_estimators = 200
  max_depth = 15
  mlflow.log_param("n_estimators",n_estimators)
  mlflow.log_param("max_depth",max_depth)

  # initialize & train the model

  model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,random_state=42)
  model.fit(X_train,y_train)

  # predict on test set
  y_pred = model.predict(X_test)

  # Log metric for each class & accuracy
  accuracy = accuracy_score(y_test,y_pred)
  mlflow.log_metric("accuracy",accuracy)

  classific_rep = classification_report(y_test,y_pred,output_dict = True)

  # Only dict-valued entries (per-class rows and the averages) are expanded here;
  # the scalar "accuracy" entry of the report is skipped and logged separately above.
  for label, metrics in classific_rep.items():
    if isinstance(metrics,dict):
      for metric, value in metrics.items():
        mlflow.log_metric(f"{label}_{metric}",value)

  # CNF Matrix plot

  conf_matrix = confusion_matrix(y_test,y_pred)
  plt.figure(figsize=(10,6))
  sns.heatmap(conf_matrix,annot=True , fmt="d",cmap="Blues")
  plt.xlabel("Predicted")
  plt.ylabel("Actual")
  plt.title("Confusion Matrix")


  # save & log CM

  plt.savefig(conf_matrix_path)
  mlflow.log_artifact(conf_matrix_path)

  # Log the RF model

  mlflow.sklearn.log_model(model,"random_forest_model")


  # log the dataset only if it's small
  # (log_artifact uploads a single file; log_artifacts expects a directory)
  df.to_csv(dataset_path,index=False)
  mlflow.log_artifact(dataset_path)

# final accuracy
print(f"Accuracy: {accuracy}")

In [None]:
# Plain-text classification report for the test-set predictions
report_text = classification_report(y_test, y_pred)
print(report_text)

Recall of 0 for the negative class (-1) means the model did not identify a single negative data point as negative: it labelled every one of them as either neutral or positive, which is completely wrong.

In [None]:
# Write the preprocessed frame to disk, leaving out the index column
df.to_csv(path_or_buf='reddit_preprocessed.csv', index=False)

In [None]:
# Read the saved file back and preview its first rows
preprocessed_preview = pd.read_csv('reddit_preprocessed.csv')
preprocessed_preview.head()