###### Handling class imbalance in the data (especially for the -1 category)

In [2]:
!pip install mlflow boto3 awscli

Collecting mlflow
  Downloading mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.35.64-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Downloading awscli-1.36.5-py3-none-any.whl.metadata (11 kB)
Collecting mlflow-skinny==2.18.0 (from mlflow)
  Downloading mlflow_skinny-2.18.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.18.0->mlflow)
  Downloading databricks_sdk-0.38.0-py3-none-any.whl.metadata (38 kB)
Collecting botocore<1.36.0,>=1.35.64 (from boto3)
  Downloading botocore-1.35.64-p

In [None]:
# Set up AWS credentials for the CLI; the MLflow experiment in this notebook
# stores its artifacts in an S3 bucket (see the experiment's artifact_location).
# NOTE(review): presumably this prompts for keys interactively — confirm it can
# be answered from the notebook environment, and never paste secrets into a cell.
!aws configure

In [4]:
import mlflow

# Point the MLflow client at the tracking server.
# "YOUR_TRACKING_URI" is a placeholder: replace it with the real server URI before running.
mlflow.set_tracking_uri("YOUR_TRACKING_URI")

In [5]:
# Select the MLflow experiment for the runs in this notebook
# (the recorded log output shows it being created when it did not exist yet).
# NOTE(review): the recorded output names the experiment 'Handling imbalanced data',
# which differs from the name passed here — re-run the cell to refresh the output.
mlflow.set_experiment("Exp 5 Handling imbalanced data")

2024/11/19 17:15:04 INFO mlflow.tracking.fluent: Experiment with name 'Handling imbalanced data' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://interview-mlflow-bucket/335851565471750917', creation_time=1732036504741, experiment_id='335851565471750917', last_update_time=1732036504741, lifecycle_stage='active', name='Handling imbalanced data', tags={}>

In [6]:
from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import mlflow
import mlflow.sklearn
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Load the comments dataset and drop rows whose 'clean_comment' is missing,
# since those rows cannot be vectorized.
DATA_URL = "https://raw.githubusercontent.com/ashishshukla888/influencer_sentiment_analysis/refs/heads/master/data.csv"
df = pd.read_csv(DATA_URL).dropna(subset=['clean_comment'])

# Only the last expression of a cell is rendered, so the preview must be displayed
# explicitly; a bare `df.head()` in the middle of the cell is silently discarded.
display(df.head())
df.shape

(37149, 2)

In [13]:
# S1 :- Function to test various class-imbalance handling techniques

def run_imbalanced_experiment(imbalance_method):
  """Train a TF-IDF + RandomForest model with one imbalance-handling method and log it to MLflow.

  The module-level ``df`` is split 80/20 (stratified on ``category``), the
  ``clean_comment`` text is vectorized with a TF-IDF vectorizer fitted on the
  training split only, the requested imbalance technique is applied to the
  training data, and a RandomForest is trained and evaluated on the test split.
  Parameters, accuracy, per-label classification-report metrics, a confusion
  matrix image and the fitted model are logged to a new MLflow run.

  Args:
    imbalance_method: Name of the technique to apply. One of
      'class_weight' (alias 'class_weights') -> RandomForest(class_weight='balanced'),
      'oversampling' -> SMOTE,
      'adasyn' -> ADASYN,
      'undersampling' -> RandomUnderSampler,
      'smote_enn' -> SMOTEENN.

  Raises:
    ValueError: If ``imbalance_method`` is not one of the names above.
  """
  # Both spellings are accepted: the function originally tested for 'class_weights'
  # while the driver loop passes 'class_weight', which silently disabled the
  # balancing for that run.
  class_weight_methods = ('class_weight', 'class_weights')
  resampling_methods = ('oversampling', 'adasyn', 'undersampling', 'smote_enn')

  # Fail fast (before the expensive split/vectorization) instead of silently
  # training an unbalanced baseline under a misleading run name.
  if imbalance_method not in class_weight_methods + resampling_methods:
    raise ValueError(
      f"Unknown imbalance_method {imbalance_method!r}; expected one of "
      f"{sorted(class_weight_methods + resampling_methods)}"
    )

  ngram_range = (1,3) # unigrams up to trigrams
  max_features = 10000

  # S2 :- split data (stratified so both splits keep the class proportions)
  X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size = 0.2,
    random_state = 42,
    stratify=df['category'],
  )

  # S3 :- vectorization using TF-IDF
  vectorizer = TfidfVectorizer(ngram_range=ngram_range,max_features=max_features)
  # fit on the training split only, then apply the same vocabulary to the test split
  X_train_vec = vectorizer.fit_transform(X_train)
  X_test_vec = vectorizer.transform(X_test)

  # S4 :- Handle class imbalance
  if imbalance_method in class_weight_methods:
    # let the RandomForest re-weight the classes; training data is left untouched
    class_weight = 'balanced'
  else:
    class_weight = None # Don't use class_weight if using resampling

    samplers = {
      'oversampling': SMOTE,
      'adasyn': ADASYN,
      'undersampling': RandomUnderSampler,
      'smote_enn': SMOTEENN,
    }
    sampler = samplers[imbalance_method](random_state=42)
    # resample the training split only; the test split keeps its original distribution
    X_train_vec, y_train = sampler.fit_resample(X_train_vec, y_train)

  # S5 :- Define & train RF model
  with mlflow.start_run():
    # set tags for experiment & run
    mlflow.set_tag("mlflow.runName",f"Imbalance_{imbalance_method}_RandomForest_TFIDF_Trigrams")
    # tag value spelling kept as-is so new runs stay filterable together with existing ones
    mlflow.set_tag("experiment_type","imabalance_handling")
    mlflow.set_tag("model_type","RandomForestClassifier")

    # Add description
    mlflow.set_tag("description",f"RandomForest with TF-IDF Trigrams, imbalance handling method = {imbalance_method}")

    # Log vectorizer params
    mlflow.log_param("vectorizer_type","TF-IDF")
    mlflow.log_param("ngram_range",ngram_range)
    mlflow.log_param("vectorizer max feature",max_features)

    # Log RF params
    n_estimators = 200
    max_depth = 15

    mlflow.log_param("n_estimators",n_estimators)
    mlflow.log_param("max_depth",max_depth)
    mlflow.log_param("imbalance method",imbalance_method)

    # init & train model
    model = RandomForestClassifier(n_estimators = n_estimators,max_depth = max_depth, random_state=42 , class_weight=class_weight)
    model.fit(X_train_vec,y_train)

    # make prediction
    y_pred = model.predict(X_test_vec)

    # log accuracy
    accuracy = accuracy_score(y_test,y_pred)
    mlflow.log_metric("accuracy",accuracy)

    # log classification report: only the dict-valued entries are logged,
    # one metric per (label, metric) pair; scalar entries are skipped here
    classification_rep = classification_report(y_test,y_pred,output_dict=True)
    for label,metrics in classification_rep.items():
      if isinstance(metrics,dict):
        for metric,value in metrics.items():
          mlflow.log_metric(f"{label}_{metric}",value)

    # log confusion matrix as an image artifact
    conf_mat = confusion_matrix(y_test,y_pred)
    fig = plt.figure(figsize=(8,6))
    sns.heatmap(conf_mat,annot=True,fmt="d",cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"confusion_matrix: TF-IDF Trigrams , Imbalance={imbalance_method}")
    confusion_matrix_filename = f"confusion_matrix_{imbalance_method}.png"
    plt.savefig(confusion_matrix_filename)
    # close before uploading so the figure is released even if the upload fails
    plt.close(fig)
    mlflow.log_artifact(confusion_matrix_filename)

    # Log model
    mlflow.sklearn.log_model(model,f"random_forest_model_tfidf_trigrams_imbalance_{imbalance_method}")

# Imbalance-handling strategies to compare; each name is handed to
# run_imbalanced_experiment in turn, in the order listed here.
imbalance_methods = [
  'class_weight',
  'oversampling',
  'undersampling',
  'adasyn',
  'smote_enn',
]

for method in imbalance_methods:
  run_imbalanced_experiment(method)



🏃 View run Imbalance_class_weight_RandomForest_TFIDF_Trigrams at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917/runs/13a0b082a7b241348730de542bbb7657
🧪 View experiment at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917




🏃 View run Imbalance_oversampling_RandomForest_TFIDF_Trigrams at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917/runs/31f0ab8261d340369a96f1058f4bbc80
🧪 View experiment at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917




🏃 View run Imbalance_undersampling_RandomForest_TFIDF_Trigrams at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917/runs/4be39b3429384ae9b6ff05b3101ce50f
🧪 View experiment at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917




🏃 View run Imbalance_adasyn_RandomForest_TFIDF_Trigrams at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917/runs/99adcad5417742b7b1646d37165afc73
🧪 View experiment at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917




🏃 View run Imbalance_smote_enn_RandomForest_TFIDF_Trigrams at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917/runs/436355db6416499c81857a3516658830
🧪 View experiment at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/335851565471750917
