###### TRYING RANDOM FOREST

In [1]:
!pip install mlflow boto3 awscli optuna imbalanced-learn

Collecting mlflow
  Downloading mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.35.65-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Downloading awscli-1.36.6-py3-none-any.whl.metadata (11 kB)
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting mlflow-skinny==2.18.0 (from mlflow)
  Downloading mlflow_skinny-2.18.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.18.0->mlflow)
  Downloading databricks_sdk-0.38.0-py3-none-any.whl.metadata (38 kB)
Col

In [1]:
# !aws configure

In [3]:
import mlflow
# S2 :- Set up the MLflow tracking server
mlflow.set_tracking_uri("http://ec2-54-205-141-180.compute-1.amazonaws.com:5000/")

In [4]:
# Set or Create an experiment
mlflow.set_experiment("EXP 5 - ML Algo with HP tuning")

<Experiment: artifact_location='s3://interview-mlflow-bucket/851239149715883194', creation_time=1732038163076, experiment_id='851239149715883194', last_update_time=1732038163076, lifecycle_stage='active', name='EXP 5 - ML Algo with HP tuning', tags={}>

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
df = pd.read_csv("/content/reddit_preprocessed.csv").dropna()
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [10]:
# S1 :- Remapping (Skipping it)

# S2 :- drop nan in category
df = df.dropna(subset=['category'])

# S3 :- set/create vectorizer
ngram_range = (1,3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range=ngram_range,max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

# S4 :-
smote = SMOTE(random_state=42)
X_resampled,y_resampled = smote.fit_resample(X,y)

# S5 :- split the data
X_train,X_test ,y_train,y_test = train_test_split(X_resampled,y_resampled, test_size = 0.2,random_state = 42, stratify=y_resampled)

# S6 :- Func to log result in mlflow
def log_mlflow(model_name,model,X_trian,X_test,y_train,y_test):
  # log the model type
  mlflow.set_tag("mlflow.runName",f"{model_name}_SMOTE_TFIDF_Trigrams")
  mlflow.set_tag("experiment_type","alogorithm comparision")

  # log model name as a param
  mlflow.log_param("algo_name",model_name)

  # train model
  model.fit(X_train,y_train)
  y_pred = model.predict(X_test)

  # Log accuracy
  accuracy = accuracy_score(y_test,y_pred)
  mlflow.log_metric("accuracy",accuracy)

  # Log Classification report
  classification_rep = classification_report(y_test,y_pred,output_dict=True)
  for label,metrics in classification_rep.items():
    if isinstance(metrics,dict):
      for metric,value in metrics.items():
        mlflow.log_metric(f"{label}_{metric}",value)

  # log the model
  mlflow.sklearn.log_model(model,f"{model_name}_model")

# S6 :- optuna objective function for Random Forest
def objective_rf(trial):
  n_estimators = trial.suggest_int("n_estimators",50,300)
  max_depth = trial.suggest_int("max_depth",3,20)
  min_samples_split = trial.suggest_int("min_samples_split",2,20)
  min_samples_leaf = trial.suggest_int("min_samples_leaf",1,20)

  # RF setup
  model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,min_samples_leaf=min_samples_leaf,min_samples_split=min_samples_split,random_state=42)

  return accuracy_score(y_test,model.fit(X_train,y_train).predict(X_test))

def run_optuna_experiment():
  study = optuna.create_study(direction="maximize")
  study.optimize(objective_rf,n_trials=5)

  # Get the best params & log the best model
  best_params = study.best_params
  best_model = RandomForestClassifier(n_estimators = best_params["n_estimators"],
                                      max_depth = best_params["max_depth"],
                                      min_samples_split = best_params["min_samples_split"],
                                      min_samples_leaf = best_params["min_samples_leaf"],
                                      random_state=42)
  log_mlflow("RandomForest",best_model,X_train,X_test,y_train,y_test)

# run experiment
run_optuna_experiment()

[I 2024-11-20 07:27:12,272] A new study created in memory with name: no-name-cfcf4b92-03af-4b31-bcc9-63ca981b799d
[I 2024-11-20 07:27:14,522] Trial 0 finished with value: 0.6495455506235468 and parameters: {'n_estimators': 79, 'max_depth': 8, 'min_samples_split': 14, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.6495455506235468.
[I 2024-11-20 07:27:31,804] Trial 1 finished with value: 0.7108433734939759 and parameters: {'n_estimators': 295, 'max_depth': 20, 'min_samples_split': 13, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.7108433734939759.
[I 2024-11-20 07:27:39,229] Trial 2 finished with value: 0.6959416613823716 and parameters: {'n_estimators': 190, 'max_depth': 17, 'min_samples_split': 8, 'min_samples_leaf': 7}. Best is trial 1 with value: 0.7108433734939759.
[I 2024-11-20 07:27:41,614] Trial 3 finished with value: 0.6516592686535616 and parameters: {'n_estimators': 87, 'max_depth': 8, 'min_samples_split': 14, 'min_samples_leaf': 9}. Best is trial 1 with value: