### TRYING WITH MultnomialNB

In [1]:
!pip install mlflow boto3 awscli optuna imbalanced-learn

Collecting mlflow
  Downloading mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.35.64-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Downloading awscli-1.36.5-py3-none-any.whl.metadata (11 kB)
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting mlflow-skinny==2.18.0 (from mlflow)
  Downloading mlflow_skinny-2.18.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.18.0->mlflow)
  Downloading databricks_sdk-0.38.0-py3-none-any.whl.metadata (38 kB)
Col

In [2]:
!aws configure

AWS Access Key ID [None]: AKIA2L2SHHQ2FY4SIFJT
AWS Secret Access Key [None]: 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
Default region name [None]: 
Default output format [None]: 


In [3]:
import mlflow
# set tracking server
mlflow.set_tracking_uri("http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/")

In [4]:
# set/create an experiment
mlflow.set_experiment("EXP 5 - ML Algo with HP tuning")

<Experiment: artifact_location='s3://interview-mlflow-bucket/851239149715883194', creation_time=1732038163076, experiment_id='851239149715883194', last_update_time=1732038163076, lifecycle_stage='active', name='EXP 5 - ML Algo with HP tuning', tags={}>

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn
import optuna

In [6]:
df = pd.read_csv('/content/reddit_preprocessed.csv').dropna()
df.shape

(36750, 2)

In [11]:
from math import log
# S1 :- Remapping is not needed since -1 category doesn't needed to be handled with MNB
# S2 :- removing nan from the target
df = df.dropna(subset = ['category'])

# S3 :- TFIDF Vectorizer
ngram_range = (1,3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range=ngram_range,max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

# S4 :- apply smote to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled,y_resampled = smote.fit_resample(X,y)

# S5 :- split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state = 42, stratify=y_resampled)

# Function to log model in Mlflow
def log_mlflow(model_name,model,X_train,X_test,y_train,y_test):
  with mlflow.start_run():
    # log the model type
    mlflow.set_tag("mlflow.runName",f"{model_name}_SMOTE_TFIDF_Trigrams")
    mlflow.set_tag("experiment_type","algorith_comparision")

    # Log algo name as a param
    mlflow.log_param("algo_name",model_name)

    # train model
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    # Log accuracy
    accuracy = accuracy_score(y_test,y_pred)
    mlflow.log_metric("accuracy",accuracy)

    # Log classification report

    classification_rep = classification_report(y_test,y_pred,output_dict=True)
    for label,metrics in classification_rep.items():
      if isinstance(metrics,dict):
        for metric,value in metrics.items():
          mlflow.log_metric(f"{label}_{metric}",value)

    # Log the model
    mlflow.sklearn.log_model(model,f"{model_name}_model")


# S6 :- Optuna objective function for MNB
def objective_mnb(trial):
  alpha = trial.suggest_float('alpha',1e-4,1.0,log=True) # tuning the soothing params

  # MultinomialNB model setup
  model = MultinomialNB(alpha=alpha)
  return accuracy_score(y_test,model.fit(X_train,y_train).predict(X_test))

# S7 :- Run optuna for MNB , log the best model
def run_optuna_experiment():
  study = optuna.create_study(direction="maximize")
  study.optimize(objective_mnb,n_trials=10)

  # get the best params & log only best model
  best_params = study.best_params
  best_model = MultinomialNB(alpha=best_params['alpha'])

  # log the best model with mlflow passing the alog name as "MultinomialNB"
  log_mlflow("MultinomialNB",best_model,X_train,X_test,y_train,y_test)

# Run the experiment for MNB
run_optuna_experiment()

[I 2024-11-19 19:27:21,828] A new study created in memory with name: no-name-2298a8b3-16ef-4515-b5b6-1bcbbe580101
[I 2024-11-19 19:27:21,854] Trial 0 finished with value: 0.6752272246882266 and parameters: {'alpha': 0.0739707328160093}. Best is trial 0 with value: 0.6752272246882266.
[I 2024-11-19 19:27:21,876] Trial 1 finished with value: 0.6735362502642147 and parameters: {'alpha': 0.1818416485610561}. Best is trial 0 with value: 0.6752272246882266.
[I 2024-11-19 19:27:21,896] Trial 2 finished with value: 0.6761783978017333 and parameters: {'alpha': 0.00024172599175603716}. Best is trial 2 with value: 0.6761783978017333.
[I 2024-11-19 19:27:21,915] Trial 3 finished with value: 0.6714225322342 and parameters: {'alpha': 0.7099200536478598}. Best is trial 2 with value: 0.6761783978017333.
[I 2024-11-19 19:27:21,933] Trial 4 finished with value: 0.6730078207567111 and parameters: {'alpha': 0.25451091374374785}. Best is trial 2 with value: 0.6761783978017333.
[I 2024-11-19 19:27:21,953] T

🏃 View run MultinomialNB_SMOTE_TFIDF_Trigrams at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/851239149715883194/runs/7691816f8b7a46a39883f7d617ddcf88
🧪 View experiment at: http://ec2-35-174-3-91.compute-1.amazonaws.com:5000/#/experiments/851239149715883194
