### TRYING WITH SVM

In [1]:
!pip install mlflow boto3 awscli optuna imbalanced-learn

Collecting mlflow
  Downloading mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.35.64-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Downloading awscli-1.36.5-py3-none-any.whl.metadata (11 kB)
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting mlflow-skinny==2.18.0 (from mlflow)
  Downloading mlflow_skinny-2.18.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.18.0->mlflow)
  Downloading databricks_sdk-0.38.0-py3-none-any.whl.metadata (38 kB)
Col

In [2]:
!aws configure

AWS Access Key ID [None]: AKIA2L2SHHQ2FY4SIFJT
AWS Secret Access Key [None]: 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
Default region name [None]: 
Default output format [None]: 


In [3]:
# set mlflow tracking server
import mlflow
mlflow.set_tracking_uri("YOUR TRACKING URI")

In [4]:
# set/create experiment
mlflow.set_experiment("EXP 5 - ML Algo with HP tuning")

<Experiment: artifact_location='s3://interview-mlflow-bucket/851239149715883194', creation_time=1732038163076, experiment_id='851239149715883194', last_update_time=1732038163076, lifecycle_stage='active', name='EXP 5 - ML Algo with HP tuning', tags={}>

In [5]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
import optuna

In [6]:
df = pd.read_csv('/content/reddit_preprocessed.csv').dropna()
df.shape

(36750, 2)

In [None]:
# S1 -> remapping doesn't needed for SVM
# S2 -> remove nan
df = df.dropna(subset=['category'])
# S3 -> TF-IDF vectorizer
ngram_range = (1,3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range=ngram_range , max_features = max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

# S4 -> apply smote
smote = SMOTE(random_state=42)
X_resampled,y_resampled = smote.fit_resample(X,y)

# S5 -> split dt
X_train , X_test , y_train , y_test = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=42,stratify=y_resampled)

# fucn to log result in mlflow
def log_mlflow(model_name,model,X_train,X_test,y_train,y_test):
  with mlflow.start_run():
    # log model type
    mlflow.set_tag("mlflow.runName",f"{model_name}_Smote_TFIDF_Trigrams")
    mlflow.set_tag("experiment_type","algorithm_comparision")

    #log algo name as param
    mlflow.log_param("algo_name",model_name)

    # train model
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    # log accuracy
    accuracy = accuracy_score(y_test,y_pred)
    mlflow.log_metric("accurcay",accuracy)

    # log classification report
    classification_rep = classification_report(y_test,y_pred,output_dict=True)
    for label,metrics in classification_rep.items():
      if isinstance(metrics,dict):
        for metric,value in metrics.items():
          mlflow.log_metric(f"{label}_{metric}",value)

    # log model
    mlflow.sklearn.log_model(model,f"{model_name}_model")


# S6 -> Optuna objective function for SVM
def objective_svm(trial):

  C = trial.suggest_float('C',1e-4,1e-1,log=True)
  kernel = trial.suggest_categorical('kernel',['linear','rbf','poly'])

  model = SVC(C=C,kernel=kernel,random_state=42)
  return accuracy_score(y_test,model.fit(X_train,y_train).predict(X_test))

# S7 -> Run optuna for svm ,log the best model only
def run_optuna_experiment():
  study = optuna.create_study(direction="maximize")
  study.optimize(objective_svm,n_trials=8)

  # get the best params and log the best model only
  best_params = study.best_params
  best_model = SVC(C=best_params['C'],kernel=best_params['kernel'],random_state=42)

  # log the best model with mlflow pass the algo name as svm
  log_mlflow("SVC",best_model,X_train,X_test,y_train,y_test)

run_optuna_experiment()

[I 2024-11-19 18:55:10,827] A new study created in memory with name: no-name-be994627-e1ae-4b8c-b563-dd989f009770
[I 2024-11-19 18:58:37,548] Trial 0 finished with value: 0.398858592263792 and parameters: {'C': 0.000740879907999297, 'kernel': 'rbf'}. Best is trial 0 with value: 0.398858592263792.
[I 2024-11-19 19:01:45,624] Trial 1 finished with value: 0.36355950116254493 and parameters: {'C': 0.00014255571331028093, 'kernel': 'poly'}. Best is trial 0 with value: 0.398858592263792.
[I 2024-11-19 19:05:02,648] Trial 2 finished with value: 0.41365461847389556 and parameters: {'C': 0.006366031176507922, 'kernel': 'rbf'}. Best is trial 2 with value: 0.41365461847389556.
[I 2024-11-19 19:08:08,044] Trial 3 finished with value: 0.4466286197421264 and parameters: {'C': 0.03186477918627452, 'kernel': 'poly'}. Best is trial 3 with value: 0.4466286197421264.
[I 2024-11-19 19:11:37,788] Trial 4 finished with value: 0.398858592263792 and parameters: {'C': 0.00042540307066820196, 'kernel': 'rbf'}. 