In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import ExtraTreesClassifier
import pickle
import os
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

In [11]:
# Location of the match CSV. The default is the original hard-coded local
# path; set the LP_PATCHNOTE_CSV environment variable to load the file from
# somewhere else without editing the notebook.
csv_file_path = os.environ.get(
    "LP_PATCHNOTE_CSV",
    "/Users/hwanghyejeong/Documents/boaz/lp-patchnote/user_1_match_10.csv",
)
df = pd.read_csv(csv_file_path)

In [12]:
# Use one encoder per column. Refitting a single LabelEncoder on 'puuid'
# overwrites the classes learned for 'role', so the role -> integer mapping
# could no longer be re-applied to new data or inverted for inspection.
role_encoder = LabelEncoder()
df['role'] = role_encoder.fit_transform(df['role'])

puuid_encoder = LabelEncoder()
df['puuid'] = puuid_encoder.fit_transform(df['puuid'])

# Backward compatibility: `encoder` previously ended this cell fitted on
# 'puuid', so keep that name bound to the same fitted state.
encoder = puuid_encoder

In [14]:
# Target is the 'win' column; the feature matrix is every other column
# except 'match_id' and 'puuid'.
y = df['win']
X = df.drop(columns=['win', 'match_id', 'puuid'])

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics are used in fitting.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [43]:
# Hyperparameter grid handed to the grid search. Only the number of trees is
# searched; the axes below are currently disabled and kept for reference:
#   max_depth:         [None, 10, 20, 30]
#   min_samples_split: [2, 5, 10]
#   min_samples_leaf:  [1, 2, 4]
#   bootstrap:         [True, False]
param_grid = dict(n_estimators=[100, 200, 300])

In [38]:
# Honour the standard MLFLOW_TRACKING_URI environment variable so the notebook
# can target a different tracking server without edits; when it is unset, fall
# back to the originally hard-coded server.
mlflow.set_tracking_uri(
    uri=os.environ.get("MLFLOW_TRACKING_URI", "http://13.209.9.231:5000")
)

In [49]:
with mlflow.start_run():
    # Search the hyperparameter grid with 3-fold cross-validation on the
    # training split, using all available cores.
    model = ExtraTreesClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Best parameters from GridSearchCV
    best_params = grid_search.best_params_
    print("Best parameters found: ", best_params)

    # Log best parameters
    mlflow.log_params(best_params)

    # GridSearchCV defaults to refit=True, so best_estimator_ has already been
    # refit on the whole training split with the winning parameters. Fitting
    # it a second time would only repeat that work.
    best_model = grid_search.best_estimator_

    # Predict on the held-out split; the model signature is inferred from the
    # training inputs and the model's predictions on them.
    y_pred = best_model.predict(X_test)
    signature = infer_signature(X_train, best_model.predict(X_train))

    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(cr)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy)

    # Log and register the fitted model.
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="extra-forest",
        signature=signature,
        # Store only a few rows as the input example: passing the full
        # training matrix would copy the whole training set into the model
        # artifact.
        input_example=X_train[:5],
        registered_model_name="extra-test",
    )

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ...................................n_estimators=100; total time=   0.1s
[CV] END ...................................n_estimators=100; total time=   0.1s
[CV] END ...................................n_estimators=100; total time=   0.0s
[CV] END ...................................n_estimators=200; total time=   0.1s
[CV] END ...................................n_estimators=200; total time=   0.1s
[CV] END ...................................n_estimators=200; total time=   0.1s
[CV] END ...................................n_estimators=300; total time=   0.1s
[CV] END ...................................n_estimators=300; total time=   0.1s
[CV] END ...................................n_estimators=300; total time=   0.1s
Best parameters found:  {'n_estimators': 100}
Accuracy: 0.8695652173913043
Confusion Matrix:
[[ 9  2]
 [ 1 11]]
Classification Report:
              precision    recall  f1-score   support

       False       0.9

Successfully registered model 'extra-test'.
2024/06/11 16:33:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: extra-test, version 1
Created version '1' of model 'extra-test'.
