In [6]:
from pathlib import Path
import pandas as pd
import sys
sys.path.append('..')
import warnings
warnings.filterwarnings('ignore')

In [7]:
from datetime import datetime
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
pd.set_option("display.max_columns", None)

In [11]:
DATA_DIR = Path(r'../data/')
MODEL_DIR=Path(r'../models/')
df_master= pd.read_csv(DATA_DIR/"train.csv",index_col='ID')

In [12]:
from joblib import dump,load
from sklearn.ensemble import RandomForestClassifier
from src.app.preprocess import preprocess
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from pathlib import Path
DATA_DIR = Path(r'../data/')
PREDICTION_DIR = Path(r'../data/Predictions/')
MODEL_DIR=Path(r'../models/')
df_master= pd.read_csv(DATA_DIR/"train.csv",index_col='ID')

In [24]:
def build_model(data: pd.DataFrame):
    target=data['y'].apply(lambda x : 1 if x == 'yes' else 0)
    train = data.drop(['y'], axis=1)
    processedtrain= preprocess(train, MODEL_DIR)
    x_train , x_test , y_train , y_test = train_test_split(processedtrain,target,test_size=0.2)
    rfc = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
    rfc.fit(x_train, y_train)
    dump(rfc,MODEL_DIR/'RFC.joblib')
    predict = rfc.predict(x_test)
    print(f"The accuracy score is : {accuracy_score(y_test, predict) * 100}%")
    print (classification_report(y_test,predict))
    return   

In [25]:
build_model(df_master)

The accuracy score is : 77.03962703962704%
              precision    recall  f1-score   support

           0       0.79      0.90      0.84      1759
           1       0.70      0.48      0.57       815

    accuracy                           0.77      2574
   macro avg       0.74      0.69      0.71      2574
weighted avg       0.76      0.77      0.76      2574



In [26]:
def inference(data: pd.DataFrame,max_depth):
    target=data['y'].apply(lambda x : 1 if x == 'yes' else 0)
    train = data.drop(['y'], axis=1)
    processedtrain= preprocess(train, MODEL_DIR)
    x_train , x_test , y_train , y_test = train_test_split(processedtrain,target,test_size=0.2)
    
    rfc = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1,max_depth=max_depth)
    rfc.fit(x_train, y_train)
    y_pred = rfc.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"Test mse = {mse}, Test RMSE = {rmse}, Random forest max depth = {max_depth}")
    return rfc, mse, rmse

In [27]:
_=inference(df_master,2)

Test mse = 0.2591297591297591, Test RMSE = 0.5090478947307012, Random forest max depth = 2


In [28]:
for max_depth in range(2, 7, 2):
    _ = inference(df_master, max_depth=max_depth)

Test mse = 0.26573426573426573, Test RMSE = 0.5154941956358633, Random forest max depth = 2
Test mse = 0.2463092463092463, Test RMSE = 0.4962955231605926, Random forest max depth = 4
Test mse = 0.2498057498057498, Test RMSE = 0.4998057120579454, Random forest max depth = 6


# MLFLOW

In [29]:
randomforest_exp = "prediction_with_random_forest"
mlflow.set_experiment(randomforest_exp)

2022/07/31 21:58:25 INFO mlflow.tracking.fluent: Experiment with name 'prediction_with_random_forest' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///D:/Repositories/AI-project-methodology/MLFLOW/mlruns/1', experiment_id='1', lifecycle_stage='active', name='prediction_with_random_forest', tags={}>

In [32]:
def train_model(data: pd.DataFrame,max_depth):
    with mlflow.start_run():
        
        
        target=data['y'].apply(lambda x : 1 if x == 'yes' else 0)
        train = data.drop(['y'], axis=1)
        processedtrain= preprocess(train, MODEL_DIR)
        x_train , x_test , y_train , y_test = train_test_split(processedtrain,target,test_size=0.2)
    
        rfc = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1,max_depth=max_depth)
        rfc.fit(x_train, y_train)
        mlflow.log_param("max_depth", max_depth)
        mlflow.sklearn.log_model(rfc, "model")
        y_pred = rfc.predict(x_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_metrics({"testing_mse": mse, "testing_rmse": rmse})
        print(f"Test mse = {mse}, Test RMSE = {rmse}, Random forest max depth = {max_depth}")

In [33]:
for max_depth in range(2, 7, 2):
    _ = train_model(df_master, max_depth=max_depth)

Test mse = 0.24708624708624707, Test RMSE = 0.49707770729157336, Random forest max depth = 2
Test mse = 0.2556332556332556, Test RMSE = 0.5056018746338423, Random forest max depth = 4
Test mse = 0.24048174048174048, Test RMSE = 0.4903893763956765, Random forest max depth = 6


In [None]:
mlflow.get_experiment_by_name(randomforest_exp)

In [34]:
experiment_id = mlflow.get_experiment_by_name(randomforest_exp).experiment_id
experiment_id


'1'

In [35]:
mlflow.search_runs(experiment_id)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.testing_rmse,metrics.testing_mse,params.max_depth,tags.mlflow.source.name,tags.mlflow.source.git.commit,tags.mlflow.user,tags.mlflow.log-model.history,tags.mlflow.source.type
0,6ca9338e6dbb448083a4d0d7e84563b7,1,FINISHED,file:///D:/Repositories/AI-project-methodology...,2022-07-31 20:00:11.925000+00:00,2022-07-31 20:00:31.946000+00:00,0.490389,0.240482,6.0,D:\Repositories\AI-project-methodology\env\lib...,212e29448b4448ffe71d9305b4a81e47cbbfe23a,RAVITEJA,"[{""run_id"": ""6ca9338e6dbb448083a4d0d7e84563b7""...",LOCAL
1,ebbd4accf565429d8b0671b51155c50a,1,FINISHED,file:///D:/Repositories/AI-project-methodology...,2022-07-31 19:59:52.517000+00:00,2022-07-31 20:00:11.907000+00:00,0.505602,0.255633,4.0,D:\Repositories\AI-project-methodology\env\lib...,212e29448b4448ffe71d9305b4a81e47cbbfe23a,RAVITEJA,"[{""run_id"": ""ebbd4accf565429d8b0671b51155c50a""...",LOCAL
2,0fea235f0654471492ce809ad19d6549,1,FINISHED,file:///D:/Repositories/AI-project-methodology...,2022-07-31 19:59:33.780000+00:00,2022-07-31 19:59:52.483000+00:00,0.497078,0.247086,2.0,D:\Repositories\AI-project-methodology\env\lib...,212e29448b4448ffe71d9305b4a81e47cbbfe23a,RAVITEJA,"[{""run_id"": ""0fea235f0654471492ce809ad19d6549""...",LOCAL
3,8c848b2657f94f938a1b4b131f3a3055,1,FAILED,file:///D:/Repositories/AI-project-methodology...,2022-07-31 19:59:07.051000+00:00,2022-07-31 19:59:07.074000+00:00,,,,D:\Repositories\AI-project-methodology\env\lib...,212e29448b4448ffe71d9305b4a81e47cbbfe23a,RAVITEJA,,LOCAL


In [36]:
max_depth = 4
mlflow.search_runs(
    experiment_id,
    filter_string=f"params.max_depth = '{max_depth}' AND metrics.testing_mse <= 40",
    order_by=['metrics.testing_mse asc']
)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.testing_rmse,metrics.testing_mse,params.max_depth,tags.mlflow.source.name,tags.mlflow.source.git.commit,tags.mlflow.user,tags.mlflow.log-model.history,tags.mlflow.source.type
0,ebbd4accf565429d8b0671b51155c50a,1,FINISHED,file:///D:/Repositories/AI-project-methodology...,2022-07-31 19:59:52.517000+00:00,2022-07-31 20:00:11.907000+00:00,0.505602,0.255633,4,D:\Repositories\AI-project-methodology\env\lib...,212e29448b4448ffe71d9305b4a81e47cbbfe23a,RAVITEJA,"[{""run_id"": ""ebbd4accf565429d8b0671b51155c50a""...",LOCAL


In [37]:
run = mlflow.search_runs(
    experiment_id,
    filter_string=f"params.max_depth = '{max_depth}' AND metrics.testing_mse <= 30",
    order_by=["metrics.testing_mse asc"]
).iloc[0]
run

run_id                                            ebbd4accf565429d8b0671b51155c50a
experiment_id                                                                    1
status                                                                    FINISHED
artifact_uri                     file:///D:/Repositories/AI-project-methodology...
start_time                                        2022-07-31 19:59:52.517000+00:00
end_time                                          2022-07-31 20:00:11.907000+00:00
metrics.testing_rmse                                                      0.505602
metrics.testing_mse                                                       0.255633
params.max_depth                                                                 4
tags.mlflow.source.name          D:\Repositories\AI-project-methodology\env\lib...
tags.mlflow.source.git.commit             212e29448b4448ffe71d9305b4a81e47cbbfe23a
tags.mlflow.user                                                          RAVITEJA
tags

In [38]:
run.artifact_uri

'file:///D:/Repositories/AI-project-methodology/MLFLOW/mlruns/1/ebbd4accf565429d8b0671b51155c50a/artifacts'

In [39]:
model = mlflow.sklearn.load_model(model_uri=f"{run.artifact_uri}/model")
model

In [40]:
target = df_master['y']
train = df_master.drop(['y'], axis=1)
processedtrain= preprocess(train, MODEL_DIR)

In [41]:
model.predict(processedtrain)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)