In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from hyperopt import fmin, tpe, hp # hyper parameter tuning

In [5]:
# read data from csv
def read_csv(file_path):
    return pd.read_csv(file_path)

# creating feature
def create_feature(data):
    return data

# Training a classifier model
def train_classifier(data):
    x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state=45, stratify=y)
    model= RandomForestClassifier()
    model.fit(x_train, y_train)
    y_pred= model.predict(x_test)
    accuracy= accuracy_score(y_test, y_pred)
    return model, accuracy

# Hyper parameter tuning
def objective(param):
    model= RandomForestClassifier(**param)
    score= cross_val_score(model, x, y, cv=5).mean()
    return score

# Evaluating model on the test set
def evaluate_model(model, x_test, y_test):
    y_pred= model.predict(x_test)
    accuracy= accuracy_score(y_test, y_pred)
    return accuracy

In [6]:
file_path = "./Iris.csv"
data= read_csv(file_path)
data

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [10]:
# if __name__ == "__main__"
# Load data
file_path= "./Iris.csv"
data= read_csv(file_path)

#create_feature(data)
data= create_feature(data)

# split data to feature and target
x= data.drop('Species', axis=1)
y= data['Species']

# split the data into training and test
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state=45, stratify=y)

# Define the pipeline
pipeline= Pipeline([
    ("preprocessor", ColumnTransformer(transformers=[('num', StandardScaler(), x.columns)], remainder='passthrough')),
    ("classifier", RandomForestClassifier())
])

pipeline.fit(x_train, y_train)
# Evaluate the model

y_pred= pipeline.predict(x_test)
accuracy= accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy}")

#Hyper parameter tuning using tree of parzen estimation (tpe)

space = {'n_estimators' :hp.choice('n_estimators', range(10, 101)),
        'max_depth' :hp.choice('max_depth', range(1, 21))
        }
best_param= fmin(fn =objective, space=space, algo=tpe.suggest, max_evals=100)

Model accuracy on test set: 1.0
100%|██████████| 100/100 [00:40<00:00,  2.46trial/s, best loss: 0.9]              


## Serialization and Deserialization

In [None]:
# Serialization
import joblib
joblib.dump(pipeline, "trained_model")

In [None]:
# Deserialization
final_model= joblib.load("trained_model")

In [7]:
# Packaging and modeling
