In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Load the raw mushroom data; the path is relative to the notebook's working directory.
df = pd.read_csv("data/mushrooms.csv")

In [3]:
# Encode the target column as integers: "p" -> 1, "e" -> 0
# (presumably poisonous / edible -- confirm against the dataset description).
class_mapping = {"p": 1, "e": 0}

# Fail before touching `df` if the column holds anything outside the mapping.
# This also covers re-running the cell after the column is already encoded,
# which would otherwise silently turn every value into NaN.
unexpected_labels = set(df["class"].unique()) - set(class_mapping)
if unexpected_labels:
    raise KeyError(
        f"Unexpected values in 'class': {sorted(map(str, unexpected_labels))}"
    )

# The mapping is built once and applied vectorised, instead of constructing a
# fresh dict for every row inside a lambda.
df["class"] = df["class"].map(class_mapping)

In [5]:
# Inspect the target column after encoding ("p" -> 1, "e" -> 0).
df["class"]


0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: class, Length: 8124, dtype: int64

In [6]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the split repeatable.
train_set, test_set = train_test_split(df, test_size = 0.3, random_state = 10)

In [7]:
# List every column name so the feature columns can be enumerated explicitly below.
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [8]:
# Feature columns handed to the preprocessing pipeline: every column of the
# frame except the "class" target, in their original order.
lab_cols = [
    # cap
    "cap-shape",
    "cap-surface",
    "cap-color",
    # general
    "bruises",
    "odor",
    # gill
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    # stalk
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    # veil / ring
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    # remaining descriptors
    "spore-print-color",
    "population",
    "habitat",
]

In [9]:
# Feature pipeline: integer-encode each categorical column, then project the
# encoded columns onto 10 principal components.
# NOTE: the variable keeps its historical name `target_pipeline`, but it is
# applied to the feature columns in `lab_cols`, not to the target.

target_pipeline = Pipeline(
    steps=[
        (
            "ordinal_encoder",
            # A category that never appeared while fitting is encoded as -1
            # instead of raising when the test split is transformed.
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
        # random_state only matters for PCA's randomized/arpack solvers; it is
        # pinned so repeated runs give the same components if one is selected.
        ("pca", PCA(n_components=10, random_state=10)),
    ]
)

# Route the feature columns through the pipeline; columns not listed in
# `lab_cols` are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ("lab_pipeline", target_pipeline, lab_cols)
    ]
)


In [10]:
# Name of the label column that is separated from the features below.
target_column = "class"

In [11]:
# Columns removed from the feature frames -- only the target itself.
drop_columns = [target_column]

In [12]:
# Training split: pull the label out first, then keep everything else as features.
# (`columns=` already implies the column axis, so no `axis` argument is needed.)
target_feature_train_df = train_set[target_column]
input_feature_train_df = train_set.drop(columns=drop_columns)

In [13]:
# Test split: pull the label out first, then keep everything else as features.
# (`columns=` already implies the column axis, so no `axis` argument is needed.)
target_feature_test_df = test_set[target_column]
input_feature_test_df = test_set.drop(columns=drop_columns)

In [14]:
# Fit the preprocessor on the training features only, then apply that same
# fitted transformation to the test features (the test split is never fitted on).
input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)
input_feature_test_arr = preprocessor.transform(input_feature_test_df)

In [15]:
# Append the label as the last column of each transformed feature matrix, so a
# single array carries features ([:, :-1]) and target ([:, -1]).
train_arr = np.column_stack(
    (input_feature_train_arr, target_feature_train_df.to_numpy())
)
test_arr = np.column_stack(
    (input_feature_test_arr, target_feature_test_df.to_numpy())
)


In [16]:
# Inspect the transformed training features returned by the preprocessor.
input_feature_train_arr

array([[ 6.40005874,  0.71737581,  0.77474753, ..., -0.85749707,
         0.25743174, -0.21809242],
       [-5.10338118, -3.87298142,  2.50305722, ..., -0.93225889,
        -2.61325944,  1.13597538],
       [-0.6933122 , -1.93861363,  0.13861589, ...,  0.98916004,
        -0.42351017,  1.81023608],
       ...,
       [-0.45774016,  3.15935309,  5.95314147, ..., -1.3060209 ,
        -2.49596369, -1.61180861],
       [-2.42778227, -5.5042774 , -2.17134532, ..., -1.26049523,
         0.64643389,  0.35123841],
       [-0.55061669,  1.99372197, -3.88513224, ..., -1.92486099,
         0.87545126, -1.54869207]])

In [17]:
# Inspect the training labels; the index still refers to row positions in the original frame.
target_feature_train_df

6506    1
2314    0
2898    0
1178    0
2341    0
       ..
3441    1
1344    0
4623    1
7293    0
1289    0
Name: class, Length: 5686, dtype: int64

In [18]:
# Inspect the combined training array: transformed features with the label as the last column.
train_arr

array([[ 6.40005874,  0.71737581,  0.77474753, ...,  0.25743174,
        -0.21809242,  1.        ],
       [-5.10338118, -3.87298142,  2.50305722, ..., -2.61325944,
         1.13597538,  0.        ],
       [-0.6933122 , -1.93861363,  0.13861589, ..., -0.42351017,
         1.81023608,  0.        ],
       ...,
       [-0.45774016,  3.15935309,  5.95314147, ..., -2.49596369,
        -1.61180861,  1.        ],
       [-2.42778227, -5.5042774 , -2.17134532, ...,  0.64643389,
         0.35123841,  0.        ],
       [-0.55061669,  1.99372197, -3.88513224, ...,  0.87545126,
        -1.54869207,  0.        ]])

In [19]:
# Inspect the combined test array: transformed features with the label as the last column.
test_arr

array([[-1.52960887,  3.01150973, -3.09623674, ..., -0.31633862,
        -1.1045539 ,  0.        ],
       [-2.11216056, -1.76907499, -2.01350501, ..., -1.23054355,
         0.23710009,  1.        ],
       [-0.62662671,  2.04344835, -2.98526961, ...,  0.34822784,
        -1.26227765,  0.        ],
       ...,
       [ 2.27413708,  0.13396777, -1.67821525, ..., -0.66244189,
         0.69143697,  1.        ],
       [-0.97865513, -1.21011499,  4.3710846 , ..., -0.24964597,
        -1.18054038,  0.        ],
       [-5.75455549,  0.70522095, -4.61199863, ..., -0.07557396,
        -0.0329173 ,  0.        ]])

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
def evaluate_model(X_train, y_train, X_test, y_test, models):
    """Fit each candidate model and score it on the held-out data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to ``model.fit``.
    X_test, y_test
        Held-out features (passed to ``model.predict``) and the true labels
        the predictions are compared against.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.

    Returns
    -------
    dict
        ``{name: accuracy}`` for every entry of ``models``, in the same order.
        An empty ``models`` mapping yields an empty report.
    """
    report = {}

    # Iterate name/estimator pairs directly rather than rebuilding
    # list(models.keys()) / list(models.values()) on every iteration.
    for name, model in models.items():
        model.fit(X_train, y_train)

        # Predict on the held-out features and score against the true labels.
        y_test_pred = model.predict(X_test)
        report[name] = accuracy_score(y_test, y_test_pred)

    return report

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import logging

In [23]:
def initiate_model_training(self, train_array, test_array):
    """Compare a fixed set of classifiers and print the most accurate one.

    Parameters
    ----------
    self
        Unused. It is kept only so existing positional calls keep working;
        this is a module-level function, not a method.
    train_array, test_array
        2-D arrays whose last column is the target and whose remaining
        columns are the features.

    Returns
    -------
    None
        The per-model report and the winner are printed and logged.
    """
    logging.info("Splitting dependent & Independent data from train & test data")
    X_train, y_train = train_array[:, :-1], train_array[:, -1]
    X_test, y_test = test_array[:, :-1], test_array[:, -1]

    models = {
        "LogisticRegression": LogisticRegression(),
        "LogisticRegressionCV": LogisticRegressionCV(),
        "KNN": KNeighborsClassifier(),
        "Decision Tree": DecisionTreeClassifier(),
        "SVC": SVC(),
        "RandomForest": RandomForestClassifier(),
        "GradientBoosting": GradientBoostingClassifier(),
    }

    model_report: dict = evaluate_model(X_train, y_train, X_test, y_test, models)
    print(model_report)

    print("\n**********")
    logging.info(f"Model Report : {model_report}")

    # Pick the best-scoring entry; on a tie the first model in `models` wins.
    best_model_name = max(model_report, key=model_report.get)
    best_model_score = model_report[best_model_name]

    print(f"Best Model is {best_model_name} with accuracy : {best_model_score}")
    print("\n*****")

    logging.info(f"Best Model is {best_model_name} with accuracy : {best_model_score}")

In [24]:
import os, sys
from dataclasses import dataclass

In [25]:
@dataclass
class ModelTrainerConfig:
    """Configuration values for the model-training step."""

    # Path intended for the serialized model. The annotation makes this a real
    # dataclass field (overridable via the constructor); nothing in this cell
    # writes to the path yet.
    trained_model_file_path: str = os.path.join("artifacts", "model.pkl")


class ModelTrainer:
    """Compares a fixed set of classifiers and reports the most accurate one."""

    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    @staticmethod
    def initiate_model_training(train_array, test_array):
        """Score every candidate model and print/log the best one.

        Declared as a static method so it can be called both as
        ``ModelTrainer.initiate_model_training(...)`` and on an instance; it
        does not read any instance state.

        Parameters
        ----------
        train_array, test_array
            2-D arrays whose last column is the target and whose remaining
            columns are the features.

        Returns
        -------
        None
            Results are printed and logged. Any exception raised while
            training is logged with its traceback and printed, not re-raised.
        """
        try:
            logging.info("Splitting dependent & Independent data from train & test data")
            X_train, y_train = train_array[:, :-1], train_array[:, -1]
            X_test, y_test = test_array[:, :-1], test_array[:, -1]

            models = {
                "LogisticRegression": LogisticRegression(),
                "LogisticRegressionCV": LogisticRegressionCV(),
                "KNN": KNeighborsClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "SVC": SVC(),
                "RandomForest": RandomForestClassifier(),
                "GradientBoosting": GradientBoostingClassifier(),
            }

            model_report: dict = evaluate_model(X_train, y_train, X_test, y_test, models)
            print(model_report)

            print("\n**********")
            logging.info(f"Model Report : {model_report}")

            # Pick the best-scoring entry; on a tie the first model in `models` wins.
            best_model_name = max(model_report, key=model_report.get)
            best_model_score = model_report[best_model_name]

            print(f"Best Model is {best_model_name} with accuracy : {best_model_score}")
            print("\n*****")

            logging.info(f"Best Model is {best_model_name} with accuracy : {best_model_score}")

        except Exception as e:
            # Keep the best-effort behaviour (no re-raise), but record the
            # failure at ERROR level with its traceback so it is not hidden
            # behind the default INFO filtering.
            logging.exception("Exception occur at Model Training")
            print(f"{e}")

In [26]:
# Run the model comparison on the prepared arrays; invoked on the class itself, no instance is created.
ModelTrainer.initiate_model_training(train_arr,test_arr)

{'LogisticRegression': 0.8240360951599672, 'LogisticRegressionCV': 0.8232157506152584, 'KNN': 0.9844134536505332, 'Decision Tree': 0.9848236259228876, 'SVC': 0.9827727645611156, 'RandomForest': 0.994667760459393, 'GradientBoosting': 0.9552912223133716}

**********
Best Model is RandomForest with accuracy : 0.994667760459393

*****
