In [41]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,QuantileTransformer,StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report, roc_auc_score, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
def nan_imputer(df):

    # Numerical
    num_col_list = df.select_dtypes(include=[np.number]).columns
    si_num = SimpleImputer()

    # Categorical
    cat_col_list = df.select_dtypes(exclude=[np.number]).columns
    si_cat = SimpleImputer(strategy='most_frequent')

    df[num_col_list] = si_num.fit_transform(df[num_col_list])
    df[cat_col_list] = si_cat.fit_transform(df[cat_col_list])
    
    return df

In [3]:
def encoder(df):
    cat_col_list = df.select_dtypes(exclude=[np.number]).columns
    df[cat_col_list]= df[cat_col_list].apply(LabelEncoder().fit_transform)
    
    return df
    

In [4]:
def split_data(df, dep_variable, indep_variable, test_size=0.2):
    X = df[indep_variable]
    y = df[dep_variable]

    X_train, X_test, y_train,  y_test = train_test_split(X, y, random_state=42, test_size=test_size, stratify=y)
    return X_train, X_test, y_train, y_test 


In [42]:
def build_model(df, model):
    numeric_features = df.select_dtypes(include=[np.number]).columns
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='median')),
               ('scaler', StandardScaler())]
    )

    categorical_features = df.select_dtypes(exclude=[np.number]).columns
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features),
                     ('cat', categorical_transformer, categorical_features)
                     ]
    )

    clf = Pipeline(
        steps=[
            ('preprocessor',preprocessor),
               ("classifier",  model)
              ]
    )
    
    return clf
    

In [43]:
df = sns.load_dataset('penguins')
dep_variable = ['species']
indep_variable =[ 'island', 'bill_length_mm', 'bill_depth_mm','flipper_length_mm', 'body_mass_g', 'sex']
X_train, X_test, y_train,  y_test = split_data(df, dep_variable, indep_variable, test_size=0.2)

In [91]:
trained_model = build_model(X_train, RandomForestClassifier())
trained_model.fit(X_train, y_train)
y_pred = trained_model.predict(pd.DataFrame([['Dream', 41.5, 18.5, 201.0, 4000.0, 'Male']], columns=X_train.columns))
y_pred

array(['Adelie'], dtype=object)

In [94]:
df.sample(10)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
290,Gentoo,Biscoe,47.7,15.0,216.0,4750.0,Female
84,Adelie,Dream,37.3,17.8,191.0,3350.0,Female
325,Gentoo,Biscoe,46.8,16.1,215.0,5500.0,Male
265,Gentoo,Biscoe,43.6,13.9,217.0,4900.0,Female
26,Adelie,Biscoe,40.6,18.6,183.0,3550.0,Male
23,Adelie,Biscoe,38.2,18.1,185.0,3950.0,Male
255,Gentoo,Biscoe,48.4,16.3,220.0,5400.0,Male
187,Chinstrap,Dream,47.5,16.8,199.0,3900.0,Female
191,Chinstrap,Dream,53.5,19.9,205.0,4500.0,Male
73,Adelie,Torgersen,45.8,18.9,197.0,4150.0,Male


In [16]:
models_list = [RandomForestClassifier(), DecisionTreeClassifier(), SVC(), LogisticRegression()]

In [17]:
for model in models_list:
    trained_model = build_model(X_train, y_train, model)
    y_pred = trained_model.predict(X_test)
    print(f'model = {model}:')
    print(classification_report(y_test, y_pred))
    print('-'*50)

model = RandomForestClassifier():
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

--------------------------------------------------
model = DecisionTreeClassifier():
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

--------------------------------------------------
model = SVC():
              precision    recall  f1-score   support

In [27]:
df['bill_length_mm'].mode()[0]

41.1

In [32]:
type(float(df['bill_length_mm'].min()))

float

In [29]:
df['bill_length_mm'].max()

59.6