In [41]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,QuantileTransformer,StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report, roc_auc_score, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
def nan_imputer(df):

    # Numerical
    num_col_list = df.select_dtypes(include=[np.number]).columns
    si_num = SimpleImputer()

    # Categorical
    cat_col_list = df.select_dtypes(exclude=[np.number]).columns
    si_cat = SimpleImputer(strategy='most_frequent')

    df[num_col_list] = si_num.fit_transform(df[num_col_list])
    df[cat_col_list] = si_cat.fit_transform(df[cat_col_list])
    
    return df

In [3]:
def encoder(df):
    cat_col_list = df.select_dtypes(exclude=[np.number]).columns
    df[cat_col_list]= df[cat_col_list].apply(LabelEncoder().fit_transform)
    
    return df
    

In [4]:
def split_data(df, dep_variable, indep_variable, test_size=0.2):
    X = df[indep_variable]
    y = df[dep_variable]

    X_train, X_test, y_train,  y_test = train_test_split(X, y, random_state=42, test_size=test_size, stratify=y)
    return X_train, X_test, y_train, y_test 


In [42]:
def build_model(df, model):
    numeric_features = df.select_dtypes(include=[np.number]).columns
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='median')),
               ('scaler', StandardScaler())]
    )

    categorical_features = df.select_dtypes(exclude=[np.number]).columns
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features),
                     ('cat', categorical_transformer, categorical_features)
                     ]
    )

    clf = Pipeline(
        steps=[
            ('preprocessor',preprocessor),
               ("classifier",  model)
              ]
    )
    
    return clf
    

In [43]:
df = sns.load_dataset('penguins')
# df = nan_imputer(df)

# df = encoder(df)
# df.head()
dep_variable = ['species']
indep_variable =[ 'island', 'bill_length_mm', 'bill_depth_mm','flipper_length_mm', 'body_mass_g', 'sex']
X_train, X_test, y_train,  y_test = split_data(df, dep_variable, indep_variable, test_size=0.2)

In [71]:
trained_model = build_model(X_train, RandomForestClassifier())
trained_model.fit(X_train, y_train)
y_pred = trained_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        30
   Chinstrap       1.00      1.00      1.00        14
      Gentoo       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [75]:
pdX_test.iloc[0]

pandas.core.series.Series

In [72]:
X_test

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
151,Dream,41.5,18.5,201.0,4000.0,Male
1,Torgersen,39.5,17.4,186.0,3800.0,Female
294,Biscoe,46.4,15.0,216.0,4700.0,Female
337,Biscoe,48.8,16.2,222.0,6000.0,Male
158,Dream,46.1,18.2,178.0,3250.0,Female
...,...,...,...,...,...,...
164,Dream,47.0,17.3,185.0,3700.0,Female
236,Biscoe,42.0,13.5,210.0,4150.0,Female
52,Biscoe,35.0,17.9,190.0,3450.0,Female
121,Torgersen,37.7,19.8,198.0,3500.0,Male


In [66]:
pd.DataFrame([['Dream', 33.1, 16.1, 178, 2900, 'Male']], columns=X_train.columns).iloc[0]

island               Dream
bill_length_mm        33.1
bill_depth_mm         16.1
flipper_length_mm      178
body_mass_g           2900
sex                   Male
Name: 0, dtype: object

In [63]:
pd.DataFrame([['Dream', 33.1, 16.1, 178, 2900, 'Male']])

Unnamed: 0,0,1,2,3,4,5
0,Dream,33.1,16.1,178,2900,Male


In [59]:
X_test.iloc[0]

island                Dream
bill_length_mm         41.5
bill_depth_mm          18.5
flipper_length_mm     201.0
body_mass_g          4000.0
sex                    Male
Name: 151, dtype: object

In [56]:
X_train.head()

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
98,Dream,33.1,16.1,178.0,2900.0,Female
114,Biscoe,39.6,20.7,191.0,3900.0,Female
118,Torgersen,35.7,17.0,189.0,3350.0,Female
303,Biscoe,50.0,15.9,224.0,5350.0,Male
343,Biscoe,49.9,16.1,213.0,5400.0,Male


In [50]:
y_pred

array(['Adelie', 'Adelie', 'Gentoo', 'Gentoo', 'Chinstrap', 'Adelie',
       'Chinstrap', 'Gentoo', 'Chinstrap', 'Chinstrap', 'Adelie',
       'Gentoo', 'Chinstrap', 'Gentoo', 'Gentoo', 'Gentoo', 'Adelie',
       'Adelie', 'Gentoo', 'Gentoo', 'Chinstrap', 'Gentoo', 'Gentoo',
       'Adelie', 'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Gentoo',
       'Gentoo', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Gentoo', 'Gentoo', 'Adelie', 'Adelie', 'Gentoo', 'Chinstrap',
       'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Chinstrap', 'Adelie',
       'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Adelie', 'Chinstrap',
       'Adelie', 'Gentoo', 'Chinstrap', 'Adelie', 'Chinstrap', 'Adelie',
       'Gentoo', 'Gentoo', 'Chinstrap', 'Gentoo', 'Gentoo', 'Chinstrap',
       'Gentoo', 'Adelie', 'Adelie', 'Gentoo'], dtype=object)

In [16]:
models_list = [RandomForestClassifier(), DecisionTreeClassifier(), SVC(), LogisticRegression()]

In [17]:
for model in models_list:
    trained_model = build_model(X_train, y_train, model)
    y_pred = trained_model.predict(X_test)
    print(f'model = {model}:')
    print(classification_report(y_test, y_pred))
    print('-'*50)

model = RandomForestClassifier():
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

--------------------------------------------------
model = DecisionTreeClassifier():
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

--------------------------------------------------
model = SVC():
              precision    recall  f1-score   support

In [27]:
df['bill_length_mm'].mode()[0]

41.1

In [32]:
type(float(df['bill_length_mm'].min()))

float

In [29]:
df['bill_length_mm'].max()

59.6