In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# filter warnings
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any emitted by the libraries used below - consider a narrower filter.
warnings.filterwarnings('ignore')

In [2]:
# Load seaborn's bundled "tips" example dataset and display it.
df = sns.load_dataset("tips")
df


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
df.time.unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

## Pipeline

Encoding, scaling, and missing-value treatment

In [4]:
from sklearn.preprocessing import LabelEncoder
# Encode the target column `time` as integers, overwriting the column on df.
# LabelEncoder numbers the sorted class labels, so 'Dinner' -> 0 and 'Lunch' -> 1.
encoder = LabelEncoder()
df["time"] = encoder.fit_transform(df["time"])

In [5]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [6]:
# target is to predict time

In [7]:
df.time.unique() # dinner is 0 and lunch is 1

array([0, 1])

In [8]:
# Features: every column except the encoded target.
X =  df.drop('time',axis=1)
# Target: the integer-encoded `time` column (0 = Dinner, 1 = Lunch).
y = df["time"]

In [9]:
X

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.50,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4
...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,3
240,27.18,2.00,Female,Yes,Sat,2
241,22.67,2.00,Male,Yes,Sat,2
242,17.82,1.75,Male,No,Sat,2


In [10]:
y

0      0
1      0
2      0
3      0
4      0
      ..
239    0
240    0
241    0
242    0
243    0
Name: time, Length: 244, dtype: int32

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified on y, so the Dinner/Lunch ratio may
# differ between train and test - consider stratify=y if that matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [12]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

## Pipeline
### handling missing values
### data encoding
###  feature scaling 

In [13]:
from sklearn.impute import SimpleImputer # for missing values
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Pipeline is a sequence of data transformation
# column transformer >> groups all the pipeline steps for each of the column

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [14]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [15]:
# Feature columns grouped by the preprocessing they need: the string-valued
# columns are one-hot encoded, the numeric columns are scaled.
categorical_columns = ["sex","smoker","day"]
numerical_columns = ["total_bill","tip","size"]

In [16]:
# feature engineering automation using pipeline and column transformer

# below pipeline is for numerical columns:
# fill missing values with the column median, then standardise each column
numerical_pipeline = Pipeline(steps=[('imputation',SimpleImputer(strategy="median")),
                                     ('scaling',StandardScaler())])

# below pipeline is for categorical columns:
# fill missing values with the most frequent category, then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero row instead of raising when the fitted pipeline transforms new data.
categorical_pipeline = Pipeline(steps=[('imputation',SimpleImputer(strategy="most_frequent")),
                                     ('encoding',OneHotEncoder(handle_unknown="ignore"))])

In [17]:
# Route each column group through its own pipeline and concatenate the results:
# the numerical outputs come first, followed by the one-hot encoded columns.
preprocessor = ColumnTransformer([("num_pipeline",numerical_pipeline,numerical_columns),
                   ("cat_pipeline",categorical_pipeline,categorical_columns)])

In [18]:
# Learn the imputation / scaling / encoding parameters from the training split
# only, and apply them to it.
# NOTE: this rebinds X_train from the raw DataFrame to the transformed array, so
# re-run the train/test split cell before re-running this one (the transformer
# selects columns by name).
X_train = preprocessor.fit_transform(X_train)

In [19]:
# Apply the already-fitted preprocessing to the test split (transform only, no
# re-fitting on test data). This rebinds X_test to the transformed array.
X_test = preprocessor.transform(X_test)

In [20]:
X_train

array([[ 3.27957576,  2.93016025,  1.4480456 , ...,  1.        ,
         0.        ,  0.        ],
       [ 2.82704615, -0.34414084,  0.43500958, ...,  1.        ,
         0.        ,  0.        ],
       [-0.86956223, -1.01757865, -0.57802643, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.23123331,  0.42992561, -0.57802643, ...,  0.        ,
         0.        ,  1.        ],
       [-1.07473939, -1.3117239 , -0.57802643, ...,  1.        ,
         0.        ,  0.        ],
       [-0.29278646,  0.09707704,  0.43500958, ...,  1.        ,
         0.        ,  0.        ]])

In [21]:
X_test

array([[-1.87265054, -1.50524051, -1.59106245,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.08190994,  0.04289239, -0.57802643,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.80833093,  0.36025963,  0.43500958,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.59713257, -0.34414084, -0.57802643,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.18937985,  0.04289239, -0.57802643,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-1.34032981, -1.16465127, -0.57802643,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.   

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


# Candidate classifiers, all with default hyperparameters, keyed by the short
# name used to label each score in the evaluation results.
models = {"svc":SVC(),
          "dt_clf":DecisionTreeClassifier(),
          "log_clf":LogisticRegression()}

In [23]:
models.values()

dict_values([SVC(), DecisionTreeClassifier(), LogisticRegression()])

In [24]:
models.keys()

dict_keys(['svc', 'dt_clf', 'log_clf'])

In [25]:
from sklearn.metrics import accuracy_score

def model_train_eval(X_train, y_train, X_test, y_test, models):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to each model's ``fit``.
    X_test, y_test
        Held-out features and labels; predictions on ``X_test`` are compared
        against ``y_test``.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        ``{name: accuracy}`` with one entry per model, in the iteration order
        of ``models``.

    Notes
    -----
    The estimators in ``models`` are fitted in place.
    """
    evaluation = {}
    # Walk the name/estimator pairs directly rather than rebuilding
    # list(models.keys()) and list(models.values()) on every iteration.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        evaluation[name] = accuracy_score(y_test, y_pred)
    return evaluation


In [26]:
model_train_eval(X_train, y_train, X_test, y_test, models)

{'svc': 0.9324324324324325,
 'dt_clf': 0.9054054054054054,
 'log_clf': 0.9324324324324325}

#### Random Forest Classifier

In [27]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the bootstrap samples and feature sub-sampling - and hence
# the reported accuracy - are reproducible between runs (same seed as the
# train/test split).
rf = RandomForestClassifier(random_state=1)

In [28]:
# Train the forest on the preprocessed training split.
rf.fit(X_train,y_train)

In [29]:
# Predict the encoded `time` label for the preprocessed test split.
y_pred = rf.predict(X_test)

In [30]:
accuracy_score(y_test,y_pred)

0.9324324324324325

#### Hyperparameter tuning (grid search)

In [31]:
from sklearn.model_selection import GridSearchCV

# Search space for the random forest: 8 depths x 7 forest sizes x 2 split
# criteria = 112 parameter combinations.
params = {'max_depth':[1,2,4,6,8,10,20,30],
          'n_estimators':[50,100,150,200,250, 400,1000],
          'criterion':['gini','entropy']}

Fitting 5 folds for each of 112 candidates (8 `max_depth` values × 7 `n_estimators` values × 2 `criterion` values), totalling 560 fits (112 candidates × 5 folds).

In [33]:
# 5-fold cross-validated exhaustive search over `params`. The forest is seeded
# so candidate scores (and therefore best_params_ / best_score_) are
# reproducible between runs; verbose=3 logs the score of every individual fit.
grid_cv= GridSearchCV(RandomForestClassifier(random_state=1), param_grid= params, cv=5, verbose=3)

In [34]:
# Run the search on the training split: 112 candidates x 5 folds = 560 fits.
grid_cv.fit(X_train,y_train)

Fitting 5 folds for each of 112 candidates, totalling 560 fits
[CV 1/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.941 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.824 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, n_estimators=50;, score=1.000 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.912 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1, n_estimators=50;, score=1.000 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.941 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.971 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, n_estimators=100;, score=1.000 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, n_estimators=100;, score=1.000 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.853 total time=   0.0s
[CV 1/5] END criterion

In [35]:
grid_cv.best_params_

{'criterion': 'gini', 'max_depth': 1, 'n_estimators': 150}

In [36]:
grid_cv.best_score_

0.9823529411764707