In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')

In [4]:
df = sns.load_dataset('tips')

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# Predict what s the time, if its lunch or dinner>> time is a target variable

In [13]:
df.time.unique()

array([0, 1])

# EDA 

In [14]:
#EDA . Subjective
# Encoding, missing value treatment, scalling>> automate

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    int32   
 6   size        244 non-null    int64   
dtypes: category(3), float64(2), int32(1), int64(1)
memory usage: 8.0 KB


# Since time is a nomial variable, we will use label encoder

In [15]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['time']= encoder.fit_transform(df['time'])
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [16]:
df.time.unique() # Dinner is Converted to 1, n Lunch = 0

array([0, 1], dtype=int64)

In [17]:
X = df.drop('time', axis = 1)
y = df['time']

In [18]:
X

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.50,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4
...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,3
240,27.18,2.00,Female,Yes,Sat,2
241,22.67,2.00,Male,Yes,Sat,2
242,17.82,1.75,Male,No,Sat,2


In [19]:
y

0      0
1      0
2      0
3      0
4      0
      ..
239    0
240    0
241    0
242    0
243    0
Name: time, Length: 244, dtype: int64

# Train Test Split

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.20, random_state=1)

In [24]:
X_train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,size
131,20.27,2.83,Female,No,Thur,2
231,15.69,3.0,Male,Yes,Sat,3
30,9.55,1.45,Male,No,Sat,2
121,13.42,1.68,Female,No,Thur,2
175,32.9,3.11,Male,Yes,Sun,2


In [25]:
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

# Handling the missing values
# Data Encoding
# Feature Scaling 

In [34]:
from sklearn.impute import SimpleImputer # For Missing Values
from sklearn.preprocessing import OneHotEncoder # For Encoding
from sklearn.preprocessing import StandardScaler # For Scaling 

from sklearn.pipeline import Pipeline # Sequence of Data Transformers in orders 
from sklearn.compose import ColumnTransformer # group all the steps for specific features

In [35]:
cat_cols=['sex', 'smoker', 'day']
num_cols=['total_bill', 'tip', 'size']

# Feature Engineering using pipeline and columntransformer

In [40]:
num_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler())
])

In [41]:
cat_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore'))
])

In [47]:
preprocessor = ColumnTransformer([('num_pipeline', num_pipeline, num_cols),
                  ('cat_pipeline', cat_pipeline, cat_cols)])

In [48]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [49]:
X_train

array([[ 0.04894847, -0.10187543, -0.59339083,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.45720545,  0.01455363,  0.3560345 ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-1.13576114, -1.04700552, -0.59339083,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.70807213, -0.88948384, -0.59339083,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.44473975,  0.08989009, -0.59339083,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-0.27817284,  0.35699207, -0.59339083,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.   

In [50]:
X_test

array([[-1.85189158, -1.35520011, -1.54281616, ...,  1.        ,
         0.        ,  0.        ],
       [-0.11571732,  0.01455363, -0.59339083, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.74739668,  0.29535315,  0.3560345 , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 2.08903611,  0.01455363,  1.30545982, ...,  1.        ,
         0.        ,  0.        ],
       [-0.19749765,  0.01455363, -0.59339083, ...,  0.        ,
         1.        ,  0.        ],
       [-1.36120961, -0.67032324, -0.59339083, ...,  0.        ,
         0.        ,  1.        ]])

# Random Forest Classifier

In [95]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn. linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [96]:
models = {'support vector classifier': SVC(),
          'DT classifier': DecisionTreeClassifier(),
         'logistic regression': LogisticRegression(),
          'Random_Forest': RandomForestClassifier()}

In [97]:
from sklearn.metrics import accuracy_score

def model_train_eval(X_train, y_train, X_test, y_test, models):
    evaluation = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_score = accuracy_score(y_test, y_pred)
        evaluation[name] = model_score
    return evaluation

In [98]:
model_train_eval(X_train, y_train, X_test, y_test, models)

{'support vector classifier': 0.9387755102040817,
 'DT classifier': 0.9591836734693877,
 'logistic regression': 0.9591836734693877,
 'Random_Forest': 0.9591836734693877}

In [99]:
from sklearn.ensemble import RandomForestClassifier

In [100]:
rf = RandomForestClassifier()

In [101]:
rf

In [102]:
X_train, X_test

(array([[ 0.04894847, -0.10187543, -0.59339083,  1.        ,  0.        ,
          1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          1.        ],
        [-0.45720545,  0.01455363,  0.3560345 ,  0.        ,  1.        ,
          0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
          0.        ],
        [-1.13576114, -1.04700552, -0.59339083,  0.        ,  1.        ,
          1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
          0.        ],
        [-0.70807213, -0.88948384, -0.59339083,  1.        ,  0.        ,
          1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          1.        ],
        [ 1.44473975,  0.08989009, -0.59339083,  0.        ,  1.        ,
          0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
          0.        ],
        [-0.27817284,  0.35699207, -0.59339083,  1.        ,  0.        ,
          1.        ,  0.        ,  0.        ,  0.        ,  1.       

In [103]:
from sklearn.model_selection import RandomizedSearchCV
params = {'max_depth':[1,2,3,5,10,None],
         'n_estimators':[50,100,200,300],
         'criterion': ['gini', 'entropy']}

In [104]:
params

{'max_depth': [1, 2, 3, 5, 10, None],
 'n_estimators': [50, 100, 200, 300],
 'criterion': ['gini', 'entropy']}

In [105]:
clf = RandomizedSearchCV(rf, param_distributions = params, cv = 5, verbose = 3, scoring = 'accuracy')

In [106]:
clf

In [107]:
clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=5, n_estimators=50;, score=1.000 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=5, n_estimators=50;, score=1.000 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=5, n_estimators=50;, score=1.000 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=5, n_estimators=50;, score=1.000 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=5, n_estimators=50;, score=1.000 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=1.000 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=1.000 total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=1.000 total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=1.000 total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=1.000 total time=   0.0s
[CV 1/5] END c

In [108]:
clf.best_params_

{'n_estimators': 50, 'max_depth': 5, 'criterion': 'gini'}

In [109]:
clf.best_score_

1.0

# OOB Score 

In [112]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Generate synthetic dataset
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)

# Fit the classifier
rf_classifier.fit(X, y)

# Get the out-of-bag score
oob_score = rf_classifier.oob_score_
print('Out-of-Bag Score:', oob_score)

Out-of-Bag Score: 0.895


# Random Forest Regressor 

In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [116]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
# Predict what the time ? 
df.time.unique()