## Random Forest Classifier With Pipeline And Hyperparameter Tuning

In [2]:
import seaborn as sns
# Load seaborn's "tips" example dataset and preview its first rows.
df = sns.load_dataset(name='tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Distinct values present in the `day` column.
df.loc[:, 'day'].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [4]:
# Distinct values present in the `time` column (the column used as the target later on).
df.loc[:, 'time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [5]:
# Distinct values present in the `tip` column.
df.loc[:, 'tip'].unique()

array([ 1.01,  1.66,  3.5 ,  3.31,  3.61,  4.71,  2.  ,  3.12,  1.96,
        3.23,  1.71,  5.  ,  1.57,  3.  ,  3.02,  3.92,  1.67,  3.71,
        3.35,  4.08,  2.75,  2.23,  7.58,  3.18,  2.34,  4.3 ,  1.45,
        2.5 ,  2.45,  3.27,  3.6 ,  3.07,  2.31,  2.24,  2.54,  3.06,
        1.32,  5.6 ,  6.  ,  2.05,  2.6 ,  5.2 ,  1.56,  4.34,  3.51,
        1.5 ,  1.76,  6.73,  3.21,  1.98,  3.76,  2.64,  3.15,  2.47,
        1.  ,  2.01,  2.09,  1.97,  3.14,  2.2 ,  1.25,  3.08,  4.  ,
        2.71,  3.4 ,  1.83,  2.03,  5.17,  5.85,  3.25,  4.73,  3.48,
        1.64,  4.06,  4.29,  2.55,  5.07,  1.8 ,  2.92,  1.68,  2.52,
        4.2 ,  1.48,  2.18,  2.83,  6.7 ,  2.3 ,  1.36,  1.63,  1.73,
        2.74,  5.14,  3.75,  2.61,  4.5 ,  1.61, 10.  ,  3.16,  5.15,
        3.11,  3.55,  3.68,  5.65,  6.5 ,  4.19,  2.56,  2.02,  1.44,
        3.41,  5.16,  9.  ,  1.1 ,  3.09,  1.92,  1.58,  2.72,  2.88,
        3.39,  1.47,  1.17,  4.67,  5.92,  1.75])

In [6]:
# Per-column count of missing values.
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder
# Replace the `time` column with integer class labels.
# NOTE: the column is overwritten in place, so re-running this cell fits the
# encoder on the already-encoded values rather than on the original labels.
encoder = LabelEncoder()
encoder.fit(df['time'])
df['time'] = encoder.transform(df['time'])

In [8]:
# Inspect the last rows after `time` has been label-encoded.
df.tail(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.0,Female,Yes,Sat,0,2
241,22.67,2.0,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2
243,18.78,3.0,Female,No,Thur,0,2


In [9]:
# Separate the features (every column except `time`) from the target (`time`).
X = df.drop(columns=['time'])
y = df['time']

In [10]:
# Preview the first target values.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: time, dtype: int64

In [11]:
# Number of rows per `day` category in the feature frame.
X.loc[:, 'day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer   # Handle missing values
from sklearn.preprocessing import StandardScaler # Feature scaling
from sklearn.preprocessing import OneHotEncoder # Categorical to numerical
from sklearn.compose import ColumnTransformer

In [14]:
# Column groups that are routed to the categorical and numerical sub-pipelines.
categorical_col = ["sex", "smoker", "day"]
numerical_col = ["total_bill", "tip", "size"]

In [15]:
# Feature-engineering automation: one sub-pipeline per column type.

# Numerical pipeline: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Missing values
        ('scaler', StandardScaler()),  # Feature scaling
    ]
)

# Categorical pipeline: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising when it
# meets a category that was absent from the data the encoder was fitted on; such
# rows get all zeros for that feature's one-hot columns instead.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [16]:
# Route each column group through its matching sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_col),
        ('cat_pipeline', cat_pipeline, categorical_col),
    ]
)

In [17]:
# Display the assembled ColumnTransformer so its structure can be inspected.
preprocessor

In [18]:
# Fit the preprocessing steps on the training split only, then apply the fitted
# transformers to the test split.
# NOTE(review): both names are rebound to the transformed output, so this cell
# presumably cannot be re-run without first re-running the train/test split — verify.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [20]:
## Model Training Automation
# Candidate classifiers keyed by display name. The forest and the tree use
# randomness while fitting, so their seed is fixed to make the reported scores
# repeatable from run to run.
models = {
    'Random Forest': RandomForestClassifier(random_state=3),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=3),
}

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
def EvaluateModels(X_train, y_train, X_test, y_test, models):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to each model's ``fit``.
    X_test, y_test
        Held-out features and labels. ``X_test`` is passed to ``predict`` and
        the predictions are compared against ``y_test`` with ``accuracy_score``.
    models : dict
        Mapping of display name -> estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        Mapping of model name -> test-set accuracy, in the iteration order of
        ``models``. Empty when ``models`` is empty.

    Notes
    -----
    The estimators in ``models`` are fitted in place, so the caller's objects
    are trained once this function returns.
    """
    report = {}
    # Walk the name/estimator pairs directly instead of rebuilding
    # list(models.keys()) and list(models.values()) on every iteration.
    for name, model in models.items():
        # Train model
        model.fit(X_train, y_train)

        # Predict testing data
        y_test_pred = model.predict(X_test)

        # Accuracy of the test-data predictions
        report[name] = accuracy_score(y_test, y_test_pred)

    return report

In [23]:
# Train and score every candidate model; the resulting report is the cell output.
EvaluateModels(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
)

{'Random Forest': 0.9591836734693877,
 'Logistic Regression': 0.9591836734693877,
 'Decision Tree': 0.9591836734693877}

In [24]:
# Base estimator for the hyperparameter search; seeded so that repeated runs
# build the same forests.
classfier = RandomForestClassifier(random_state=3)

In [25]:
## Hyperparameter Tuning
# Search space for the random forest: tree depth, number of trees, split criterion.
params = {
    'max_depth': [3, 5, 10, None],
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
}

In [26]:
from sklearn.model_selection import RandomizedSearchCV

In [27]:
# Randomized search over `params`, scored by accuracy with 5-fold cross-validation.
# random_state pins which parameter combinations are sampled, so the search
# is repeatable from run to run.
cv = RandomizedSearchCV(
    classfier,
    param_distributions=params,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=3,
)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[CV 1/5] END criterion=gini, max_depth=10, n_estimators=300;, score=0.974 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=10, n_estimators=300;, score=0.872 total time=   0.4s
[CV 3/5] END criterion=gini, max_depth=10, n_estimators=300;, score=0.974 total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=10, n_estimators=300;, score=1.000 total time=   0.3s
[CV 5/5] END criterion=gini, max_depth=10, n_estimators=300;, score=0.949 total time=   0.4s
[CV 1/5] END criterion=gini, max_depth=None, n_estimators=300;, score=0.974 total time=   0.3s
[CV 2/5] END criterion=gini, max_depth=None, n_estimators=300;, score=0.897 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=None, n_estimators=300;, score=0.974 total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=None, n_estimators=300;, score=1.000 total time=   0.3s
[CV 5/5] END criterion=gini, max_depth=None, n_estimators=300;, score=0.949 total time=   0.3s
[CV 1/5] END criterion=entropy, max_depth=3, n_estimators=20

In [28]:
# Parameter combination selected by the randomized search as the best one.
cv.best_params_

{'n_estimators': 200, 'max_depth': 3, 'criterion': 'entropy'}

## Random Forest Regression: Solve It as an Internal Assignment