<a href="https://colab.research.google.com/github/Yosingh1/Machine_Learning/blob/main/Randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [45]:
# Load seaborn's bundled "tips" example dataset as a DataFrame.
data = sns.load_dataset(name="tips")

In [46]:
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [47]:
# Count missing values per column (isna is the canonical spelling of isnull).
data.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [48]:

# Encoder used to turn the string-valued "time" column into integer class labels.
encoder= LabelEncoder()


In [49]:
# Overwrite the "time" column with integer codes; this column is later used as the target.
# NOTE(review): LabelEncoder assigns codes in sorted class order, so presumably
# Dinner -> 0 and Lunch -> 1 — confirm via encoder.classes_.
# This mutates `data` in place, so any earlier display of the frame no longer matches it.
data["time"]=encoder.fit_transform(data["time"])

In [50]:
# Feature frame: every column except the target "time".
x = data.drop(columns=["time"])

In [51]:
x

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.50,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4
...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,3
240,27.18,2.00,Female,Yes,Sat,2
241,22.67,2.00,Male,Yes,Sat,2
242,17.82,1.75,Male,No,Sat,2


In [52]:
# Target vector: the "time" column of `data`.
y=data["time"]

In [53]:
y

0      0
1      0
2      0
3      0
4      0
      ..
239    0
240    0
241    0
242    0
243    0
Name: time, Length: 244, dtype: int64

In [54]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

In [55]:
# Column groups: categorical features and numeric features.
cat_column = [
    "sex",
    "smoker",
    "day",
]
num_column = [
    "total_bill",
    "tip",
    "size",
]

In [56]:
# Numeric features: fill gaps with the column median, then standardise.
num_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)

# Categorical features: fill gaps with the most frequent level, then one-hot encode.
# handle_unknown="ignore" makes transform() emit all zeros for a level that was not
# present during fit instead of raising, so a rare category that lands only in the
# held-out rows cannot crash the transform step.
# The step name keeps its original spelling ("encder") so named_steps lookups still work.
cat_pipeline= Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("encder",OneHotEncoder(handle_unknown="ignore"))
    ]
)

In [57]:
# Apply each pipeline to its own column group; outputs are concatenated in the
# order listed (numeric block first, then the one-hot block).
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, num_column),
        ("cat_pipeline", cat_pipeline, cat_column),
    ]
)

In [58]:
preprocessor

In [59]:
# Fit the preprocessing statistics on the training rows only, then apply the
# already-fitted transformation to the test rows (transform, not fit_transform).
# NOTE(review): both names are rebound from DataFrames to the transformer output,
# so this cell appears not to be re-runnable without first re-running the
# train/test split — the column-name selectors need DataFrame input. Confirm.
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [60]:
X_test

array([[-0.04546101,  0.06468811, -0.61214068,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-1.30860871, -0.76316144, -0.61214068,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.4952348 , -0.76316144,  1.51942062,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.64841289,  1.45379161,  1.51942062,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.82506891, -0.76316144, -0.61214068,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.26608319, -0.76316144, -0.61214068,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.   

In [61]:
y_test

24     0
6      0
153    0
211    0
198    1
176    0
192    1
124    1
9      0
101    0
45     0
233    0
117    1
177    0
82     1
146    1
200    1
15     0
66     0
142    1
33     0
19     0
109    0
30     0
186    0
120    1
10     0
73     0
159    0
156    0
112    0
218    0
25     0
60     0
18     0
119    1
97     0
197    1
139    1
241    0
75     0
127    1
113    0
16     0
196    1
67     0
168    0
38     0
195    1
Name: time, dtype: int64

In [90]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [63]:
# Candidate classifiers, keyed by the name used in the evaluation report.
# The forest and the tree are randomised algorithms; a fixed seed (the same one
# used for the train/test split) makes their reported accuracies reproducible.
models={"random_forest":RandomForestClassifier(oob_score=True,random_state=42),
        "logistic_regression":LogisticRegression(),
        "decision_tree":DecisionTreeClassifier(random_state=42)}

In [64]:
# for i,j in models.items():
#   model= j.fit(X_train,y_train)
#   print(model)

In [65]:
list(models.values())[1]

In [87]:
def evaluate_model(X_train,y_train,X_test,y_test,models):
  """Fit every estimator in `models` and score it on the held-out data.

  Each estimator is fitted in place on (X_train, y_train), so the objects in
  `models` are fitted when this function returns.

  Returns a dict mapping each key of `models` to its test accuracy, expressed
  as a percentage rounded to two decimals.
  """
  scores = {}
  for name, estimator in models.items():
    fitted = estimator.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    # accuracy_score yields a fraction; convert it to a percentage.
    scores[name] = round(accuracy_score(y_test, predictions) * 100, 2)
  return scores




In [88]:
# Score every candidate model on the held-out split (accuracy in percent).
evaluate_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
)

{'random_forest': 95.92, 'logistic_regression': 100.0, 'decision_tree': 93.88}

In [None]:
## Hyperparameter tuning over different parameter combinations

In [91]:
# Search space for the random-forest hyperparameter search.
params = dict(
    n_estimators=[50, 100, 200],
    criterion=["gini", "entropy"],
    max_depth=[3, 5, 10],
)


In [92]:
# Base estimator for the hyperparameter search. The seed makes the bootstrap
# sampling repeatable, so cross-validation scores are comparable across runs.
model=RandomForestClassifier(oob_score=True, random_state=42)

In [98]:
# Randomised search with 10-fold cross-validation, scored by accuracy.
# With the default n_iter=10 only a random subset of the 18 possible parameter
# combinations is tried; random_state pins which ones, so the selected
# best_params_ do not change between runs.
cv= RandomizedSearchCV(model,param_distributions=params, scoring="accuracy",cv=10,verbose=3,random_state=42)

In [99]:
cv

In [100]:
# Run the search on the preprocessed training data; verbose=3 logs one line per fold.
cv.fit(X_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV 1/10] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.950 total time=   0.6s
[CV 2/10] END criterion=entropy, max_depth=10, n_estimators=100;, score=1.000 total time=   0.4s
[CV 3/10] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.900 total time=   0.3s
[CV 4/10] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.950 total time=   0.3s
[CV 5/10] END criterion=entropy, max_depth=10, n_estimators=100;, score=1.000 total time=   0.3s
[CV 6/10] END criterion=entropy, max_depth=10, n_estimators=100;, score=1.000 total time=   0.2s
[CV 7/10] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.895 total time=   0.2s
[CV 8/10] END criterion=entropy, max_depth=10, n_estimators=100;, score=1.000 total time=   0.2s
[CV 9/10] END criterion=entropy, max_depth=10, n_estimators=100;, score=1.000 total time=   0.2s
[CV 10/10] END criterion=entropy, max_depth=10, n_estimators=100

In [101]:
cv.best_params_

{'n_estimators': 50, 'max_depth': 5, 'criterion': 'entropy'}

In [103]:
# Build the final forest from whatever the search actually selected, instead of
# hand-copying values from a previous run's output (which go stale on re-run).
# random_state keeps the final fit and its accuracy reproducible.
bestmodel=RandomForestClassifier(**cv.best_params_, oob_score=True, random_state=42)

In [104]:
# Train the tuned forest on the full (preprocessed) training set.
bestmodel.fit(X_train,y_train)

In [108]:
# Predict class labels for the held-out rows.
y_pred= bestmodel.predict(X_test)

In [109]:
# Test accuracy of the tuned model as a fraction (evaluate_model above reports
# a percentage, so multiply by 100 before comparing the two).
accuracy= accuracy_score(y_test,y_pred)

In [110]:
accuracy

0.9795918367346939