In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


In [2]:
train_data = pd.read_csv('./datasets/train.csv')
test_data = pd.read_csv('./datasets/test.csv')

In [3]:
test_data = test_data.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin', 'Embarked'],axis = 1)
train_data = train_data.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin', 'Embarked'],axis = 1)

In [4]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,34.5,7.8292
1,3,female,47.0,7.0
2,2,male,62.0,9.6875
3,3,male,27.0,8.6625
4,3,female,22.0,12.2875


In [5]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [7]:
x_train = train_data.drop('Survived',axis = 1)
x_train.replace({'male':1,'female':0},inplace = True)
test_data.replace({'male':1, 'female':0},inplace = True)
y_train = train_data['Survived']
x_test = test_data.values
x_test

array([[ 3.    ,  1.    , 34.5   ,  7.8292],
       [ 3.    ,  0.    , 47.    ,  7.    ],
       [ 2.    ,  1.    , 62.    ,  9.6875],
       ...,
       [ 3.    ,  1.    , 38.5   ,  7.25  ],
       [ 3.    ,  1.    ,     nan,  8.05  ],
       [ 3.    ,  1.    ,     nan, 22.3583]])

In [8]:
x_train.isnull().sum()

Pclass      0
Sex         0
Age       177
Fare        0
dtype: int64

In [9]:
x_train.Age = x_train.Age.fillna(x_train.Age.mean())
test_data.Age = test_data.Age.fillna(test_data.Age.mean())
test_data.Fare = test_data.Fare.fillna(test_data.Fare.mean())

In [10]:
x_train.isnull().sum()

Pclass    0
Sex       0
Age       0
Fare      0
dtype: int64

In [12]:
#Train the model
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train,y_train)

# Parameter of Grid

In [13]:
param_dict = {
    'criterion':['gini','entropy'],
    'max_depth':range(1,10),
    'min_samples_split':range(1,10),
    'min_samples_leaf':range(1,10)
}

# Finding the greatest parameter 

In [14]:
grid = GridSearchCV(
    decision_tree,
    param_grid = param_dict,
    cv = 10,
    n_jobs = 1
)
grid.fit(x_train,y_train)

1620 fits failed out of a total of 14580.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1620 fits failed with the following error:
Traceback (most recent call last):
  File "/home/student/.conda/envs/env012/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/student/.conda/envs/env012/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/home/student/.conda/envs/env012/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/home/student/.conda/envs/env012/lib/python3.9/site-packages/sklearn/base.py", line 600, in _validate_params
   

In [16]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'min_samples_leaf': 6,
 'min_samples_split': 5}

In [17]:
grid.best_estimator_

In [18]:
grid.best_score_

0.8294756554307116

# NEW

In [19]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [22]:
x,y = load_diabetes(return_X_y= True)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.33,random_state=42)


USING RANDOMSEARCH

In [23]:
random_search = RandomizedSearchCV(RandomForestRegressor(random_state=0),
                                   {'n_estimators':np.arange(5,100,5),
                                    'max_features':np.arange(0.1,1.0,0.05),
                                   },cv=5,scoring="r2",verbose=1, n_jobs=-1,
                                   n_iter=50,random_state=0
                                  )
random_search.fit(x_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [24]:
random_search.best_params_

{'n_estimators': 95, 'max_features': 0.5500000000000002}

In [25]:
random_search.best_estimator_

In [26]:
random_search.best_score_

0.41450360960321453

Grid Search

In [27]:
grid = GridSearchCV(RandomForestRegressor(random_state=0),
                                   {'n_estimators':np.arange(5,500,5),
                                    'max_features':np.arange(0.1,1.0,0.05),
                                   },cv=5,scoring="r2",verbose=1, n_jobs=-1,
                                  )
grid.fit(x_train, y_train)

Fitting 5 folds for each of 1782 candidates, totalling 8910 fits


In [28]:
grid.best_params_

{'max_features': 0.5000000000000001, 'n_estimators': 90}

In [29]:
grid.best_estimator_

In [30]:
grid.best_score_

0.41499526025667494