This notebook searches for good hyperparameters for our `AdaBoostClassifier` using
`GridSearchCV`.
We'll tune the following parameters (library defaults shown for reference):

   `algorithm`, default: 'SAMME.R',

   `learning_rate`, default: 1.0,

   `n_estimators`, default: 50,


In [1]:
# Baseline AdaBoost settings; these match the defaults listed in the
# introduction and act as the starting point for the search.
param_dict = dict(
    algorithm='SAMME.R',
    learning_rate=1.0,
    n_estimators=50,
)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from pathlib import Path

# Print NumPy floats with 3 decimal places for the rest of the notebook.
# `np.printoptions(...)` is a context manager and only takes effect inside a
# `with` block; called bare it just returns an unused manager object.
# `np.set_printoptions(...)` applies the setting globally, which is the intent.
np.set_printoptions(precision=3)

In [3]:
# The subsets live two directories above the current working directory and
# share one filename stem, suffixed 1..9 (gender_final_small1.csv, ...).
base = Path().resolve().parents[1] / r'data/subsets/gender_final_small'
locations = [f"{base}{suffix}.csv" for suffix in range(1, 10)]

# Have 9000 names sampled at random.
# Stack the subsets row-wise, then separate the target column from the rest.
subset_frames = [pd.read_csv(csv_path) for csv_path in locations]
names_frame = pd.concat(subset_frames, axis=0)
y = names_frame["Gender"]
X = names_frame.drop(columns="Gender")

In [4]:
# Encode each name as binary presence flags over character n-grams of
# length 2 to 10; undecodable bytes are replaced rather than raising.
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(2, 10),
    decode_error='replace',
    binary=True,
)
name_ngrams = vectorizer.fit_transform(X['Name'])

# Scale every feature by its maximum absolute value (also yields floats).
scaler = MaxAbsScaler()
X = scaler.fit_transform(name_ngrams)

In [5]:
# Base AdaBoost estimator, configured from the three baseline entries in
# `param_dict`; GridSearchCV overrides these values per candidate.
abc_clf = AdaBoostClassifier(
    **{
        option: param_dict[option]
        for option in ("algorithm", "learning_rate", "n_estimators")
    }
)

In [7]:
# Candidate values for the exhaustive search:
# 2 algorithms x 10 learning rates x 5 ensemble sizes = 100 combinations.
param_grid = dict(
    algorithm=["SAMME.R", "SAMME"],
    learning_rate=np.linspace(0.3, 3, 10),    # 0.3, 0.6, ..., 3.0
    n_estimators=np.arange(100, 501, 100),    # 100, 200, ..., 500
)

In [8]:
# Exhaustive grid search with 3-fold cross-validation. n_jobs=-1 runs the
# fits in parallel; training scores are kept alongside the validation
# scores so over-fitting can be inspected in `search.cv_results_`.
search = GridSearchCV(
    estimator=abc_clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    return_train_score=True,
    verbose=3,
)
search.fit(X, y)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ['SAMME.R', 'SAMME'],
                         'learning_rate': array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3. ]),
                         'n_estimators': array([100, 200, 300, 400, 500])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [9]:
# Parameter combination selected by the grid search.
best_params = search.best_params_
print(best_params)


{'algorithm': 'SAMME.R', 'learning_rate': 0.9000000000000001, 'n_estimators': 500}


In [10]:
# Cross-validated score achieved by the selected parameter combination.
best_score = search.best_score_
print(best_score)


0.7657777777777778
