<a href="https://colab.research.google.com/github/bishalpaudel/SentimentAnalysis/blob/master/Hyperparameter_Optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-optimize
!pip install optuna

In [19]:
import pandas as pd
import numpy as np

from sklearn import ensemble, metrics, model_selection, preprocessing, decomposition, pipeline
from functools import partial

from skopt import space, gp_minimize

from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope

import optuna

In [3]:
def importFile(data_dir="/content/drive/My Drive/AI/Optimization",
               mount_point="/content/drive"):
  """Mount Google Drive and load the training and test CSV files.

  Args:
    data_dir: Directory that contains ``train.csv`` and ``test.csv``.
      Defaults to the Drive folder this notebook was written against.
    mount_point: Path at which Google Drive is mounted.

  Returns:
    Tuple ``(train_raw, test_raw)`` of pandas DataFrames.
  """
  # Function-local import: the Colab helper is only needed when this loader runs.
  from google.colab import drive
  drive.mount(mount_point)

  train_raw = pd.read_csv(f"{data_dir}/train.csv", delimiter=",")
  test_raw = pd.read_csv(f"{data_dir}/test.csv", delimiter=",")
  return train_raw, test_raw

In [4]:
def grid_search(X, y):
  """Exhaustively search a small random-forest grid with 5-fold CV.

  Every combination of ``n_estimators``, ``max_depth`` and ``criterion``
  listed below is scored on accuracy. The best mean CV score and the full
  parameter set of the best estimator are printed; nothing is returned.

  Args:
    X: Feature matrix passed to ``GridSearchCV.fit``.
    y: Target vector passed to ``GridSearchCV.fit``.
  """
  search_space = dict(
      n_estimators=[100, 200, 300, 400],
      max_depth=[1, 3, 5, 7],
      criterion=["gini", "entropy"],
  )

  # The forest uses every core (n_jobs=-1); the search itself runs with n_jobs=1.
  forest = ensemble.RandomForestClassifier(n_jobs=-1)
  search = model_selection.GridSearchCV(
      estimator=forest,
      param_grid=search_space,
      scoring="accuracy",
      verbose=10,
      n_jobs=1,
      cv=5,
  )
  search.fit(X, y)

  print(search.best_score_)
  print(search.best_estimator_.get_params())

In [5]:
def random_search(X, y):
  """Sample 10 random-forest configurations and score them with 5-fold CV.

  Candidates are drawn from the ranges defined below and compared on
  accuracy. The best mean CV score and the parameters of the best estimator
  are printed; nothing is returned.

  Args:
    X: Feature matrix passed to ``RandomizedSearchCV.fit``.
    y: Target vector passed to ``RandomizedSearchCV.fit``.
  """
  sampling_space = dict(
      n_estimators=np.arange(100, 1500, 100),
      max_depth=np.arange(1, 20),
      criterion=["gini", "entropy"],
  )

  # The forest uses every core (n_jobs=-1); the search itself runs with n_jobs=1.
  forest = ensemble.RandomForestClassifier(n_jobs=-1)
  search = model_selection.RandomizedSearchCV(
      estimator=forest,
      param_distributions=sampling_space,
      n_iter=10,
      scoring="accuracy",
      verbose=10,
      n_jobs=1,
      cv=5,
  )
  search.fit(X, y)

  print(search.best_score_)
  print(search.best_estimator_.get_params())

In [6]:
def pipeline_random_search(X, y):
  """Randomised search over a scale -> PCA -> random-forest pipeline.

  Ten configurations are sampled (PCA component count plus forest
  hyperparameters) and scored on accuracy with 5-fold CV. The best mean CV
  score and the parameters of the best pipeline are printed; nothing is
  returned.

  Args:
    X: Feature matrix passed to ``RandomizedSearchCV.fit``.
    y: Target vector passed to ``RandomizedSearchCV.fit``.
  """
  steps = [
      ("scaling", preprocessing.StandardScaler()),
      ("pca", decomposition.PCA()),
      ("rf", ensemble.RandomForestClassifier(n_jobs=-1)),
  ]
  pipe = pipeline.Pipeline(steps)

  # Keys follow sklearn's "<step name>__<parameter>" convention for pipelines.
  sampling_space = {
      "pca__n_components": np.arange(5, 10),
      "rf__n_estimators": np.arange(100, 1500, 100),
      "rf__max_depth": np.arange(1, 20),
      "rf__criterion": ["gini", "entropy"],
  }

  search = model_selection.RandomizedSearchCV(
      estimator=pipe,
      param_distributions=sampling_space,
      n_iter=10,
      scoring="accuracy",
      verbose=10,
      n_jobs=1,
      cv=5,
  )
  search.fit(X, y)

  print(search.best_score_)
  print(search.best_estimator_.get_params())

In [7]:
def optimize(params, param_names, x, y):
  """Objective function: negative mean 5-fold accuracy of a random forest.

  Args:
    params: Hyperparameter values, in the same order as ``param_names``.
    param_names: Keyword names for ``RandomForestClassifier``.
    x: Feature array, indexed with the fold index arrays.
    y: Target array, indexed with the fold index arrays.

  Returns:
    ``-1.0 * mean(fold accuracies)``, so that minimising this value
    maximises accuracy.
  """
  hyperparams = {name: value for name, value in zip(param_names, params)}
  forest = ensemble.RandomForestClassifier(**hyperparams)
  splitter = model_selection.StratifiedKFold(n_splits=5)

  fold_scores = []
  for train_idx, test_idx in splitter.split(X=x, y=y):
    # The same estimator object is re-fitted on each fold's training part.
    forest.fit(x[train_idx], y[train_idx])
    predictions = forest.predict(x[test_idx])
    fold_scores.append(metrics.accuracy_score(y[test_idx], predictions))

  return -1.0 * np.mean(fold_scores)

In [38]:
def optuna_optimizer(trial, x, y):
  """Optuna objective: negative mean 5-fold accuracy of a random forest.

  Args:
    trial: Optuna trial used to sample the hyperparameters.
    x: Feature array.
    y: Target array.

  Returns:
    The value returned by ``optimize`` for the sampled hyperparameters
    (negative mean accuracy, to be minimised).
  """
  # `suggest_float` replaces the deprecated `suggest_uniform`.
  params = {
      "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
      "n_estimators": trial.suggest_int("n_estimators", 100, 1500),
      "max_depth": trial.suggest_int("max_depth", 3, 15),
      "max_features": trial.suggest_float("max_features", 0.01, 1.0),
  }

  # Reuse the shared cross-validation objective instead of repeating its loop.
  return optimize(list(params.values()), list(params.keys()), x, y)

In [8]:
def bayesian_optimization(X, y):
  """Tune a random forest with skopt's Gaussian-process minimiser.

  Runs ``gp_minimize`` for 15 evaluations (``n_random_starts=10``) of the
  ``optimize`` objective and prints the best hyperparameters found as a
  ``{name: value}`` dict. Nothing is returned.

  Args:
    X: Feature array forwarded to ``optimize``.
    y: Target array forwarded to ``optimize``.
  """
  # Names are listed in the same order as the search dimensions below, because
  # `optimize` pairs values with names positionally.
  param_names = [
    "max_depth",
    "n_estimators",
    "criterion",
    "max_features"
  ]
  dimensions = [
      space.Integer(3, 15, name="max_depth"),
      space.Integer(100, 600, name="n_estimators"),
      space.Categorical(["gini", "entropy"], name="criterion"),
      space.Real(0.01, 1, prior="uniform", name="max_features"),
  ]

  objective = partial(optimize, param_names=param_names, x=X, y=y)

  outcome = gp_minimize(
      objective,
      dimensions=dimensions,
      n_calls=15,
      n_random_starts=10,
      verbose=10,
  )

  best_params = dict(zip(param_names, outcome.x))
  print(best_params)

In [27]:
def hyperopt_optimization(X=None, y=None):
  """Tune a random forest with hyperopt's TPE algorithm.

  Runs ``fmin`` for 15 evaluations of the ``optimize`` objective and prints
  the best hyperparameters found. Nothing is returned.

  Args:
    X: Feature array. When omitted, the notebook-level ``X`` is used, as the
      original zero-argument version did.
    y: Target array. When omitted, the notebook-level ``y`` is used.
  """
  # Function-local import: `space_eval` is not part of the notebook's import cell.
  from hyperopt import space_eval

  if X is None:
    X = globals()["X"]
  if y is None:
    y = globals()["y"]

  # `hp.quniform` requires a step `q`; q=1 gives whole numbers, which
  # `scope.int` then casts to int.
  param_space = {
      "max_depth": scope.int(hp.quniform("max_depth", 3, 15, 1)),
      "n_estimators": scope.int(hp.quniform("n_estimators", 100, 600, 1)),
      "criterion": hp.choice("criterion", ["gini", "entropy"]),
      "max_features": hp.uniform("max_features", 0.01, 1)
  }
  param_names = list(param_space)

  def objective(params):
    # hyperopt passes a dict of sampled values; `optimize` expects the values
    # as a sequence aligned with `param_names`.
    return optimize([params[name] for name in param_names], param_names, X, y)

  trials = Trials()

  result = fmin(
      fn=objective,
      space=param_space,
      algo=tpe.suggest,
      max_evals=15,
      trials=trials
  )

  # `fmin` reports `hp.choice` entries as indices; `space_eval` maps the result
  # back to the actual parameter values.
  print(space_eval(param_space, result))

In [36]:
def optuna_optimization(X, y):
  """Run a 15-trial Optuna study that minimises ``optuna_optimizer``.

  Args:
    X: Feature array forwarded to the objective as ``x``.
    y: Target array forwarded to the objective as ``y``.
  """
  def objective(trial):
    # Bind the dataset so Optuna only has to supply the trial.
    return optuna_optimizer(trial, x=X, y=y)

  study = optuna.create_study(direction="minimize")
  study.optimize(objective, n_trials=15)

In [9]:
  # Load both CSV splits, then separate the feature columns from the
  # `price_range` target as NumPy arrays for the search functions.
  train_raw, test_raw = importFile()
  X = train_raw.drop(columns="price_range").values
  y = train_raw["price_range"].values

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Exhaustive grid search; prints the best CV accuracy and the best estimator's parameters.
grid_search(X, y)

In [None]:
# Randomised search (10 sampled configurations); prints the best CV accuracy and parameters.
random_search(X, y)

In [None]:
# Randomised search over the scaling -> PCA -> random-forest pipeline.
pipeline_random_search(X, y)

In [None]:
# Gaussian-process optimisation via skopt's gp_minimize (15 calls); prints the best parameters.
bayesian_optimization(X, y)

In [39]:
# Optuna study (15 trials) minimising negative CV accuracy; the per-trial log follows.
optuna_optimization(X, y)

[I 2020-09-02 16:40:34,190] Trial 0 finished with value: -0.8905000000000001 and parameters: {'criterion': 'gini', 'n_estimators': 795, 'max_depth': 8, 'max_features': 0.3665390990758131}. Best is trial 0 with value: -0.8905000000000001.
[I 2020-09-02 16:41:05,163] Trial 1 finished with value: -0.8960000000000001 and parameters: {'criterion': 'gini', 'n_estimators': 1201, 'max_depth': 9, 'max_features': 0.45660113819010434}. Best is trial 1 with value: -0.8960000000000001.
[I 2020-09-02 16:41:11,893] Trial 2 finished with value: -0.841 and parameters: {'criterion': 'gini', 'n_estimators': 342, 'max_depth': 4, 'max_features': 0.6397749556435658}. Best is trial 1 with value: -0.8960000000000001.
[I 2020-09-02 16:41:34,514] Trial 3 finished with value: -0.8365 and parameters: {'criterion': 'gini', 'n_estimators': 1098, 'max_depth': 4, 'max_features': 0.6750073273149642}. Best is trial 1 with value: -0.8960000000000001.
[I 2020-09-02 16:42:37,463] Trial 4 finished with value: -0.8795 and p