In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
import warnings
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
warnings.filterwarnings("ignore")
import os
import yaml
import pickle

# Widen DataFrame rendering: up to 40 columns and up to 300 characters per cell.
pd.options.display.max_columns = 40
pd.options.display.max_colwidth = 300
from sklearn.metrics import (precision_score,recall_score,
                             f1_score, accuracy_score,confusion_matrix, classification_report)

In [2]:
# Read the color_code table and the train/test CSVs in one pass; in every file
# the first column is used as the DataFrame index.
colorcode, train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/processed/color_code.csv',
        '../data/train/train.csv',
        '../data/test/test.csv',
    )
)

In [3]:
# Inputs are the ABV / IBU / Color columns; the label to predict is 'Style_color'.
feature_cols = ['ABV', 'IBU', 'Color']
target_col = 'Style_color'

X_train, y_train = train[feature_cols], train[target_col]
X_test, y_test = test[feature_cols], test[target_col]

In [4]:
from imblearn.under_sampling import RandomUnderSampler
# Random undersampler with a fixed seed so the resampled set is the same on every run.
rus = RandomUnderSampler(random_state=42)

# Only the training split is resampled; X_test / y_test are not touched here.
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [5]:
from sklearn.compose import ColumnTransformer

# Min-max scale the listed columns; any column not listed is passed through unchanged.
col_toscale = ['ABV', 'IBU', 'Color']
scaling_step = ('scaler', MinMaxScaler(), col_toscale)
preprocessor = ColumnTransformer(transformers=[scaling_step], remainder='passthrough')

In [None]:
# Single-step pipeline used as the GridSearchCV estimator.
# The original cell had the estimator commented out inline
# (`("classifier", #RandomForestClassifier())`), which also swallowed the closing
# parenthesis and made the cell a SyntaxError. The RandomForestClassifier below is
# a placeholder: each parameter grid that sets a 'classifier' entry replaces it.
# Scaling and feature selection stay disabled, as they were in the original.
pipeScale = Pipeline(steps=[
    # ('preprocessor', preprocessor),
    # ("selectkbest", SelectKBest(k=3)),
    ("classifier", RandomForestClassifier()),
])

In [None]:
# Hyper-parameter grids, one per candidate model. Each active grid replaces the
# pipeline's 'classifier' step with a different estimator and tunes one of its
# parameters.
#
# The two tree ensembles are stochastic, so they are seeded with the same value
# the undersampler uses (42); otherwise repeated searches on identical data can
# report different scores and pick a different best model.
log_params = {
    # Logistic-regression grid is currently disabled.
    # 'selectkbest__k':np.arange(1,4),
    # 'classifier': [LogisticRegression()],
    # 'classifier__C': [0.1,1,10]
}
rf_params = {
    # 'selectkbest__k':np.arange(1,4),
    'classifier': [RandomForestClassifier(random_state=42)],
    'classifier__max_depth': [3, 5, 7]
}
gb_params = {
    # 'selectkbest__k':np.arange(1,4),
    'classifier': [GradientBoostingClassifier(random_state=42)],
    'classifier__max_depth': [3, 5, 7]
}
knn_params = {
    # 'selectkbest__k':np.arange(1,4),
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': np.arange(1, 10)  # k = 1..9
}
svm_params = {
    # 'selectkbest__k':np.arange(1,4),
    'classifier': [SVC()],
    'classifier__C': [0.1, 1, 10]
}

# List of grids handed to GridSearchCV; each dict is searched independently.
search_space = [
    # {'preprocessor__scaler__feature_range': [(0, 1)]},   #remove for gs2
    # log_params,
    rf_params,
    gb_params,
    knn_params,
    svm_params
]

In [None]:

# Track weighted F1, precision and recall for every candidate; the candidate
# with the best weighted F1 is the one that gets refit.
scoring = {
    metric: metric
    for metric in ('f1_weighted', 'precision_weighted', 'recall_weighted')
}
clf_gs = GridSearchCV(
    estimator=pipeScale,
    param_grid=search_space,
    scoring=scoring,
    refit='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=3,
    error_score='raise',  # fail loudly instead of scoring a broken candidate as NaN
)

In [None]:
# Run the grid search on the undersampled training data.
clf_gs.fit(X_resampled, y_resampled)

In [None]:
# Show the winning pipeline, its cross-validated refit score, and its parameters.
for attr_name in ('best_estimator_', 'best_score_', 'best_params_'):
    print(getattr(clf_gs, attr_name))

In [None]:
# Predict labels for the held-out test features with the refit best pipeline.
best_model = clf_gs.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:

# scikit-learn metric functions take (y_true, y_pred) in that order. The original
# calls passed (y_pred, y_test); with average='weighted' that swaps the reported
# precision and recall values and weights classes by predicted support instead of
# true support. Accuracy is symmetric, but it is reordered too for consistency.
print('accuracy_score', accuracy_score(y_test, y_pred))
print('precision_score', precision_score(y_test, y_pred, average='weighted'))
print('f1_score', f1_score(y_test, y_pred, average='weighted'))
print('recall_score', recall_score(y_test, y_pred, average='weighted'))
# print(classification_report(y_test, y_pred))