In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest

# Notebook display settings: show up to 40 columns and up to 300 characters
# per cell before pandas truncates the rendered output.
pd.options.display.max_columns = 40
pd.options.display.max_colwidth = 300
from sklearn.metrics import (precision_score,recall_score,
                             f1_score, accuracy_score,confusion_matrix, classification_report)

In [9]:
# Read the three input tables in one pass; every CSV uses its first column
# as the DataFrame index. `colorcode['Style']` is used further down to label
# the confusion-matrix axes.
colorcode, train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/processed/color_code.csv',
        '../data/train/train.csv',
        '../data/test/test.csv',
    )
)

In [10]:
# Columns used as model inputs and the column holding the label to predict.
feature_cols = ['ABV', 'IBU', 'Color']
target_col = 'Style_color'

# Apply the same column selection to both the train and the test frames.
X_train, y_train = train[feature_cols], train[target_col]
X_test, y_test = test[feature_cols], test[target_col]

In [11]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the training split only (the test split is left
# untouched). The fixed seed keeps the resampled set identical across runs.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X=X_train, y=y_train)

In [None]:
# Base pipeline that the grid search clones and re-parameterises:
#   scaler -> univariate feature selection -> classifier.
# Each step name ("scaler", "selectkbest", "classifier") is the prefix used
# by the parameter grids, e.g. 'selectkbest__k' or 'classifier__max_depth'.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    # k="all" keeps every feature by default. SelectKBest's own default
    # (k=10) is larger than the three columns selected for X_train, so the
    # un-tuned pipeline would otherwise not be usable on this data.
    ("selectkbest", SelectKBest(k="all")),
    # Seeded so that fitting the base pipeline on its own is reproducible.
    ("classifier", RandomForestClassifier(random_state=42)),
])

In [None]:
# Hyper-parameter grids, one dict per candidate classifier. GridSearchCV
# treats a list of dicts as independent grids, so each classifier is only
# combined with its own hyper-parameters. Keys follow the pipeline step names.

# SelectKBest cannot keep more features than the data has, so the candidate
# k values are derived from X_train instead of being hard-coded. The previous
# ranges (3..9 and 5..14) all met or exceeded the number of available columns,
# which made every candidate either invalid or equivalent to "keep all".
n_features = X_train.shape[1]
k_values = np.arange(1, n_features + 1)

# Logistic-regression grid, currently disabled (also removed from search_space).
# log_params = {
#     'selectkbest__k': k_values,
#     'classifier': [LogisticRegression()],
#     'classifier__C': [0.1,1,10]
# }

# Tree ensembles: also try different scalers / no scaler at all.
# Classifiers are seeded so repeated searches give the same scores.
rf_params = {
    'scaler': [StandardScaler(), MinMaxScaler(), None],
    'selectkbest__k': k_values,
    'classifier': [RandomForestClassifier(random_state=42)],
    'classifier__max_depth': [3, 8, 12]
}
gb_params = {
    'scaler': [StandardScaler(), MinMaxScaler(), None],
    'selectkbest__k': k_values,
    'classifier': [GradientBoostingClassifier(random_state=42)],
    'classifier__max_depth': [3, 6, 9, 12]
}

# KNN and SVM keep the pipeline's default scaler step (no 'scaler' key).
knn_params = {
    'selectkbest__k': k_values,
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': np.arange(3, 10)
}
svm_params = {
    'selectkbest__k': k_values,
    'classifier': [SVC()],
    'classifier__C': [0.1, 1, 10]
}

search_space = [
    # log_params,
    rf_params,
    gb_params,
    knn_params,
    svm_params
]

In [None]:
# Exhaustive 3-fold cross-validated search over every grid in `search_space`.
#
# * Scoring uses the weighted-average variants so it matches the
#   average='weighted' metrics reported in the evaluation cells; the plain
#   'f1' / 'precision' / 'recall' scorers only support binary targets.
# * With more than one metric, GridSearchCV must be told which metric picks
#   the winning configuration. Leaving `refit` at its default raises a
#   ValueError and no best_estimator_ / predict() would be available.
clf_gs = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=3,
    scoring=["f1_weighted", "precision_weighted", "recall_weighted"],
    refit="f1_weighted",
    verbose=3,
    n_jobs=-1,  # use all available cores
)

clf_gs.fit(X_resampled, y_resampled)

In [None]:
# Predict the test split with the grid-search object fitted above.
# The previous name `pipe_gs_rf_` is never defined in this notebook, so the
# cell failed on a fresh kernel.
y_pred = clf_gs.predict(X_test)

In [None]:
# Use the model tuned in this notebook; `lr1` is never defined here, so the
# cell previously only worked with leftover kernel state.
predictions = clf_gs.predict(X_test)

# Not every candidate classifier exposes predict_proba (SVC does not unless
# it is built with probability=True), so only compute probabilities when the
# selected model supports them.
# The misspelled name `predicions_proba` is kept to avoid breaking any other
# cell that may reference it.
predicions_proba = (
    clf_gs.predict_proba(X_test) if hasattr(clf_gs, "predict_proba") else None
)
# predicions_proba

In [None]:

# Test-set metrics. Precision / F1 / recall are averaged over the classes,
# weighted by each class's support.
# Uses `y_test` from the split cell; the previous `yc_test` is never defined
# in this notebook.
print('accuracy_score',accuracy_score(y_test, predictions))
print('precision_score',precision_score(y_test, predictions, average='weighted'))
print('f1_score',f1_score(y_test, predictions, average='weighted'))
print('recall_score',recall_score(y_test, predictions, average='weighted'))
# print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix normalised over the true labels (each row sums to 1).
# Uses `y_test` from the split cell; the previous `yc_test` is never defined
# in this notebook.
cm = confusion_matrix(y_test, predictions, normalize='true')
# print(cm)

In [None]:
# Heatmap of the row-normalised confusion matrix with the headline metrics
# in the title.
plt.figure(figsize=(8,8))
sns.heatmap(cm, annot=True, linewidths=.5, square = True, cmap = 'YlOrBr')
plt.ylabel('Actual style')
plt.xlabel('Predicted style')

# Build the title from adjacent f-strings: a single-quoted f-string cannot
# span several physical lines (the previous form was a SyntaxError).
# `y_test` replaces the undefined `yc_test`.
all_sample_title = (
    f'Accuracy: {accuracy_score(y_test, predictions)}\n'
    f'Recall: {recall_score(y_test, predictions, average="weighted")}\n'
    f'F1: {f1_score(y_test, predictions, average="weighted")}'
)
plt.title(all_sample_title, size = 10);

# seaborn draws cell i over [i, i + 1], so labels belong at i + 0.5; integer
# tick positions would put every label on a cell boundary.
# NOTE(review): assumes the row order of colorcode['Style'] matches the
# sorted class labels used by confusion_matrix - confirm.
tick_positions = np.arange(len(colorcode.index)) + 0.5
plt.xticks(ticks=tick_positions, labels=colorcode['Style'], rotation=55,fontsize=8);
plt.yticks(ticks=tick_positions, labels=colorcode['Style'],rotation=25,fontsize=8);

In [None]:
import pickle
# Disabled: serialise a fitted model to ../models/lr1.
# NOTE(review): `lr1` is not defined in any cell shown here - point this at
# the model that should actually be saved before re-enabling.
# with open('../models/lr1', 'wb') as output:
#     pickle.dump(lr1, output)

# Disabled: load the pickled model back into `modelo_importado`.
# Only unpickle files you trust - pickle.load can execute arbitrary code.
# with open('../models/lr1', 'rb') as input:
#     modelo_importado = pickle.load(input) 