In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import average_precision_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import random

In [6]:
DATA_SET = pd.read_csv("IPA.csv")
DATA_SET.head()

Unnamed: 0,IsIPA,OG,FG,ABV,IBU,Color,BoilTime,BoilGravity,PitchRate,Efficiency,UserId
0,False,1.069,1.007,8.12,0.0,30.48,60,,,75.0,
1,False,1.064,1.012,6.8,9.36,9.85,60,1.132,0.5,35.0,
2,False,1.061,1.015,6.08,28.31,35.83,60,1.044,0.35,83.0,42087.0
3,False,1.053,1.012,5.44,46.48,5.77,60,1.033,,70.0,
4,False,1.053,1.017,4.64,42.29,4.22,90,1.039,0.5,77.0,14729.0


In [5]:
DATA_SET.isna().sum()

IsIPA              0
OG                 0
FG                 0
ABV                0
IBU                0
Color              0
BoilTime           0
BoilGravity     1328
PitchRate      19645
Efficiency         0
UserId         25448
dtype: int64

In [8]:
DATA_SET = DATA_SET.drop(['UserId'],axis=1)

In [9]:
DATA_SET = DATA_SET.fillna(DATA_SET.mean())

In [13]:
DATA_SET.isna().sum()

IsIPA          0
OG             0
FG             0
ABV            0
IBU            0
Color          0
BoilTime       0
BoilGravity    0
PitchRate      0
Efficiency     0
dtype: int64

In [14]:
X = DATA_SET.drop('IsIPA',axis=1)
y = DATA_SET['IsIPA']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 1)

In [235]:
# SVC

In [21]:
svm = SVC()
param_grid = {'C':[0.01,0.1,0.5,1], 
              'kernel':['rbf', 'linear'], 
              'gamma':[0.1, 0.01, 0.001]}

In [22]:
grid_SVM = GridSearchCV(svm, param_grid, verbose = 2, scoring='f1_micro', cv=3)

In [23]:
grid_SVM.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ........................C=0.05, gamma=1, kernel=rbf; total time= 1.3min
[CV] END ........................C=0.05, gamma=1, kernel=rbf; total time= 1.3min
[CV] END ........................C=0.05, gamma=1, kernel=rbf; total time= 1.3min
[CV] END ........................C=0.05, gamma=1, kernel=rbf; total time= 1.5min
[CV] END ........................C=0.05, gamma=1, kernel=rbf; total time= 1.5min
[CV] END ......................C=0.05, gamma=0.1, kernel=rbf; total time=  34.3s
[CV] END ......................C=0.05, gamma=0.1, kernel=rbf; total time=  33.9s
[CV] END ......................C=0.05, gamma=0.1, kernel=rbf; total time=  31.3s
[CV] END ......................C=0.05, gamma=0.1, kernel=rbf; total time=  32.8s
[CV] END ......................C=0.05, gamma=0.1, kernel=rbf; total time=  36.5s
[CV] END .....................C=0.05, gamma=0.01, kernel=rbf; total time=  18.4s
[CV] END .....................C=0.05, gamma=0.0

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.05, 0.01, 0.1, 0.5, 1],
                         'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']},
             scoring='f1_micro', verbose=2)

In [24]:
grid_SVM.best_params_

{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

In [25]:
pred_SVM = grid_SVM.predict(X_test)

In [26]:
tn, fp, fn, tp = confusion_matrix(y_test, pred_SVM).ravel()

specificity = tn / (tn+fp)
accuracy = accuracy_score(y_test, pred_SVM)
precision = precision_score(y_test, pred_SVM)
recall = recall_score(y_test, pred_SVM)
mer = 1- accuracy
f1 = f1_score(y_test, pred_SVM)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("MER: {}".format(mer))
print("Recall: {}".format(recall))
print("F1 score: {}".format(f1))
print(f"Specificity: {specificity:.2%}")

Accuracy: 0.851081081081081
Precision: 0.7852816474863719
MER: 0.14891891891891895
Recall: 0.7331071529544811
F1 score: 0.7582979967831555
Specificity: 90.63%


In [242]:
# AdaBoost

In [44]:
ADA = AdaBoostClassifier()
param_grid = {
    'n_estimators': range(50, 200, 20),
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 1],
  }

In [45]:
grid_ADA = GridSearchCV(estimator=ADA, param_grid=param_grid, cv=3, verbose=2, scoring='f1_micro')
grid_ADA.fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   1.3s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   1.3s
[CV] END .................learning_rate=0.1, n_estimators=50; total time=   1.2s
[CV] END .................learning_rate=0.1, n_estimators=70; total time=   1.7s
[CV] END .................learning_rate=0.1, n_estimators=70; total time=   1.7s
[CV] END .................learning_rate=0.1, n_estimators=70; total time=   1.6s
[CV] END .................learning_rate=0.1, n_estimators=90; total time=   2.2s
[CV] END .................learning_rate=0.1, n_estimators=90; total time=   2.2s
[CV] END .................learning_rate=0.1, n_estimators=90; total time=   2.2s
[CV] END ................learning_rate=0.1, n_estimators=110; total time=   2.7s
[CV] END ................learning_rate=0.1, n_estimators=110; total time=   2.8s
[CV] END ................learning_rate=0.1, n_e

[CV] END ...................learning_rate=1, n_estimators=70; total time=   1.8s
[CV] END ...................learning_rate=1, n_estimators=90; total time=   2.5s
[CV] END ...................learning_rate=1, n_estimators=90; total time=   2.2s
[CV] END ...................learning_rate=1, n_estimators=90; total time=   2.2s
[CV] END ..................learning_rate=1, n_estimators=110; total time=   2.7s
[CV] END ..................learning_rate=1, n_estimators=110; total time=   2.7s
[CV] END ..................learning_rate=1, n_estimators=110; total time=   2.6s
[CV] END ..................learning_rate=1, n_estimators=130; total time=   3.2s
[CV] END ..................learning_rate=1, n_estimators=130; total time=   3.2s
[CV] END ..................learning_rate=1, n_estimators=130; total time=   3.1s
[CV] END ..................learning_rate=1, n_estimators=150; total time=   3.7s
[CV] END ..................learning_rate=1, n_estimators=150; total time=   3.8s
[CV] END ..................l

GridSearchCV(cv=3, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.1, 0.3, 0.5, 0.7, 1],
                         'n_estimators': range(50, 200, 20)},
             scoring='f1_micro', verbose=2)

In [46]:
grid_ADA.best_params_

{'learning_rate': 0.3, 'n_estimators': 170}

In [47]:
pred_ADA = grid_ADA.predict(X_test)

In [48]:
tn, fp, fn, tp = confusion_matrix(y_test, pred_ADA).ravel()

specificity = tn / (tn+fp)
accuracy = accuracy_score(y_test, pred_ADA)
precision = precision_score(y_test, pred_ADA)
recall = recall_score(y_test, pred_ADA)
mer = 1- accuracy
f1 = f1_score(y_test, pred_ADA)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("MER: {}".format(mer))
print("Recall: {}".format(recall))
print("F1 score: {}".format(f1))
print(f"Specificity: {specificity:.2%}")

Accuracy: 0.86
Precision: 0.7953529937444147
MER: 0.14
Recall: 0.7548770144189991
F1 score: 0.774586597040905
Specificity: 90.92%


In [248]:
# Random Forest

In [90]:
RF = RandomForestClassifier()
grid = {
    'n_estimators': [200, 250, 300, 350, 400, 500],
    'max_features': np.linspace(1, X_train.shape[1], 5).astype(int),
    'max_depth': [10, 20, 30, 40]
}

In [91]:
tuning_res_rf = GridSearchCV(RF,
                             param_grid=grid,
                             cv=3,
                             verbose=2,
                             scoring='f1_micro')

In [92]:
tuning_res_rf.fit(X_train, y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
[CV] END .....max_depth=10, max_features=1, n_estimators=200; total time=   1.5s
[CV] END .....max_depth=10, max_features=1, n_estimators=200; total time=   1.5s
[CV] END .....max_depth=10, max_features=1, n_estimators=200; total time=   1.5s
[CV] END .....max_depth=10, max_features=1, n_estimators=250; total time=   1.8s
[CV] END .....max_depth=10, max_features=1, n_estimators=250; total time=   1.9s
[CV] END .....max_depth=10, max_features=1, n_estimators=250; total time=   1.9s
[CV] END .....max_depth=10, max_features=1, n_estimators=300; total time=   2.5s
[CV] END .....max_depth=10, max_features=1, n_estimators=300; total time=   2.4s


KeyboardInterrupt: 

In [52]:
tuning_res_rf.best_params_

{'max_depth': 10, 'max_features': 5, 'n_estimators': 300}

In [53]:
pred_RF = tuning_res_rf.predict(X_test)

In [54]:
tn, fp, fn, tp = confusion_matrix(y_test, pred_RF).ravel()

specificity = tn / (tn+fp)
accuracy = accuracy_score(y_test, pred_RF)
precision = precision_score(y_test, pred_RF)
recall = recall_score(y_test, pred_RF)
mer = 1- accuracy
f1 = f1_score(y_test, pred_RF)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("MER: {}".format(mer))
print("Recall: {}".format(recall))
print("F1 score: {}".format(f1))
print(f"Specificity: {specificity:.2%}")

Accuracy: 0.8648648648648649
Precision: 0.8017777777777778
MER: 0.1351351351351351
Recall: 0.7650551314673452
F1 score: 0.7829861111111112
Specificity: 91.15%


In [None]:
# Import and predict 

In [68]:
DATA = pd.read_csv("IPA_test.csv")
DATA = DATA.drop(['UserId'],axis=1)
DATA.isna().sum()

OG                0
FG                0
ABV               0
IBU               0
Color             0
BoilTime          0
BoilGravity     156
PitchRate      2622
Efficiency        0
dtype: int64

In [69]:
DATA = DATA.fillna(DATA.mean())

In [71]:
DATA.isna().sum()

OG             0
FG             0
ABV            0
IBU            0
Color          0
BoilTime       0
BoilGravity    0
PitchRate      0
Efficiency     0
dtype: int64

In [89]:
tuning_res_rf.fit(X, y) #This should be changed to the best model according to F1-score performance

Fitting 3 folds for each of 45 candidates, totalling 135 fits
[CV] END .....max_depth=10, max_features=1, n_estimators=200; total time=   2.2s
[CV] END .....max_depth=10, max_features=1, n_estimators=200; total time=   2.1s
[CV] END .....max_depth=10, max_features=1, n_estimators=200; total time=   2.1s
[CV] END .....max_depth=10, max_features=1, n_estimators=300; total time=   3.2s
[CV] END .....max_depth=10, max_features=1, n_estimators=300; total time=   3.1s
[CV] END .....max_depth=10, max_features=1, n_estimators=300; total time=   3.2s
[CV] END .....max_depth=10, max_features=1, n_estimators=400; total time=   4.6s
[CV] END .....max_depth=10, max_features=1, n_estimators=400; total time=   4.3s
[CV] END .....max_depth=10, max_features=1, n_estimators=400; total time=   4.2s


KeyboardInterrupt: 

In [72]:
tuning_res_rf.best_params_

{'max_depth': 10, 'max_features': 3, 'n_estimators': 400}

In [73]:
pred_final = tuning_res_rf.predict(DATA) # This as well

In [74]:
len(pred_final)

5000

In [None]:
# OUTPUTTING

In [83]:
OUT = pd.DataFrame(pred_final, columns=['Prediction'])

In [85]:
#np.savetxt("1SLAVS_IPA_prediction.csv", OUT, delimiter=",")

In [86]:
OUT.to_csv(r"2SLAVS_IPA_prediction.csv")