In [9]:
%matplotlib inline
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import AdaBoostRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import nltk
import string

import re

pd.options.display.max_rows = 5
import nltk
from nltk.corpus import stopwords

In [21]:
def add_beer_class(df):
    remove = string.punctuation
    remove = remove.replace("-", "")
    pattern = "[{}]".format(remove)

    df.description = df.description.str.lower()
    df.description = df.description.str.replace(pattern,"")
    df.description = df.description.str.replace("\d","")

    beers = ["light","pale","hop","stout","porter","lager","malt","amber","blonde ale",
        "sweet","bitter","fruit","dark","ale","ipa","double","wheat","smooth"]
    for beer_type in beers:
        text = "{0}[^ ]*".format(beer_type)
        df[beer_type] = df["description"].str.contains(text,
                                                       regex=True,
                                                       flags=re.IGNORECASE)
    return df

def stepwise(df):
    MSE_per_col = pd.Series(index = df.columns)
    MSE_per_col.drop(["available","description","name","glass"],inplace=True)
    for features in df.columns:
        if (features not in ["available","description","name","glass"]):
            X_dict = pd.DataFrame(df[features]).to_dict(orient="records")
            y = df[output]
            MSE_per_col[features] = (np.mean(
                -cross_val_score(pipeline, X_dict, y.values.ravel(), cv=10, scoring="neg_mean_squared_error")
            ))
    return MSE_per_col

def eval_models(df,features_list):
    MSE_per_col = pd.Series(index= ["model{0}".format(i) for i in range(1,len(features_list)+1)])
    counter = 1
    for features in features_list:
        X_dict = pd.DataFrame(df[features]).to_dict(orient="records")
        y = df[output]
        MSE_per_col["model{0}".format(counter)] = (np.mean(
            -cross_val_score(pipeline, X_dict, y.values.ravel(), cv=10, scoring="neg_mean_squared_error")
        ))
        counter += 1
    return MSE_per_col

def random_cv(df,model):   
    vec = DictVectorizer(sparse=False)
    vec.fit(df[model].to_dict(orient="records"))
    train_features = vec.transform(df[model].to_dict(orient="records"))
    train_labels = df["ibu"]
    

    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

    max_features = ['auto', 'log2']

    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)

    min_samples_split = [2, 5, 10]

    min_samples_leaf = [1, 2, 4]

    bootstrap = [True, False]

    learning_rate = [0.5,1,2]
    loss = ['exponential','square']
    random_grid = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate,
                   'loss': loss}
                   
    rf = AdaBoostRegressor(RandomForestRegressor(bootstrap= True,
                             max_depth= 10,
                             max_features= 'auto',
                             min_samples_leaf= 6, 
                             min_samples_split= 6,
                             n_estimators= 400),
                          n_estimators=300)
    rf_random = RandomizedSearchCV(estimator = rf, 
                                   param_distributions = random_grid,
                                   n_iter = 30, cv = 3,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs = -1)
    rf_random.fit(train_features,train_labels)
    return rf_random.best_params_

def gridSearch(param_grid,model):
    vec = DictVectorizer(sparse=False)
    vec.fit(df[model].to_dict(orient="records"))
    train_features = vec.transform(df[model].to_dict(orient="records"))
    train_labels = df["ibu"]

    rf = RandomForestRegressor()

    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                              cv = 10, n_jobs = -1, verbose = 2)
    grid_search.fit(train_features, train_labels)
    return grid_search.best_params_

In [12]:
df = pd.read_csv("/Users/ramanyakkala/Downloads/beer_train.csv")
df_test = pd.read_csv("/Users/ramanyakkala/Downloads/beer_test.csv")
df.head()

Unnamed: 0,id,abv,available,description,glass,ibu,isOrganic,name,originalGravity,srm
0,0,8.2,"Available at the same time of year, every year.",A Belgian-Abbey-Style Tripel that is big in al...,,31.0,N,LoonyToonTripel,1.07,8
1,1,5.7,"Available at the same time of year, every year.",Covert Hops is a crafty ale. Its stealthy dark...,Pint,45.0,N,Covert Hops,1.056,35
2,2,5.8,"Available at the same time of year, every year.",This is a traditional German-style Marzen char...,Mug,25.0,N,Oktoberfest,1.048,10
3,3,5.5,Available year round as a staple beer.,A West Coast-Style Pale Ale balancing plenty o...,Pint,55.0,N,Pale Ale,1.044,5
4,4,4.8,Available year round as a staple beer.,This Bombshell has a tantalizing crisp and cle...,Pint,11.4,N,Head Turner Blonde Ale,1.045,3


In [13]:
df["description"] = df["description"].fillna("Missing")
df_test["description"] = df_test["description"].fillna("Missing")
df["ibu"] = df["ibu"].astype(int)

In [14]:
add_beer_class(df).head(1)
add_beer_class(df_test).head(1)

Unnamed: 0,id,abv,available,description,glass,ibu,isOrganic,name,originalGravity,srm,...,blonde ale,sweet,bitter,fruit,dark,ale,ipa,double,wheat,smooth
0,6000,10.0,Limited availability.,a classic belgian trappist style strong ale wi...,Tulip,,N,She WILL!,1.084,17,...,False,False,False,False,False,True,False,False,False,False


In [8]:
vec = DictVectorizer(sparse = False)
scaler = StandardScaler()
model = RandomForestRegressor(n_estimators = 100)
pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])

output = ["ibu"]

stepwise = stepwise(df).sort_values()

In [9]:
stepwise.index

Index(['ibu', 'originalGravity', 'ipa', 'abv', 'srm', 'double', 'hop', 'wheat',
       'light', 'lager', 'bitter', 'sweet', 'ale', 'pale', 'blonde ale',
       'smooth', 'amber', 'porter', 'stout', 'fruit', 'dark', 'malt',
       'isOrganic', 'id'],
      dtype='object')

In [19]:
output = ["ibu"]

model1 =["abv","originalGravity","hop","ipa","srm"]
model2 =["abv","originalGravity","hop","ipa","srm","double"]
model3 =["abv","originalGravity","srm"]
model4 =["abv","originalGravity","hop"]
model5 =["abv","originalGravity","double"]
model6 =["abv","originalGravity","ipa"]
model7 =["abv","originalGravity","hop","ipa","wheat"]
model8 =["abv","originalGravity","hop","ipa"]

features_list = [model1,model2,model3,model4,model5,model6,model7,model8]


In [11]:
eval_model = eval_models(df,features_list)

In [12]:
eval_model.sort_values()

model1    488.919728
model2    499.259954
             ...    
model8    569.589359
model5    627.046831
Length: 8, dtype: float64

Model 1 has performed the best

In [22]:
best_params = random_cv(df,model1)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 213.1min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 556.0min finished


In [23]:
best_params

{'n_estimators': 400, 'loss': 'square', 'learning_rate': 1}

In [104]:
vec = DictVectorizer(sparse = False)
scaler = StandardScaler()
model = RandomForestRegressor(n_estimators=200,
                              min_samples_split= 5,
                              min_samples_leaf= 4,
                              max_features= 'auto',
                              max_depth= 10,
                             bootstrap= True)
pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])

eval_models(df,[model1])

model1    439.282865
dtype: float64

In [19]:
param_grid = {'n_estimators': [225,150,250,275],
                              'min_samples_split': [4,5,6,7],
                              'min_samples_leaf': [4,5,6,7],
                              'max_features': ['auto'],
                              'max_depth': [8,10,12],
                              'bootstrap': [True]}
grid = gridSearch(param_grid,model1)

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 32.2min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 43.7min finished


In [25]:
grid

{'bootstrap': True,
 'max_depth': 12,
 'max_features': 'auto',
 'min_samples_leaf': 7,
 'min_samples_split': 7,
 'n_estimators': 150}

In [27]:
vec = DictVectorizer(sparse = False)
scalers = [StandardScaler(), MinMaxScaler(),RobustScaler(quantile_range=(25, 75)),Normalizer(),QuantileTransformer()]
model = AdaBoostRegressor(RandomForestRegressor(bootstrap= True,
                             max_depth= 10,
                             max_features= 'auto',
                             min_samples_leaf= 6, 
                             min_samples_split= 6,
                             n_estimators= 400),
                          n_estimators = 400, 
                          loss = 'square',
                          learning_rate = 1)
pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])

for scaler in scalers:
    print(eval_models(df,[model1]))

model1    626.964416
dtype: float64
model1    458.346101
dtype: float64
model1    462.95106
dtype: float64
model1    459.443184
dtype: float64


KeyboardInterrupt: 

StandardScaler()

In [30]:
grid

{'bootstrap': True,
 'max_depth': 12,
 'max_features': 'auto',
 'min_samples_leaf': 7,
 'min_samples_split': 7,
 'n_estimators': 150}

In [24]:
X_train_dict = df[model1].to_dict(orient="records")
X_new_dict = df_test[model1].to_dict(orient="records")
y_train = df[output]

vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

scaler = QuantileTransformer()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

model = AdaBoostRegressor(RandomForestRegressor(bootstrap= True,
                             max_depth= 10,
                             max_features= 'auto',
                             min_samples_leaf= 6, 
                             min_samples_split= 6,
                             n_estimators= 400),
                          n_estimators = 400, 
                          loss = 'square',
                          learning_rate = 1)

#model = RandomForestRegressor(bootstrap= True,
#                             max_depth= 10,
#                             max_features= 'auto',
#                             min_samples_leaf= 6, 
#                             min_samples_split= 6,
#                             n_estimators= 400)

model.fit(X_train_sc, y_train.values.ravel())

df_output = pd.DataFrame(index=range(1,4754))
df_output["id"] = range(6000,10753)
df_output["ibu"] = model.predict(X_new_sc)

In [26]:
df_output.to_csv("submission.csv",index=False)

In [25]:
df_output.head()

Unnamed: 0,id,ibu
1,6000,38.645253
2,6001,28.593148
3,6002,23.384383
4,6003,84.412726
5,6004,36.060252
