# Random Forest w/ IBU Kaggle Compest

## Author: Anish Yakkala

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import nltk
import string

import re

pd.options.display.max_rows = 5
import nltk
from nltk.corpus import stopwords

In [2]:
def add_beer_class(df):
    remove = string.punctuation
    remove = remove.replace("-", "")
    pattern = "[{}]".format(remove)

    df.description = df.description.str.lower()
    df.description = df.description.str.replace(pattern,"")
    df.description = df.description.str.replace("\d","")
    
    df.name = df.name.str.lower()
    df.name = df.name.str.replace(pattern,"")


    beers = ["light","pale","hop","malt","light","imperial",
        "sweet","bitter","citrus","dark","ale",
             "ipa","double","wheat","beer","style",
            "lager","big","lambic","pilsner",
             "porter","stout","barleywine"]
    for beer_type in beers:
        text = "{0}[^ ]*".format(beer_type)
        df[beer_type] = df["description"].str.contains(text,
                                                       regex=True,
                                                       flags=re.IGNORECASE)
        df[beer_type] = df[beer_type].astype(int)
    bigrams = [("wheat", "beer"),("wheat", "ale"),("wheat", "ale"),
               ("pale", "ale"),("brown", "ale"),("double", "ipa")
              ,("noble", "hops"),("ipa", "brewed"),("west", "coast"),
               ("dry", "hopped"),("american","lager"),
              ("barley","wine"),("pale","lager"),("american","ipa"),
               ("imperial","ipa"),("imperial","stout")]
    
    for beer_type in bigrams:
        bi = set([beer_type])
        is_bi = bi.issubset
        text = "_".join(beer_type)
        df[text] = [is_bi(l) for l in df.bigrams.values.tolist()]
        df[text] = df[text].astype(int)
    tri = set([("india","pale", "ale")])
    is_tri = tri.issubset
    df["india_pale_ale"] = ([is_tri(l) for l in df.trigrams.
                             values.
                             tolist()])
    
    df["india_pale_ale"] = df["india_pale_ale"].astype(int)
    df["ipa_name"] = df["name"].str.contains("ipa")
    df["ipa_name"] = df["ipa_name"].astype(int)
    df["imperial_name"] = df["name"].str.contains("imperial")
    df["imperial_name"] = df["imperial_name"].astype(int)
    df["wheat_name"] = df["name"].str.contains("wheat")
    df["wheat_name"] = df["wheat_name"].astype(int)
    return df

def stepwise(df):
    MSE_per_col = pd.Series(index = df.columns)
    MSE_per_col.drop(["description","name","glass","bigrams","trigrams"],
                     inplace=True)
    for features in df.columns:
        if (features not in ["description","name","glass","bigrams","trigrams"]):
            X_dict = pd.DataFrame(df[features]).to_dict(orient="records")
            y = df[output]
            MSE_per_col[features] = (np.mean(
                -cross_val_score(pipeline, X_dict,
                                 y.values.ravel(),
                                 cv=10,
                                 scoring="neg_mean_squared_error")
            ))
    return MSE_per_col

def eval_models(df,features_list, cv = 10):
    MSE_per_col = pd.Series(index= ["model{0}".format(i) for i in range(1,len(features_list)+1)])
    counter = 1
    for features in features_list:
        X_dict = pd.DataFrame(df[features]).to_dict(orient="records")
        y = df["ibu"]
        MSE_per_col["model{0}".format(counter)] = (np.mean(
            -cross_val_score(pipeline,
                             X_dict,
                             y.values.ravel(),
                             cv = cv, scoring = "neg_mean_squared_error")
        ))
        counter += 1
    return MSE_per_col

def random_cv(df,model):   
    vec = DictVectorizer(sparse=False)
    vec.fit(df[model].to_dict(orient="records"))
    train_features = vec.transform(df[model].to_dict(orient="records"))
    train_labels = df["ibu"]
    scaler = QuantileTransformer()
    scaler.fit(train_features)
    train_features = scaler.transform(train_features)    

    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

    max_features = ['auto', 'log2','sqrt']

    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)

    min_samples_split = [2, 5, 10]

    min_samples_leaf = [1, 2, 4]

    bootstrap = [True, False]

    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

    rf = RandomForestRegressor()
    rf_random = RandomizedSearchCV(estimator = rf,
                                   param_distributions = random_grid,
                                   n_iter = 100,
                                   cv = 3,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs = -1)
    
    rf_random.fit(train_features,train_labels)
    return rf_random.best_params_

def gridSearch(param_grid,model):
    vec = DictVectorizer(sparse=False)
    vec.fit(df[model].to_dict(orient="records"))
    train_features = vec.transform(df[model].to_dict(orient="records"))
    train_labels = df["ibu"]
    scaler = QuantileTransformer()
    scaler.fit(train_features)
    train_features = scaler.transform(train_features)

    rf = RandomForestRegressor()

    grid_search = GridSearchCV(estimator = rf, 
                               param_grid = param_grid, 
                               cv = 5, 
                               n_jobs = -1, 
                               verbose = 2)
    grid_search.fit(train_features, train_labels)
    return grid_search.best_params_

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc,arg = None):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = ([token for token in tokens if 
                        token not in stop_words])
    # re-create document from filtered tokens
    if arg == "bigrams":
        return get_bigrams(filtered_tokens)
    if arg == "trigrams":
        return get_trigrams(filtered_tokens)
    doc = ' '.join(filtered_tokens)
    return doc

def get_trigrams(words):
    return list(zip(words[:-2], words[1:-1],words[2:]))

def get_bigrams(words):
    return list(zip(words[:-2], words[1:-1]))

def predict_spam(new_text,k=30):
    # Get the TF-IDF vector for the new text.
    x_new = vec.transform([new_text])[0, :]
    dot = x_new.multiply(X_train).sum(axis=1)
    x_new_len = np.sqrt(x_new.multiply(x_new).sum())
    denom = x_new_len * X_train_len
    cos_similarities = pd.DataFrame(dot / (denom))[0]
    classif = y_train[(cos_similarities.
                       sort_values(ascending=False)[:k].
                       index)].mean()
    return classif

def test_k_tf(feature,k,do_gram = False):
    if do_gram:
        vec = TfidfVectorizer(norm=False,
                              ngram_range=(2,2),
                              stop_words='english')
    else:
        vec = TfidfVectorizer(norm=False)
    y_train = df["ibu"]
    scaler = Normalizer()
    model = KNeighborsRegressor(n_neighbors=k,
                                metric='euclidean')
    pipeline = Pipeline([("vectorizer",vec),
                         ("scaler",scaler),
                         ("fit",model)])
    return(np.mean(
            -cross_val_score(pipeline,
                             df[feature],
                             y_train.values.ravel(), 
                             cv=5, scoring="neg_mean_squared_error")
        ))

In [None]:
df = pd.read_csv("/Users/ramanyakkala/Downloads/beer_train.csv")
df_test = pd.read_csv("/Users/ramanyakkala/Downloads/beer_test.csv")

In [7]:
df["description"] = df["description"].fillna("Missing")
df_test["description"] = df_test["description"].fillna("Missing")

df["name"] = df["name"].fillna("Missing")
df_test["name"] = df_test["name"].fillna("Missing")

In [8]:
df["bigrams"] = (df["description"].
                 apply(lambda x: normalize_document(x,
                                                    arg="bigrams")))
df["trigrams"] = (df["description"].
                  apply(lambda x: normalize_document(x,
                                                     arg="trigrams")))

df_test["bigrams"] = (df_test["description"].
                      apply(lambda x: normalize_document(x,
                                                         arg="bigrams")))
df_test["trigrams"] = (df_test["description"].
                       apply(lambda x: normalize_document(x,
                                                          arg="trigrams")))



In [None]:
add_beer_class(df)
add_beer_class(df_test)

In [12]:
vec = TfidfVectorizer(norm=False)
vec.fit(df["description"])
X_train = vec.transform(df["description"])
X_train_len = np.sqrt(X_train.multiply(X_train).sum(axis=1))
y_train = df["ibu"]
df["predicted_ibu"] = df["description"].apply(lambda x: 
                                              predict_spam(x,30))

vec = TfidfVectorizer(norm=False)
vec.fit(df["name"])
X_train = vec.transform(df["name"])
X_train_len = np.sqrt(X_train.multiply(X_train).sum(axis=1))
y_train = df["ibu"]
df["predicted_ibu_name"] = df["name"].apply(lambda x: 
                                            predict_spam(x,30))

vec = TfidfVectorizer(norm=False,ngram_range=(2,2))
vec.fit(df["description"])
X_train = vec.transform(df["description"])
X_train_len = np.sqrt(X_train.multiply(X_train).sum(axis=1))
y_train = df["ibu"]
df["predicted_ibu_ngram"] = df["description"].apply(lambda x: 
                                                    predict_spam(x,30))

In [13]:
vec = DictVectorizer(sparse = False)
scaler = QuantileTransformer()
model = RandomForestRegressor(n_estimators = 100)
pipeline = Pipeline([("vectorizer",vec),
                     ("scaler",scaler),
                     ("fit",model)])

output = ["ibu"]

stepwises = stepwise(df).sort_values()

stepwises.index


Index(['ibu', 'originalGravity', 'predicted_ibu', 'predicted_ibu_ngram', 'ipa',
       'abv', 'ipa_name', 'predicted_ibu_name', 'srm', 'double', 'hop',
       'double_ipa', 'imperial', 'imperial_ipa', 'india_pale_ale', 'wheat',
       'citrus', 'light', 'big', 'lager', 'imperial_name', 'available',
       'ipa_brewed', 'wheat_beer', 'wheat_name', 'bitter', 'pale_ale', 'style',
       'dry_hopped', 'west_coast', 'beer', 'wheat_ale', 'barleywine',
       'imperial_stout', 'barley_wine', 'american_ipa', 'sweet', 'ale', 'pale',
       'pilsner', 'noble_hops', 'brown_ale', 'porter', 'stout',
       'american_lager', 'lambic', 'pale_lager', 'dark', 'malt', 'isOrganic',
       'id'],
      dtype='object')

In [17]:
scaler = QuantileTransformer()
output = ["ibu"]

model8 = ['originalGravity','predicted_ibu_ngram',
          'predicted_ibu_name', 'predicted_ibu', 'ipa', 'ipa_name', 'abv', 'srm',
          'double', 'hop', 'imperial_name', 'imperial', 'wheat', 'wheat_name',
          'light', 'citrus', 'big', 'lager', 'bitter', 'style', 'beer', 'sweet', 'ale', 'pale']

model9 = ['originalGravity', 'predicted_ibu', 'predicted_ibu_ngram', 'ipa',
       'abv', 'ipa_name', 'predicted_ibu_name', 'srm', 'double', 'hop',
       'double_ipa', 'imperial', 'imperial_ipa', 'india_pale_ale', 'wheat',
       'citrus', 'light', 'big', 'lager', 'imperial_name', 'available',
       'ipa_brewed', 'wheat_beer', 'wheat_name', 'bitter', 'pale_ale', 'style',
       'dry_hopped', 'west_coast', 'beer', 'wheat_ale', 'barleywine',
       'imperial_stout', 'barley_wine']


model11 = ['originalGravity', 'predicted_ibu', 'predicted_ibu_ngram', 'ipa',
       'abv', 'ipa_name', 'predicted_ibu_name', 'srm', 'double', 'hop',
       'double_ipa', 'imperial', 'imperial_ipa', 'india_pale_ale', 'wheat',
       'citrus', 'light', 'big', 'lager', 'imperial_name', 'available',
       'ipa_brewed', 'wheat_beer', 'wheat_name', 'bitter', 'pale_ale', 'style',
       'dry_hopped', 'west_coast', 'beer', 'wheat_ale', 'barleywine',
       'imperial_stout', 'barley_wine', 'american_ipa', 'sweet', 'ale', 'pale',
       'pilsner', 'noble_hops', 'brown_ale'] 

model12 = ['originalGravity', 'predicted_ibu', 'predicted_ibu_ngram', 'ipa',
       'abv', 'ipa_name', 'predicted_ibu_name', 'srm', 'double', 'hop',
       'double_ipa', 'imperial', 'imperial_ipa', 'india_pale_ale', 'wheat',
       'citrus', 'light', 'big', 'lager', 'imperial_name', 'available',
       'ipa_brewed', 'wheat_beer', 'wheat_name', 'bitter', 'pale_ale', 'style',
       'dry_hopped', 'west_coast', 'beer', 'wheat_ale', 'barleywine',
       'imperial_stout', 'barley_wine', 'american_ipa', 'sweet', 'ale', 'pale',
       'pilsner', 'noble_hops', 'brown_ale', 'porter', 'stout',
       'american_lager']

features_list = [model8,model9,model11,model12]

eval_model = eval_models(df,features_list)

eval_model.sort_values()



model3    381.544940
model2    383.751835
model4    383.955754
model1    395.592970
dtype: float64

In [20]:
vec = DictVectorizer(sparse=False)
scaler = QuantileTransformer()
model = KNeighborsRegressor(n_neighbors=40,metric='manhattan')
pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])

model8 = ['originalGravity', 'predicted_ibu', 
          'predicted_ibu_ngram', 'ipa',
          'abv', 'ipa_name', 'predicted_ibu_name']

model9 = ['originalGravity', 'ipa', 'predicted_ibu',
          'ipa_name', 'abv',
       '  predicted_ibu_name','predicted_ibu_ngram']


model10 = ['originalGravity', 'predicted_ibu', 'predicted_ibu_ngram',
           'ipa','abv', 'ipa_name', 'predicted_ibu_name',
           'srm', 'double', 'hop','double_ipa', 'imperial',
           'imperial_ipa']

model11 = ['originalGravity', 'predicted_ibu', 'predicted_ibu_ngram',
           'ipa','predicted_ibu_name', 'abv',
           'ipa_name', 'double']

features_list = [model8,model9,model10,model11]
output = ["ibu"]

for features in features_list:
    X_dict = pd.DataFrame(df[features]).to_dict(orient="records")
    y = df[output]
    print((np.mean(
        -cross_val_score(pipeline, X_dict, y.values.ravel(), cv=10,
                         scoring="neg_mean_squared_error")
    )))

375.8438802395352
375.8438802395352
428.63465493584846
379.91109906940176


In [21]:
features = ['originalGravity', 'ipa', 'predicted_ibu', 
            'ipa_name', 'abv',
            'predicted_ibu_name','predicted_ibu_ngram']

output = ["ibu"]

X_train_dict = df[features].to_dict(orient="records")
X_new_dict = df[features].to_dict(orient="records")
y_train = df[output]

vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

scaler = QuantileTransformer()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

model = KNeighborsRegressor(n_neighbors=40,metric='manhattan')
model.fit(X_train_sc, y_train)

df["KNN_predict"] = model.predict(X_new_sc)



In [22]:
vec = DictVectorizer(sparse = False)
scaler = QuantileTransformer()
model = RandomForestRegressor(n_estimators = 100)
pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])
output = ["ibu"]

model7 = ['originalGravity', 'predicted_ibu', 'predicted_ibu_ngram', 'ipa',
       'abv', 'ipa_name', 'predicted_ibu_name', 'srm', 'double', 'hop',
       'double_ipa', 'imperial', 'imperial_ipa', 'india_pale_ale', 'wheat',
       'citrus', 'light', 'big', 'lager', 'imperial_name', 'available',
       'ipa_brewed', 'wheat_beer', 'wheat_name', 'bitter', 'pale_ale', 'style',
       'dry_hopped', 'west_coast', 'beer', 'wheat_ale', 'barleywine',
       'imperial_stout', 'barley_wine', 'american_ipa', 'sweet', 'ale', 'pale',
       'pilsner', 'noble_hops', 'brown_ale'] 

model8 = ['originalGravity', 'predicted_ibu','KNN_predict', 'predicted_ibu_ngram', 'ipa',
       'abv', 'ipa_name', 'predicted_ibu_name', 'srm', 'double', 'hop',
       'double_ipa', 'imperial', 'imperial_ipa', 'india_pale_ale', 'wheat',
       'citrus', 'light', 'big', 'lager', 'imperial_name', 'available',
       'ipa_brewed', 'wheat_beer', 'wheat_name', 'bitter', 'pale_ale', 'style',
       'dry_hopped', 'west_coast', 'beer', 'wheat_ale', 'barleywine',
       'imperial_stout', 'barley_wine', 'american_ipa', 'sweet', 'ale', 'pale',
       'pilsner', 'noble_hops', 'brown_ale'] 


features_list = [model7,model8]

eval_model = eval_models(df,features_list)

eval_model.sort_values()

model2    375.791049
model1    392.268934
dtype: float64

Model 8 has performed the best

Now I'll do random cv to find better values for our hyper paramaters

In [23]:
model8 = ['originalGravity', 'predicted_ibu','KNN_predict',
          'predicted_ibu_ngram',
          'ipa','abv', 'ipa_name', 'predicted_ibu_name',
          'srm', 'double', 'hop',
          'double_ipa', 'imperial', 'imperial_ipa', 'india_pale_ale',
          'wheat','citrus', 'light',
          'big', 'lager', 'imperial_name',
          'available','ipa_brewed', 'wheat_beer',
          'wheat_name', 'bitter', 'pale_ale', 'style',
          'dry_hopped', 'west_coast', 'beer',
          'wheat_ale', 'barleywine','imperial_stout',
          'barley_wine', 'american_ipa', 'sweet',
          'ale', 'pale','pilsner',
          'noble_hops', 'brown_ale'] 

In [27]:
best_params = random_cv(df,model8)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 43.7min finished


In [28]:
best_params


{'n_estimators': 1400,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

Let's see how that does

In [35]:
vec = DictVectorizer(sparse = False)
scaler = QuantileTransformer()
model = RandomForestRegressor(n_estimators=1400,
                              min_samples_split= 5,
                              min_samples_leaf= 1,
                              max_features= 'sqrt',
                              max_depth= None,
                             bootstrap= True)
pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])

eval_models(df,[model8])



model1    354.632622
dtype: float64

Now let's do grid search and focus in on what values will be perfect

In [30]:
param_grid = {'n_estimators': [1300,1400,1500],
                              'min_samples_split': [4,5,6],
                              'min_samples_leaf': [1,2,3],
                              'max_features': ['sqrt'],
                              'max_depth': [None,5,10],
                              'bootstrap': [True]}
grid = gridSearch(param_grid,model8)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 27.5min finished


In [31]:
grid


{'bootstrap': True,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 1400}

Let's see how that does

In [38]:
vec = DictVectorizer(sparse = False)
scaler = StandardScaler()
model = RandomForestRegressor(n_estimators=1400,
                              min_samples_split= 6,
                              min_samples_leaf= 2,
                              max_features= 'sqrt',
                              max_depth= None,
                             bootstrap= True)
pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])

eval_models(df,[model8])

model1    354.074357
dtype: float64

Now let's try to find the best scalar

In [37]:
vec = DictVectorizer(sparse = False)
scalers = [StandardScaler(), 
           MinMaxScaler(),
           RobustScaler(quantile_range=(25, 75)),
           MaxAbsScaler(),
           QuantileTransformer()]

model = RandomForestRegressor(n_estimators=1400,
                              min_samples_split= 6,
                              min_samples_leaf= 2,
                              max_features= 'sqrt',
                              max_depth= None,
                             bootstrap= True)
pipeline = Pipeline([("vectorizer",vec),
                     ("scaler",scaler),
                     ("fit",model)])

for scaler in scalers:
    pipeline = Pipeline([("vectorizer",vec),
                         ("scaler",scaler),
                         ("fit",model)])
    print(eval_models(df,[model8]))
    
    

model1    354.216207
dtype: float64
model1    354.974134
dtype: float64
model1    354.398881
dtype: float64
model1    354.625004
dtype: float64
model1    354.485805
dtype: float64


StandardScaler is best

Now lets add the remaning features to our test set

In [39]:
vec = TfidfVectorizer(norm=False)
vec.fit(df["description"])
X_train = vec.transform(df["description"])
X_train_len = np.sqrt(X_train.multiply(X_train).sum(axis=1))
y_train = df["ibu"]
df_test["predicted_ibu"] = (df_test["description"].
                            apply(predict_spam))

vec = TfidfVectorizer(norm=False,ngram_range=(2,2))
vec.fit(df["description"])
X_train = vec.transform(df["description"])
X_train_len = np.sqrt(X_train.multiply(X_train).sum(axis=1))
y_train = df["ibu"]
df_test["predicted_ibu_ngram"] = (df_test["description"].
                                  apply(predict_spam))

vec = TfidfVectorizer(norm=False)
vec.fit(df["name"])
X_train = vec.transform(df["name"])
X_train_len = np.sqrt(X_train.multiply(X_train).sum(axis=1))
y_train = df["ibu"]
df_test["predicted_ibu_name"] = (df_test["name"].
                                 apply(predict_spam))

In [40]:
features = ['originalGravity', 'ipa', 'predicted_ibu',
            'ipa_name', 'abv','predicted_ibu_name']


output = ["ibu"]

X_train_dict = df[features].to_dict(orient="records")
X_new_dict = df_test[features].to_dict(orient="records")
y_train = df[output]

vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

scaler = QuantileTransformer()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

model = KNeighborsRegressor(n_neighbors=40,metric='manhattan')
model.fit(X_train_sc, y_train)

df_test["KNN_predict"] = model.predict(X_new_sc)

Let's make the prediction

In [41]:
X_train_dict = df[model8].to_dict(orient="records")
X_new_dict = df_test[model8].to_dict(orient="records")
y_train = df[output]

vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

model = RandomForestRegressor(n_estimators=1400,
                              min_samples_split= 6,
                              min_samples_leaf= 2,
                              max_features= 'sqrt',
                              max_depth= None,
                             bootstrap= True)

model.fit(X_train_sc, y_train.values.ravel())

df_output = pd.DataFrame(index=range(1,4754))
df_output["id"] = range(6000,10753)
df_output["ibu"] = model.predict(X_new_sc)

Let's read in my best score to average the score as an ensemble method

In [45]:
average_vals = pd.read_csv("average.csv")

In [46]:
average_vals

Unnamed: 0,id,ibu
0,6000,39.359282
1,6001,38.821305
...,...,...
4751,10751,20.411814
4752,10752,54.581749


In [47]:
average_vals.index = average_vals.index + 1

In [50]:
df_output["ibu"] = (df_output["ibu"] + average_vals["ibu"])/2

In [52]:
df_output.to_csv("submission_nine.csv",index=False)

## Summary

I used Random Forest since I felt I was plateuing with KNeighrest Neighbors. I also hit a wall with random forest and could not break 18.9 . I think if there was something I could have improved it is how I do my feature selection. 