In [None]:
#time to cluster my data into groups to find out if anything meaningful can come from it.
import requests
import re
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import nltk
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the beer training set from the CSV file in the working directory.
beer_train = pd.read_csv(filepath_or_buffer='beer_train.csv')


In [None]:
# Expose the 'id' column under the name 'beer_id' (rebinds the frame rather
# than mutating it in place).
beer_train = beer_train.rename(columns={'id': 'beer_id'})

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Columns the clusters are built from: four beer attributes followed by
# eight hop measurements.
cluster_features = [
    'ibu_x', 'abv_x', 'originalGravity_x', 'srmId_x',
    'hop_alphaacid', 'hop_betaacid', 'hop_caryophyllene', 'hop_myrcene',
    'hop_humulene', 'hop_cohumulone', 'hop_geraniol', 'hop_totaloil',
]
# Keep only the rows whose country code is 840, restricted to those columns.
X = beer_train.loc[beer_train['loc_country_code'] == 840, cluster_features]

In [None]:
# Scan candidate cluster counts (2..19) and record the silhouette score of
# each fit so the best number of clusters can be read off the curve.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
k_range = range(2, 20)
scores = []
for k in k_range:
    # Fixed seed: KMeans starts from random centroids, so without it the
    # curve changes on every run. Seed 1 matches the final clustering fit
    # later in the notebook.
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X_scaled)
    scores.append(metrics.silhouette_score(X_scaled, km.labels_))

In [None]:
# Silhouette coefficient against the number of clusters tried above.
plt.plot(k_range, scores)
plt.ylabel('Silhouette Coefficient')
plt.xlabel('Number of clusters')
plt.grid(True)

In [None]:
# Create clusters only for beers from the US (country code 840).
# .copy() yields an independent frame, so adding the 'cluster' column below
# does not write into a slice of beer_train (avoids SettingWithCopyWarning).
beers_US_hops = beer_train[beer_train['loc_country_code'] == 840].copy()

# X4 was never defined in the notebook, so this cell failed on a fresh kernel.
# Rebuild it from the same feature columns as the silhouette scan.
# NOTE(review): assumes X still holds the clustering feature frame built
# earlier (it is rebound further down) - run cells top to bottom.
X4 = beers_US_hops[X.columns]

scaler = StandardScaler()
X4_scaled = scaler.fit_transform(X4)
km = KMeans(n_clusters=12, random_state=1)
km.fit(X4_scaled)

# One label per US row, in row order.
beers_US_hops['cluster'] = km.labels_
beers_US_hops.groupby('cluster').mean()

In [None]:
# Persist the clustered US beers (including the new 'cluster' column) to CSV.
beers_US_hops.to_csv(path_or_buf='beers_US_hops.csv')

In [None]:
# Explore the clusters: how often each beer style name occurs in each one.
beers_US_hops.groupby('cluster')['style_name_x'].value_counts()

In [None]:
# Summary statistics of the core beer attributes for every cluster at once.
# This replaces twelve copy-pasted cells that differed only in the cluster
# number (0 through 11); grouping by 'cluster' covers all of them and keeps
# working if the number of clusters changes.
profile_cols = ['abv_x', 'ibu_x', 'originalGravity_x', 'srmId_x']
beers_US_hops.groupby('cluster')[profile_cols].describe()

In [None]:
# I now want to see which hops should be used to create a beer with my desired characteristics.
# To do that I will fit a prediction model that outputs a target cluster number
# and then check the most common hops for that cluster.
X = beers_US_hops[['abv_x', 'ibu_x', 'originalGravity_x', 'styleId_x', 'srmId_x']]
y = beers_US_hops['cluster']

# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print(X.shape)
print(y.shape)

In [None]:
# Test accuracy of KNN for 1..29 neighbours.
# Each setting is scored with 5-fold cross-validation. The previous version
# called knn.score(X, y) on the same rows it had just been fit on, which is
# training accuracy and cannot be compared with the cross-validated score
# reported by the grid search further down.
scores = []
for i in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=i)
    scores.append(cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean())

scores

In [None]:
# Test accuracy of a RandomForestClassifier for 1..99 trees.
# Scored with 5-fold cross-validation instead of on the training rows, so the
# numbers are comparable with the other models; random_state makes the curve
# reproducible between runs.
scores = []
for i in range(1, 100):
    rf = RandomForestClassifier(n_estimators=i, random_state=1)
    scores.append(cross_val_score(rf, X, y, cv=5, scoring='accuracy').mean())

scores

In [None]:
# Accuracy of the random forest as a function of the number of trees.
plt.ylabel("Accuracy")
plt.plot(range(1, 100), scores)
plt.xlabel("n_estimators")

In [None]:
#maybe I can do better by using many models at the same time and taking the best result.  I will do this with a VotingClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
# Named estimators combined by the VotingClassifier below.
# The random forest and the decision tree (which samples 4 features per
# split) are randomised, so both get a fixed seed to make the grid-search
# result reproducible.
models = [('rf', RandomForestClassifier(n_estimators=15, random_state=1)),
          ('decision', DecisionTreeClassifier(max_depth=5, max_features=4, random_state=1)),
          ('NB', MultinomialNB()),
          ('log', LogisticRegression())
         ]

In [None]:
# Compare soft and hard voting for the ensemble with 5-fold cross-validated
# accuracy.
vc = VotingClassifier(models)
params = {'voting': ['soft', 'hard']}
grid = GridSearchCV(estimator=vc, param_grid=params, scoring='accuracy', cv=5)
grid.fit(X, y)

In [None]:
# This model did better, so I will use it.
# print() with a single argument works on both Python 2 and Python 3.
print(grid.best_score_)
print(grid.best_params_)

In [None]:
# Fit the voting ensemble to my data and predict the cluster for one new beer.
# Values follow the column order of X:
# abv_x, ibu_x, originalGravity_x, styleId_x, srmId_x.
new_test2 = [7, 60, 1.080, 127, 15]

# Seeds on the randomised estimators keep the predicted cluster stable
# between runs.
model1 = RandomForestClassifier(n_estimators=16, random_state=1)
model2 = DecisionTreeClassifier(max_depth=4, max_features=4, random_state=1)
model3 = MultinomialNB()
model4 = LogisticRegression()
vc = VotingClassifier(estimators=[('rf', model1), ('dt', model2), ('nb', model3), ('log', model4)],
                      voting='soft')
vc.fit(X, y)

# predict() expects a 2-D input (one row per sample), so the single sample is
# wrapped in a list. Despite its name, `probabs` holds the predicted cluster
# label(s), not probabilities: an array with one element here.
probabs = vc.predict([new_test2])

# NOTE(review): with a single prediction this always yields index 0; kept only
# so the name stays defined for any later cell that reads it.
cluster_ing = np.where(probabs == probabs.max())
print(cluster_ing)
# Index the single prediction explicitly instead of int() on an array.
print(int(probabs[0]))

In [None]:
# Take the predicted cluster and show the most common hops used in that cluster.
# Rows are filtered to the predicted cluster directly, instead of grouping by
# a boolean and picking the True group with [1]. print() works on both
# Python 2 and Python 3.
predicted_cluster = int(probabs[0])
in_predicted_cluster = beers_US_hops['cluster'] == predicted_cluster
print(beers_US_hops.loc[in_predicted_cluster, 'hop_names'].value_counts().head())