In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import nltk
import string

import re

pd.options.display.max_rows = 5
import nltk
from nltk.corpus import stopwords

In [35]:
def add_beer_class(df):
    """Add 0/1 beer-vocabulary indicator columns to ``df``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``description`` and ``name`` (text),
        ``bigrams`` (an iterable of 2-tuples per row) and ``trigrams``
        (an iterable of 3-tuples per row).

    Returns
    -------
    pandas.DataFrame
        The same object, with ``description`` and ``name`` lower-cased and
        stripped of punctuation (hyphens are kept), digits removed from
        ``description``, and one integer indicator column per keyword,
        bigram, trigram and name keyword.
    """
    # Remove every punctuation character except the hyphen.  A translation
    # table is used instead of a regex character class so the result does not
    # depend on the pandas default for ``regex=`` or on escaping of ``]``,
    # ``\`` and ``^`` inside the class.
    punctuation_table = str.maketrans("", "", string.punctuation.replace("-", ""))

    df["description"] = (df["description"]
                         .str.lower()
                         .str.translate(punctuation_table)
                         .str.replace(r"\d", "", regex=True))
    df["name"] = df["name"].str.lower().str.translate(punctuation_table)

    beers = ["light", "pale", "hop", "malt", "imperial",
             "sweet", "bitter", "citrus", "dark", "ale",
             "ipa", "double", "wheat", "beer", "style",
             "lager", "big", "lambic", "pilsner",
             "porter", "stout", "barleywine"]
    for beer_type in beers:
        # The description is already lower-case, so a plain substring test is
        # enough; ``na=False`` maps missing descriptions to 0 instead of
        # failing in ``astype(int)``.
        df[beer_type] = (df["description"]
                         .str.contains(beer_type, regex=False, na=False)
                         .astype(int))

    bigrams = [("wheat", "beer"), ("wheat", "ale"),
               ("pale", "ale"), ("brown", "ale"), ("double", "ipa"),
               ("noble", "hops"), ("ipa", "brewed"), ("west", "coast"),
               ("dry", "hopped"), ("american", "lager"),
               ("barley", "wine"), ("pale", "lager"), ("american", "ipa"),
               ("imperial", "ipa"), ("imperial", "stout")]
    for pair in bigrams:
        df["_".join(pair)] = [int(pair in grams) for grams in df["bigrams"]]

    india_pale_ale = ("india", "pale", "ale")
    df["india_pale_ale"] = [int(india_pale_ale in grams)
                            for grams in df["trigrams"]]

    for keyword in ("ipa", "imperial", "wheat"):
        df[keyword + "_name"] = (df["name"]
                                 .str.contains(keyword, regex=False, na=False)
                                 .astype(int))
    return df

def stepwise(df):
    """Score every candidate column of ``df`` as a single-feature model.

    Each column other than the free-text / n-gram columns is converted to a
    list of one-key record dicts and evaluated with 10-fold cross-validation.

    NOTE(review): relies on two module-level names that are not defined in
    this function: ``output`` (the target column label) and ``pipeline`` (the
    estimator passed to ``cross_val_score``).  Both must exist before calling.
    NOTE(review): the target column itself is not excluded from the
    candidates, so it is scored as a feature too -- confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the candidate feature columns and the target column.

    Returns
    -------
    pandas.Series
        Mean cross-validated MSE, indexed by column name.
    """
    excluded = ["description", "name", "glass", "bigrams", "trigrams"]
    candidates = [col for col in df.columns if col not in excluded]
    # Explicit float dtype: an empty Series without dtype is object-typed on
    # recent pandas versions.
    MSE_per_col = pd.Series(index=candidates, dtype="float64")
    y = df[output].values.ravel()  # loop-invariant target vector
    for features in candidates:
        X_dict = df[[features]].to_dict(orient="records")
        fold_scores = cross_val_score(pipeline, X_dict, y,
                                      cv=10,
                                      scoring="neg_mean_squared_error")
        # The scorer returns negated MSE; flip the sign back.
        MSE_per_col[features] = np.mean(-fold_scores)
    return MSE_per_col

def eval_models(df,features_list, cv = 10):
    """Cross-validate several feature subsets against the ``ibu`` column.

    NOTE(review): the estimator is the module-level ``pipeline``; it must be
    defined before this function is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every feature column and the ``ibu`` target.
    features_list : sequence
        One entry per model; each entry is a column selection accepted by
        ``df[...]``.
    cv : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    pandas.Series
        Mean cross-validated MSE indexed by ``model1``, ``model2``, ... in the
        order of ``features_list``.
    """
    labels = ["model{0}".format(i) for i in range(1, len(features_list) + 1)]
    # Explicit float dtype: an empty Series without dtype is object-typed on
    # recent pandas versions.
    MSE_per_col = pd.Series(index=labels, dtype="float64")
    y = df["ibu"].values.ravel()  # loop-invariant target vector
    for label, features in zip(labels, features_list):
        X_dict = pd.DataFrame(df[features]).to_dict(orient="records")
        fold_scores = cross_val_score(pipeline, X_dict, y,
                                      cv=cv,
                                      scoring="neg_mean_squared_error")
        # The scorer returns negated MSE; flip the sign back.
        MSE_per_col[label] = np.mean(-fold_scores)
    return MSE_per_col

def random_cv(df,model):
    """Randomised hyper-parameter search for a random-forest ``ibu`` model.

    The selected columns are turned into record dicts, vectorised with a
    dense ``DictVectorizer``, passed through a ``QuantileTransformer`` and
    used to fit a ``RandomizedSearchCV`` (100 candidates, 3 folds) over a
    ``RandomForestRegressor``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``ibu`` target.
    model : list
        Column labels to use as features.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    # Build the record dicts once and fit/transform in a single pass.
    records = df[model].to_dict(orient="records")
    train_features = DictVectorizer(sparse=False).fit_transform(records)
    train_labels = df["ibu"]
    train_features = QuantileTransformer().fit_transform(train_features)

    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

    # 1.0 means "use all features", which is what 'auto' meant for a
    # regressor; the string 'auto' is rejected by newer scikit-learn releases.
    max_features = [1.0, 'log2', 'sqrt']

    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)  # None = grow trees without a depth limit

    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': [2, 5, 10],
                   'min_samples_leaf': [1, 2, 4],
                   'bootstrap': [True, False]}

    rf_random = RandomizedSearchCV(estimator = RandomForestRegressor(),
                                   param_distributions = random_grid,
                                   n_iter = 100,
                                   cv = 3,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs = -1)

    rf_random.fit(train_features,train_labels)
    return rf_random.best_params_

def gridSearch(param_grid,model, data=None):
    """Exhaustive grid search for a random-forest ``ibu`` model.

    The selected columns are turned into record dicts, vectorised with a
    dense ``DictVectorizer``, passed through a ``QuantileTransformer`` and
    used to fit a 5-fold ``GridSearchCV`` over a ``RandomForestRegressor``.

    Parameters
    ----------
    param_grid : dict or list of dict
        Grid forwarded to ``GridSearchCV``.
    model : list
        Column labels to use as features.
    data : pandas.DataFrame, optional
        Frame containing the feature columns and the ``ibu`` target.  When
        omitted, the module-level ``df`` is used, which is how this function
        behaved before the parameter existed.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    frame = df if data is None else data

    # Build the record dicts once and fit/transform in a single pass.
    records = frame[model].to_dict(orient="records")
    train_features = DictVectorizer(sparse=False).fit_transform(records)
    train_labels = frame["ibu"]
    train_features = QuantileTransformer().fit_transform(train_features)

    grid_search = GridSearchCV(estimator = RandomForestRegressor(),
                               param_grid = param_grid,
                               cv = 5,
                               n_jobs = -1,
                               verbose = 2)
    grid_search.fit(train_features, train_labels)
    return grid_search.best_params_

# Shared tokenizer and English stop-word list used by normalize_document.
wpt = nltk.WordPunctTokenizer()
stop_words = stopwords.words("english")

def normalize_document(doc,arg = None):
    """Clean a document and return it as text, bigrams or trigrams.

    Every character that is neither an ASCII letter nor whitespace is
    removed, the text is lower-cased and stripped, tokenised with the
    module-level ``wpt`` tokenizer, and tokens found in the module-level
    ``stop_words`` are discarded.

    Parameters
    ----------
    doc : str
        Raw document text.
    arg : str, optional
        ``"bigrams"`` returns ``get_bigrams(tokens)``, ``"trigrams"`` returns
        ``get_trigrams(tokens)``; any other value returns the tokens joined by
        single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', doc)
    cleaned = letters_only.lower().strip()

    kept_tokens = []
    for token in wpt.tokenize(cleaned):
        if token in stop_words:
            continue
        kept_tokens.append(token)

    if arg == "bigrams":
        return get_bigrams(kept_tokens)
    elif arg == "trigrams":
        return get_trigrams(kept_tokens)
    else:
        return ' '.join(kept_tokens)

def get_trigrams(words):
    """Return every run of three consecutive items of ``words`` as a tuple.

    Sequences shorter than three items yield an empty list.
    """
    triples = []
    for start in range(len(words) - 2):
        triples.append((words[start], words[start + 1], words[start + 2]))
    return triples

def get_bigrams(words):
    """Return every pair of consecutive items of ``words`` as a tuple.

    Sequences shorter than two items yield an empty list.  ``zip`` stops at
    the shorter argument, so pairing ``words`` with ``words[1:]`` includes the
    final pair (the previous slicing dropped it).
    """
    return list(zip(words, words[1:]))

def predict_spam(new_text,k=30):
    """Average the training labels of the ``k`` rows most similar to a text.

    NOTE(review): depends on module-level ``vec``, ``X_train``,
    ``X_train_len`` and ``y_train``, none of which are defined in this
    function; ``X_train_len`` is presumably the per-row norm of ``X_train``
    -- confirm against the cell that builds it.

    Parameters
    ----------
    new_text : str
        Text to score.
    k : int, default 30
        Number of most-similar training rows whose labels are averaged.
    """
    # Vectorise the new text with the already-fitted vectorizer.
    query = vec.transform([new_text])[0, :]
    numerators = query.multiply(X_train).sum(axis=1)
    query_norm = np.sqrt(query.multiply(query).sum())
    scale = query_norm * X_train_len
    similarity = pd.DataFrame(numerators / (scale))[0]
    ranked = similarity.sort_values(ascending=False)
    nearest = ranked[:k].index
    return y_train[nearest].mean()

def test_k_tf(feature,k,do_gram = False):
    """Cross-validated MSE of a TF-IDF + k-NN regressor for ``ibu``.

    NOTE(review): reads the module-level ``df``; it must be defined before
    this function is called.

    Parameters
    ----------
    feature : str
        Label of the text column of ``df`` to vectorise.
    k : int
        Number of neighbours for ``KNeighborsRegressor``.
    do_gram : bool, default False
        When true, vectorise word bigrams with English stop words removed;
        otherwise use the default unigram vectoriser.

    Returns
    -------
    float
        Mean MSE over 5 cross-validation folds.
    """
    # ``norm=None`` disables the vectoriser's own normalisation (the
    # ``Normalizer`` step below handles it).  ``None`` is the documented
    # value; ``False`` is rejected by scikit-learn's parameter validation in
    # newer releases.
    if do_gram:
        vec = TfidfVectorizer(norm=None,
                              ngram_range=(2,2),
                              stop_words='english')
    else:
        vec = TfidfVectorizer(norm=None)
    y_train = df["ibu"]
    scaler = Normalizer()
    model = KNeighborsRegressor(n_neighbors=k,
                                metric='euclidean')
    pipeline = Pipeline([("vectorizer",vec),
                         ("scaler",scaler),
                         ("fit",model)])
    fold_scores = cross_val_score(pipeline,
                                  df[feature],
                                  y_train.values.ravel(),
                                  cv=5, scoring="neg_mean_squared_error")
    # The scorer returns negated MSE; flip the sign back.
    return np.mean(-fold_scores)

def clean_data(df, *argv):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    For each column label in ``argv`` the text is lower-cased, newlines and
    carriage returns are removed, hyphens become spaces, every character that
    is neither a word character nor whitespace is removed, and surrounding
    whitespace is stripped.  Afterwards, across the whole frame, missing
    values become the string ``"None"`` and empty strings become
    ``"missing"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    *argv : str
        Labels of the text columns to normalise.

    Returns
    -------
    pandas.DataFrame
    """
    df1 = df.copy()
    for series in argv:
        # ``regex=`` is spelled out on every call because the pandas default
        # differs between versions; only the last replacement is a pattern.
        df1[series] = (df1[series]
                       .str.lower()
                       .str.replace("\n", "", regex=False)
                       .str.replace("\r", "", regex=False)
                       .str.replace("-", " ", regex=False)
                       .str.replace(r"[^\w\s]", "", regex=True)
                       .str.strip())
    df1 = df1.fillna("None")
    df1 = df1.replace("", "missing")
    return df1

def explode(df, lst_cols, fill_value=''):
    """Expand list-valued columns so that each list item gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    lst_cols : str or list of str
        Label(s) of the columns whose cells hold lists.  Row lengths are taken
        from the first of these columns; the others must have matching
        lengths.
    fill_value : scalar, default ''
        Value written into missing cells of rows whose list was empty.
        Applied only when at least one list is empty.

    Returns
    -------
    pandas.DataFrame
        Frame with the original column order.  Rows that had an empty list
        are kept once, placed after the expanded rows with their original
        index label.
    """
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    repeated = {col: np.repeat(df[col].to_numpy(), lens) for col in idx_cols}
    # Flatten in pure Python: concatenating empty and non-empty lists with
    # numpy would mix dtypes.
    flattened = {col: [item for cell in df[col] for item in cell]
                 for col in lst_cols}
    expanded = pd.DataFrame(repeated).assign(**flattened)

    if not (lens > 0).all():
        # at least one list in cells is empty: keep those rows once
        # (pd.concat replaces DataFrame.append, removed in pandas 2.0)
        expanded = pd.concat([expanded, df.loc[lens == 0, idx_cols]],
                             sort=False).fillna(fill_value)

    return expanded.loc[:, df.columns]

In [36]:
def _load_crunchy_listing(csv_path):
    """Read a listing CSV and derive its list columns, rating and clean name.

    ``similar`` and ``tags`` are split on ``"::"`` into lists, ``agg_rating``
    is the star-weighted mean of the count columns ``"1"`` to ``"5"``, and
    ``name`` has hyphens replaced by spaces and is lower-cased.
    """
    listing = pd.read_csv(csv_path)

    for list_col in ("similar", "tags"):
        listing[list_col] = listing[list_col].apply(lambda joined: joined.split("::"))

    # Weighted sum of star counts (1*n1 + 2*n2 + ... + 5*n5) over total votes.
    weighted_votes = listing["1"]
    for stars in (2, 3, 4, 5):
        weighted_votes = weighted_votes + listing[str(stars)] * stars
    total_votes = listing[["1","2","3","4","5"]].sum(axis = 1)
    listing["agg_rating"] = weighted_votes / total_votes

    listing["name"] = listing["name"].str.replace("-"," ").str.lower()
    return listing


home = _load_crunchy_listing("crunchy_home.csv")

reviews = pd.read_csv("crunchy_review.csv")

main = _load_crunchy_listing("crunchy_main.csv")

In [37]:
# NOTE(review): absolute, machine-specific path -- move to a configurable
# data directory so the notebook runs on other machines.
ANIMELIST_CSV = "/Volumes/SD_Card/myanimelist/AnimeList.csv"

animelist = clean_data(pd.read_csv(ANIMELIST_CSV),"title", "title_english")
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of the original (SettingWithCopyWarning).
animelist = animelist[animelist["type"] == "TV"].copy()

# Each ``aired`` cell is the text of a dict with ``from`` / ``to`` keys.
# Parse the column once and expand it in one step instead of running
# ``apply(pd.Series)`` twice.
# NOTE(review): eval() executes whatever expression the CSV contains; if the
# file is not fully trusted, switch to ast.literal_eval.
aired = pd.DataFrame([dict(eval(raw)) for raw in animelist["aired"]],
                     index=animelist.index)
animelist["from"] = aired["from"]
animelist["to"] = aired["to"]

# Inner join: keep only the shows whose cleaned name equals the cleaned
# English title in the AnimeList data.
merge_anime  = main.merge(animelist[["title_english",
                                     "title","rating",
                                     "duration","from",
                                     "to","genre"]],how = "inner", left_on="name",
                          right_on="title_english")

In [124]:
# Keep only the rows whose duration is not the literal string "Unknown".
has_known_duration = merge_anime["duration"].ne("Unknown")
merge_anime = merge_anime.loc[has_known_duration]

In [125]:
# Work on a 10% random sample.  The seed is fixed so that a fresh
# "Restart & Run All" draws the same rows and reproduces the scores below.
merge_anime_sample = merge_anime.sample(frac = 0.10, random_state = 42)

In [126]:
# One row per entry of the "similar" list, then drop the per-star count
# columns and "agg_review".
exploded_sample = explode(merge_anime_sample, "similar")
training_data = exploded_sample.drop(columns=["1","2","3","4","5","agg_review"])

In [127]:
# Convert the "datetime" text (e.g. "Jan 05, 2018") into a proleptic
# Gregorian ordinal so it can be used as a numeric value.
# NOTE(review): this cell overwrites "datetime" in place, so it cannot be
# re-run without rebuilding training_data first.
raw_dates = training_data["datetime"].str.replace("\n", "").str.strip()
parsed_dates = pd.to_datetime(raw_dates, format='%b %d, %Y')
training_data["datetime"] = parsed_dates.apply(lambda date: date.toordinal())

In [130]:
# Keep the leading space-separated token of "duration" and store it as float.
# NOTE(review): only the first number is used, so a value with several units
# would lose everything after the first token -- confirm the data format.
leading_token = training_data["duration"].str.split(" ").map(lambda parts: parts[0])
training_data["duration"] = leading_token.astype("float64")

In [146]:
# Turn each "tags" cell into its str() text so the .str accessor and the
# text vectorisers can work on it.
training_data["tags"] = training_data["tags"].map(str)

In [148]:
# Normalise the free-text columns; clean_data also fills missing values
# across the whole frame.
text_columns = ["desc", "review", "summary", "tags"]
training_data = clean_data(training_data, *text_columns)

In [149]:
# Text feature columns plus the regression target ("agg_rating").
word_training_columns = ["similar", "desc", "tags", "review",
                         "summary", "genre", "agg_rating"]
word_training = training_data.loc[:, word_training_columns]

In [166]:
def stepwise(column, n_neighbors=30, cv=5):
    """Cross-validated MSE of a TF-IDF + k-NN model built on one text column.

    NOTE(review): this definition replaces any earlier function that is also
    named ``stepwise`` in this notebook.  It reads the module-level frames
    ``word_x_train`` and ``word_training``, which must exist before it is
    called.

    Parameters
    ----------
    column : str
        Label of the text column of ``word_x_train`` to vectorise.
    n_neighbors : int, default 30
        Number of neighbours for ``KNeighborsRegressor``.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    float
        Mean MSE over the folds when predicting ``agg_rating``.
    """
    pipeline = Pipeline([
        ("vectorizer", TfidfVectorizer()),
        ("scaler", Normalizer()),
        ("fit", KNeighborsRegressor(n_neighbors=n_neighbors, metric="euclidean")),
    ])
    fold_scores = cross_val_score(pipeline,
                                  word_x_train[column],
                                  word_training["agg_rating"],
                                  cv=cv,
                                  scoring="neg_mean_squared_error")
    # The scorer returns negated MSE; flip the sign back.
    return np.mean(-fold_scores)

In [167]:
# One row per text feature column (string index labels), scored on its own
# by stepwise().
column_names = list(word_x_train.columns)
columns = pd.DataFrame({"columns": column_names})
columns.index = columns.index.map(str)

columns["MSE"] = columns["columns"].apply(stepwise)

In [None]:
# Display the table of feature columns and their MSE values.
columns

In [150]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline


# Text columns that each get their own TF-IDF vocabulary.
text_fields = ["similar", "desc", "tags", "review", "summary", "genre"]


def _field_tfidf(field):
    """Build a pipeline that selects one column and TF-IDF encodes it."""
    select_field = FunctionTransformer(lambda frame: frame[field],
                                       validate=False)
    return Pipeline([('extract_field', select_field),
                     ('tfidf', TfidfVectorizer())])


# Side-by-side concatenation of the per-column TF-IDF matrices, in the order
# of text_fields; each entry is named "<field>_tfidf".
transformer = FeatureUnion([
    ("{0}_tfidf".format(field), _field_tfidf(field))
    for field in text_fields
])

In [151]:
# Feature frame: every text column, without the "agg_rating" target.
feature_columns = ["similar", "desc", "tags", "review", "summary", "genre"]
word_x_train = word_training.loc[:, feature_columns]

# Learn one TF-IDF vocabulary per column.
transformer.fit(word_x_train)

FeatureUnion(n_jobs=None,
       transformer_list=[('similar_tfidf', Pipeline(memory=None,
     steps=[('extract_field', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function <lambda> at 0x124be1a60>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          validate...      token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))]))],
       transformer_weights=None)

In [152]:
def _tfidf_vocabulary(field_pipeline):
    """Return the fitted vocabulary of a per-field pipeline as a list.

    The TF-IDF vectoriser is the second step of the pipeline.  Newer
    scikit-learn exposes ``get_feature_names_out`` (returning an array) and no
    longer has ``get_feature_names``; older releases only have the latter.
    """
    tfidf = field_pipeline.steps[1][1]
    if hasattr(tfidf, "get_feature_names_out"):
        return list(tfidf.get_feature_names_out())
    return list(tfidf.get_feature_names())


# Per-field pipelines of the fitted union, in declaration order.
field_pipelines = [field_pipeline
                   for _, field_pipeline in transformer.transformer_list]

similar_vocab = _tfidf_vocabulary(field_pipelines[0])
desc_vocab = _tfidf_vocabulary(field_pipelines[1])
tags_vocab = _tfidf_vocabulary(field_pipelines[2])
review_vocab = _tfidf_vocabulary(field_pipelines[3])
summary_vocab = _tfidf_vocabulary(field_pipelines[4])
genre_vocab = _tfidf_vocabulary(field_pipelines[5])

# Plain lists, so ``+`` concatenates them in the same order as the union's
# output columns.
vocab = similar_vocab + desc_vocab + tags_vocab + review_vocab + summary_vocab + genre_vocab

In [153]:
# Combined TF-IDF features -> per-row normalisation -> 30-nearest-neighbour
# regression with Euclidean distance.
pipeline = Pipeline(steps=[
    ("vectorizer", transformer),
    ("scaler", Normalizer()),
    ("fit", KNeighborsRegressor(n_neighbors=30, metric="euclidean")),
])
# Keep the individual steps available under their previous names.
scaler = pipeline.named_steps["scaler"]
model = pipeline.named_steps["fit"]

In [154]:
# Mean 5-fold MSE of the combined model; the scorer returns negated MSE, so
# the sign is flipped back before averaging.
fold_scores = cross_val_score(pipeline,
                              word_x_train,
                              word_training["agg_rating"],
                              cv=5,
                              scoring="neg_mean_squared_error")
np.mean(-fold_scores)

0.043149746633677474