In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import nltk
import string

import re

pd.options.display.max_rows = 5
import nltk
from nltk.corpus import stopwords

In [35]:
def random_cv(df,model):   
    vec = DictVectorizer(sparse=False)
    vec.fit(df[model].to_dict(orient="records"))
    train_features = vec.transform(df[model].to_dict(orient="records"))
    train_labels = df["ibu"]
    scaler = QuantileTransformer()
    scaler.fit(train_features)
    train_features = scaler.transform(train_features)    

    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

    max_features = ['auto', 'log2','sqrt']

    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)

    min_samples_split = [2, 5, 10]

    min_samples_leaf = [1, 2, 4]

    bootstrap = [True, False]

    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

    rf = RandomForestRegressor()
    rf_random = RandomizedSearchCV(estimator = rf,
                                   param_distributions = random_grid,
                                   n_iter = 100,
                                   cv = 3,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs = -1)
    
    rf_random.fit(train_features,train_labels)
    return rf_random.best_params_

def gridSearch(param_grid,model):
    vec = DictVectorizer(sparse=False)
    vec.fit(df[model].to_dict(orient="records"))
    train_features = vec.transform(df[model].to_dict(orient="records"))
    train_labels = df["ibu"]
    scaler = QuantileTransformer()
    scaler.fit(train_features)
    train_features = scaler.transform(train_features)

    rf = RandomForestRegressor()

    grid_search = GridSearchCV(estimator = rf, 
                               param_grid = param_grid, 
                               cv = 5, 
                               n_jobs = -1, 
                               verbose = 2)
    grid_search.fit(train_features, train_labels)
    return grid_search.best_params_

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def clean_data(df, *argv):
    df1 = df.copy()
    for series in argv:
        df1[series] = df1[series].str.lower()
        df1[series] = df1[series].str.replace("\n","")
        df1[series] = df1[series].str.replace("\r","")
        df1[series] = df1[series].str.replace("-"," ")
        df1[series] = df1[series].str.replace("[^\w\s]","")
        df1[series] = df1[series].str.strip()
    df1 = df1.fillna("None")
    df1 = df1.replace("", "missing")
    return df1

def explode(df, lst_cols, fill_value=''):
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        # at least one list in cells is empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]

In [36]:
home = pd.read_csv("crunchy_home.csv")

home["similar"] = home["similar"].apply(lambda x: x.split("::"))
home["tags"] = home["tags"].apply(lambda x: x.split("::"))
home["agg_rating"] = ((home["1"] + home["2"] * 2 + home["3"] * 3 + home["4"] * 4 + home["5"] * 5)/
                        home[["1","2","3","4","5"]].sum(axis = 1))

home["name"] = home["name"].str.replace("-"," ").str.lower()

reviews = pd.read_csv("crunchy_review.csv")

main = pd.read_csv("crunchy_main.csv")

main["similar"] = main["similar"].apply(lambda x: x.split("::"))
main["tags"] = main["tags"].apply(lambda x: x.split("::"))
main["name"] = main["name"].str.replace("-"," ").str.lower()
main["agg_rating"] = ((main["1"] + main["2"] * 2 + main["3"] * 3 + main["4"] * 4 + main["5"] * 5)/
                        main[["1","2","3","4","5"]].sum(axis = 1))

In [37]:
animelist = clean_data(pd.read_csv("/Volumes/SD_Card/myanimelist/AnimeList.csv"),"title", "title_english")
animelist = animelist[animelist["type"] == "TV"]

animelist["from"] = animelist["aired"].apply(lambda x : dict(eval(x))).apply(pd.Series)["from"]
animelist["to"] = animelist["aired"].apply(lambda x : dict(eval(x))).apply(pd.Series)["to"]

merge_anime  = main.merge(animelist[["title_english",
                                     "title","rating",
                                     "duration","from",
                                     "to","genre"]],how = "inner", left_on="name",
                          right_on="title_english")

In [124]:
merge_anime = merge_anime[merge_anime["duration"] != "Unknown"]

In [125]:
merge_anime_sample = merge_anime.sample(frac = 0.10)

In [126]:
training_data = explode(merge_anime_sample,
                        "similar").drop(["1","2","3","4","5","agg_review"],axis = 1)

In [127]:
training_data["datetime"] = training_data["datetime"]. \
                            str.replace("\n", ""). \
                            str.strip()

training_data["datetime"] = pd.to_datetime(training_data['datetime'],
                                           format='%b %d, %Y')

training_data["datetime"] = (pd.to_datetime(training_data['datetime']).
                                             apply(lambda date: date.toordinal()))

In [130]:
training_data["duration"] = training_data["duration"].str.split(" ").apply(lambda x : x[0]).astype("float64")

In [146]:
training_data["tags"] = training_data["tags"].apply(str)

In [148]:
training_data = clean_data(training_data, "desc", "review", "summary","tags")

## Text Data

### Stepwise Analysis

In [149]:
word_training = training_data[["similar",
                               "desc",
                               "tags",
                               "review",
                               "summary",
                               "genre",
                               "agg_rating"]]

In [166]:
def stepwise(column):
    vec = TfidfVectorizer()
    scaler = Normalizer()
    model = KNeighborsRegressor(n_neighbors=30,metric="euclidean")
    pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])
    
    return (np.mean(-cross_val_score(pipeline, word_x_train[column], word_training["agg_rating"], 
                    cv=5,scoring = "neg_mean_squared_error")))

In [167]:
columns = word_x_train.columns
columns = pd.DataFrame(columns).rename(index =str, columns = {0:"columns"})

columns["MSE"] = columns["columns"].apply(stepwise)

In [171]:
columns.sort_values("MSE").head()

Unnamed: 0,columns,MSE
1,desc,0.055498
2,tags,0.056205
5,genre,0.056614
0,similar,0.210392
3,review,0.247045


__Conclusion:__

We can see that Description, Tags, Genre give the best performance, while review and similar are not.

### Feature Union

We want to include more than just one text column for our model, to do this we will need to apply a seperate TF-IDF vectorizer to each column. To be able to do this we need to use a Feature Union.

In [177]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline


transformer = FeatureUnion([
                ('similar_tfidf', 
                  Pipeline([('extract_field',
                              FunctionTransformer(lambda x: x['similar'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('desc_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['desc'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('tags_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['tags'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('review_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['review'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('summary_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['summary'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('genre_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['genre'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())]))])

In [178]:
word_x_train = word_training[["similar","desc","tags","review","summary","genre"]]

scaler = Normalizer()
model = KNeighborsRegressor(n_neighbors=30,metric="euclidean")
pipeline = Pipeline([("vectorizer",transformer),("scaler",scaler),("fit",model)])

np.mean(-cross_val_score(pipeline, word_x_train, word_training["agg_rating"], 
                cv=5,scoring = "neg_mean_squared_error"))

0.043149746633677474

Using all our text columns gave us a MSE of $0.043$. That is better than any one of single text columns MSE's.

However we say that some text columns were lacking, specifically "similar" and "reviews". Let's try dropping them and seeing if our model improves in performance 

In [179]:
transformer = FeatureUnion([
                ('desc_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['desc'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('tags_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['tags'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('genre_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['genre'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())]))])

In [180]:
word_x_train = word_training[["desc","tags","genre"]]

scaler = Normalizer()
model = KNeighborsRegressor(n_neighbors=30,metric="euclidean")
pipeline = Pipeline([("vectorizer",transformer),("scaler",scaler),("fit",model)])

np.mean(-cross_val_score(pipeline, word_x_train, word_training["agg_rating"], 
                cv=5,scoring = "neg_mean_squared_error"))

0.04511762190509126

It looks like that dropping those two columns did not help our MSE go down, rather it went up to an MSE of about $0.04511$

### Evaluating different $K$ values

Let's go back to our first FeatureUnion model since we know it worked better

In [181]:
transformer = FeatureUnion([
                ('similar_tfidf', 
                  Pipeline([('extract_field',
                              FunctionTransformer(lambda x: x['similar'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('desc_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['desc'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('tags_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['tags'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('review_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['review'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('summary_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['summary'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('genre_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['genre'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())]))])

In [182]:
word_x_train = word_training[["similar","desc","tags","review","summary","genre"]]

scaler = Normalizer()
model = KNeighborsRegressor(n_neighbors=30,metric="euclidean")
pipeline = Pipeline([("vectorizer",transformer),("scaler",scaler),("fit",model)])

Let's evaluate what amount of neighbors for the KNeighborsRegressor is optimal. Do better organize this let's create a dataframe of $K$ values we want to test.

We will increment in 10's in order to save runtime, and focus in on the mininium that should appear.

In [193]:
ks = pd.Series(range(5,65,10))
ks.index = range(5,65,10)
ks = pd.DataFrame(ks).rename(index= str, columns = {0:"K"})

In [194]:
def test_k_vals(k):
    scaler = Normalizer()
    model = KNeighborsRegressor(n_neighbors=k,metric="euclidean")
    pipeline = Pipeline([("vectorizer",transformer),("scaler",scaler),("fit",model)])

    return np.mean(-cross_val_score(pipeline, word_x_train, word_training["agg_rating"], 
                cv=8,scoring = "neg_mean_squared_error"))

In [None]:
ks["MSE"] = ks["K"].apply(test_k_vals)

In [None]:
ks