In [17]:
import pandas as pd
import numpy as np
# import xgboost as xgb
from sklearn import preprocessing, cross_validation, neighbors, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, \
  RandomForestRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

import pickle
DIR_DATA = "data"
DIR_PROCESSED = "processed"
LABEL = "rating"
NON_PREDICTORS = [LABEL]#, "name","anime_id"]
CV_FOLDS = 5


In [18]:
# Anime.csv

# anime_id - myanimelist.net's unique id identifying an anime.
# name - full name of anime.
# genre - comma separated list of genres for this anime.
# type - movie, TV, OVA, etc.
# rating - average rating out of 10 for this anime.
# members - number of community members that are in this anime's "group".


# Rating.csv

# user_id - non identifiable randomly generated user id.
# anime_id - the anime that this user has rated.
# rating - rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).


In [19]:
animes = pd.read_csv(DIR_DATA + '/anime.csv')
ratings = pd.read_csv(DIR_DATA + '/rating.csv')

label = animes.rating
labeled_ratings = ratings[ratings.rating != -1][:1000]

# TODO: Get things with similar names classified together


In [20]:
animes.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [21]:
ratings.tail()

Unnamed: 0,user_id,anime_id,rating
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9
7813736,73516,8074,9


In [22]:
# TODO: Get user_id, previous rankings

In [23]:
def predict_naive(ratings, animes):
  prediction = []
  for anime_id in ratings.anime_id:
    prediction.append(round(animes[animes.anime_id == anime_id].rating.item()))
  return prediction

In [28]:
predictions = predict_naive(ratings,animes)

ValueError: can only convert an array of size 1 to a Python scalar

In [25]:
animes[animes.anime_id == 8074].rating

1709    7.46
Name: rating, dtype: float64

In [None]:
print(mean_absolute_error(predictions, ratings.rating.tolist()))


In [11]:
animes.type = animes.type.astype('category')


In [12]:
animes["genre_list"] = animes["genre"].apply(lambda x: str(x).split(','))

In [13]:
# enc = preprocessing.LabelBinarizer()
# # print(animes.type.cat.codes)
# enc.fit_transform(animes.type.cat.codes)

animes = pd.concat([animes,pd.get_dummies(animes.type)], axis=1).drop(["type"],1)

In [14]:
# pd.get_dummies(animes.genre_list)

In [15]:
g_list = animes.genre_list.tolist()
g_set = set()
for g in g_list:
    for x in g:
      g_str = x.strip()
      if g_str == "Music":
        # Music is also a type, avoid two columns with same name
        g_str = "Music_Genre"
      g_set.add(g_str)
#         print(type(x))



In [16]:
print(sorted(list(g_set)))

['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music_Genre', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen', 'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri', 'nan']


In [17]:
hot_labels_genres = pd.get_dummies(list(g_set))
def hot_label_sum(genres, hot_labels_genres = hot_labels_genres):
    hot_labels = np.zeros(len(hot_labels_genres["Action"]))
    
    for g in genres:
        g_str = g.strip()
        if g_str == "Music":
          # Music is also a type, avoid two columns with same name
          g_str = "Music_Genre"
        hot_labels += hot_labels_genres[g_str]
    return hot_labels


In [18]:
animes["genre_hot_labels"] = animes.genre_list.apply(lambda x: hot_label_sum(x))

In [19]:
g_df = pd.DataFrame(animes.genre_hot_labels.tolist(),columns=hot_labels_genres.keys())

In [20]:
animes = pd.concat([animes,g_df], axis=1).drop(["genre","genre_list", "genre_hot_labels"],1)

In [21]:
animes.dtypes

anime_id           int64
name              object
episodes          object
rating           float64
members            int64
Movie              uint8
Music              uint8
ONA                uint8
OVA                uint8
Special            uint8
TV                 uint8
Action           float64
Adventure        float64
Cars             float64
Comedy           float64
Dementia         float64
Demons           float64
Drama            float64
Ecchi            float64
Fantasy          float64
Game             float64
Harem            float64
Hentai           float64
Historical       float64
Horror           float64
Josei            float64
Kids             float64
Magic            float64
Martial Arts     float64
Mecha            float64
Military         float64
Music_Genre      float64
Mystery          float64
Parody           float64
Police           float64
Psychological    float64
Romance          float64
Samurai          float64
School           float64
Sci-Fi           float64


In [22]:
# TODO: Episodes
animes.episode = animes.episodes.apply(lambda x: int(x) if x.strip() != "Unknown" else -1)

animes = animes.dropna()

animes["int_rating"] = animes.rating.apply(lambda x: round(x))

In [33]:
# Predict ratings within animes
def predict_on_animes_int_rating(animes,clf, test_size = 0.2):
    X = animes.drop(['anime_id','name','rating',"episodes", "int_rating"],1)
    y = animes.int_rating  
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    clf.fit(X_train, y_train)
    print('Training Error: {:.3f}'.format(1 - clf.score(X_train, 
                                                          y_train)))
    print('Validation Error: {:.3f}'.format(1 - clf.score(X_test, 
                                                            y_test)))

In [34]:
classifiers = [RandomForestRegressor(), GradientBoostingRegressor(), AdaBoostRegressor(),
              xgb.XGBRegressor()]
classifiers = [RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier(),
              xgb.XGBClassifier()]
for clf in classifiers:
    predict_on_animes_int_rating(animes,clf)

Training Error: 0.042
Validation Error: 0.466
Training Error: 0.368
Validation Error: 0.431
Training Error: 0.523
Validation Error: 0.520
Training Error: 0.408
Validation Error: 0.433


In [29]:
clf = xgb.XGBRegressor()
X = animes.drop(['anime_id','name','rating',"episodes", "int_rating"],1)
y = animes.rating
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
clf.fit(X_train, y_train)
print('Training Error: {:.3f}'.format(1 - clf.score(X_train, 
                                                          y_train)))
print('Validation Error: {:.3f}'.format(1 - clf.score(X_test, 
                                                            y_test)))


Training Error: 0.408
Validation Error: 0.422


In [26]:
print(len(list(animes.columns)), len(set(list(animes.columns))))
a_list = list(animes.columns)
for i in set(list(animes.columns)):1
  a_list.remove(i)
print(a_list)
     
    

56 56
[]


In [27]:
clf = xgb.XGBRegressor()
%timeit clf.fit(X_train, y_train)

1 loop, best of 3: 153 ms per loop


In [28]:
%timeit GradientBoostingRegressor().fit(X_train, y_train)

1 loop, best of 3: 760 ms per loop


In [36]:
animes.head()

Unnamed: 0,anime_id,name,episodes,rating,members,Movie,Music,ONA,OVA,Special,...,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri,nan,int_rating
0,32281,Kimi no Na wa.,1,9.37,200630,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
1,5114,Fullmetal Alchemist: Brotherhood,64,9.26,793665,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2,28977,Gintama°,51,9.25,114262,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9
3,9253,Steins;Gate,24,9.17,673572,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
4,9969,Gintama&#039;,51,9.16,151266,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9


In [38]:
with open(DIR_PROCESSED + '/one_hot_encoded_anime.pickle','wb') as a_file:
  pickle.dump(animes, a_file)

TypeError: file must have a 'write' attribute

In [13]:
x = list(range(100))
y = np.arange(100)
def std_lib(x):
  return sum([i ** 2 for i in x])

def np_lib(x):
  return np.dot(x,x)





In [14]:
%timeit np_lib(y)

The slowest run took 18.46 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 716 ns per loop


In [15]:
%timeit std_lib(x)

10000 loops, best of 3: 22 µs per loop


In [11]:
print(np_lib(y), std_lib(x))

332833500 332833500


In [16]:
np_lib(y)

328350

In [17]:
std_lib(x)

328350

In [36]:
x = list(range(5))

In [35]:
print(x)

[0, 1, 2, 3, 4]


In [38]:
arr = [i**2 for i in x if i != 3]

[0, 1, 4, 16]

In [42]:
arr = []
for i in x:
    if i != 3:
        arr.append(i**2)
print(arr)

[0, 1, 4, 16]
