In [111]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation, neighbors, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
# Folder containing the raw input CSV files read by the pd.read_csv calls below.
DIR_DATA = "data"
# Folder name for processed artefacts; not referenced in the cells visible here.
DIR_PROCESSED = "processed"
# Name of the target column.
LABEL = "rating"
# Columns that must not be used as model features (currently only the label itself).
NON_PREDICTORS = [LABEL]#, "name","anime_id"]
# NOTE(review): presumably the number of cross-validation folds; not used in the visible cells - confirm.
CV_FOLDS = 5


In [53]:
# Anime.csv

# anime_id - myanimelist.net's unique id identifying an anime.
# name - full name of anime.
# genre - comma separated list of genres for this anime.
# type - movie, TV, OVA, etc.
# rating - average rating out of 10 for this anime.
# members - number of community members that are in this anime's "group".


# Rating.csv

# user_id - non identifiable randomly generated user id.
# anime_id - the anime that this user has rated.
# rating - rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).


In [54]:
# Read the anime catalogue and the per-user ratings from DIR_DATA.
animes = pd.read_csv("{}/anime.csv".format(DIR_DATA))
ratings = pd.read_csv("{}/rating.csv".format(DIR_DATA))

label = animes["rating"]

# TODO: Turn type, genre into class
# TODO: Get things with similar names classified together

# Keep only explicit ratings (-1 means "watched but not rated") and work on
# the first 1000 of them.
labeled_ratings = ratings.loc[ratings["rating"] != -1].head(1000)


In [55]:
# Preview the first rows of the anime catalogue.
animes.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [56]:
# Preview the last rows of the user ratings table.
ratings.tail()

Unnamed: 0,user_id,anime_id,rating
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9
7813736,73516,8074,9


In [57]:
# TODO: Get user_id, previous rankings

In [58]:
def predict_naive(ratings, animes):
  """Predict every explicit user rating as the anime's rounded average rating.

  Rows of ``ratings`` whose ``rating`` is -1 (watched but not rated) are
  skipped, so the result lines up with ``ratings[ratings["rating"] != -1]``.

  Args:
    ratings: DataFrame with at least ``anime_id`` and ``rating`` columns.
    animes: DataFrame with ``anime_id`` and ``rating`` (average score) columns.

  Returns:
    A list of ints, one per explicitly rated row, in the row order of ``ratings``.

  Raises:
    ValueError: if a rated anime is missing from ``animes`` or its average
      rating is NaN (the value cannot be rounded to an integer).
  """
  # Build the id -> average-rating lookup once instead of scanning the whole
  # anime table for every single rating row.
  mean_by_id = animes.set_index("anime_id")["rating"]
  rated_ids = ratings.loc[ratings["rating"] != -1, "anime_id"]
  # tolist() yields plain Python floats, so round() returns plain ints.
  return [round(score) for score in rated_ids.map(mean_by_id).tolist()]

In [59]:
# Naive baseline: predict each labeled rating as the anime's rounded catalogue rating.
predictions = predict_naive(labeled_ratings,animes)

In [60]:
# Spot check: look up the catalogue rating of a single anime (id 8074).
animes[animes.anime_id == 8074].rating

1709    7.46
Name: rating, dtype: float64

In [61]:
# Mean squared error of the naive baseline against the users' actual ratings.
print(mean_squared_error(y_true=labeled_ratings["rating"].tolist(), y_pred=predictions))


7.518


In [62]:
# Store the show type (Movie, TV, OVA, ...) as a pandas categorical.
animes["type"] = animes["type"].astype("category")


In [63]:
# Split the comma separated genre string into a list of (unstripped) genre names.
# The value is passed through str() first, so a missing genre becomes ['nan'].
animes["genre_list"] = animes["genre"].map(lambda genre: str(genre).split(","))

In [64]:
# One-hot encode the show type and replace the original `type` column with the
# indicator columns. `axis` is passed by keyword because recent pandas releases
# no longer accept it positionally in DataFrame.drop.
# NOTE(review): the type dummy "Music" shares its name with the "Music" genre
# column added further down, so the final frame has two "Music" columns.
animes = pd.concat([animes, pd.get_dummies(animes["type"])], axis=1).drop(["type"], axis=1)

In [65]:
# pd.get_dummies(animes.genre_list)

In [66]:
# Collect every distinct genre name (whitespace stripped) across all animes.
g_list = animes["genre_list"].tolist()
g_set = {genre.strip() for genres in g_list for genre in genres}



In [67]:
# One-hot lookup table with one row and one column per genre.
# The genres are sorted before encoding: get_dummies emits its columns in sorted
# order, while its rows follow the input order. Feeding it list(g_set) (arbitrary
# set order) makes row i and column i refer to different genres, so the
# positional vectors built by hot_label_sum would later be labelled with the
# wrong genre names when hot_labels_genres.keys() is used as the column header.
hot_labels_genres = pd.get_dummies(sorted(g_set))


def hot_label_sum(genres, hot_labels_genres=hot_labels_genres):
    """Return the multi-hot vector for a list of genre names.

    Args:
        genres: iterable of genre names; surrounding whitespace is ignored.
        hot_labels_genres: one-hot table whose column ``g`` is the indicator
            vector of genre ``g`` (defaults to the module-level table).

    Returns:
        A float numpy array with one entry per row of ``hot_labels_genres``,
        holding the sum of the indicator vectors of the given genres.

    Raises:
        KeyError: if a genre is not a column of ``hot_labels_genres``.
    """
    # Size the vector from the table itself rather than from a hard-coded genre.
    hot_labels = np.zeros(len(hot_labels_genres))
    for g in genres:
        hot_labels += hot_labels_genres[g.strip()].values
    return hot_labels


In [68]:
# Encode each anime's genre list as a single multi-hot vector.
animes["genre_hot_labels"] = animes["genre_list"].apply(hot_label_sum)

In [69]:
# Expand the per-anime multi-hot vectors into one column per genre.
# Reusing animes.index keeps the rows aligned with `animes` when the two frames
# are concatenated column-wise, even if `animes` does not have a default index.
g_df = pd.DataFrame(
    animes["genre_hot_labels"].tolist(),
    columns=hot_labels_genres.keys(),
    index=animes.index,
)

In [70]:
# Attach the genre indicator columns and drop the intermediate genre columns.
# `axis` is passed by keyword because recent pandas releases no longer accept
# it positionally in DataFrame.drop.
animes = pd.concat([animes, g_df], axis=1).drop(
    ["genre", "genre_list", "genre_hot_labels"], axis=1
)

In [105]:
# Inspect the column dtypes after encoding. In the recorded output `episodes` is
# still object (strings) and "Music" appears twice (type dummy and genre column).
animes.dtypes

anime_id           int64
name              object
episodes          object
rating           float64
members            int64
Movie              uint8
Music              uint8
ONA                uint8
OVA                uint8
Special            uint8
TV                 uint8
Action           float64
Adventure        float64
Cars             float64
Comedy           float64
Dementia         float64
Demons           float64
Drama            float64
Ecchi            float64
Fantasy          float64
Game             float64
Harem            float64
Hentai           float64
Historical       float64
Horror           float64
Josei            float64
Kids             float64
Magic            float64
Martial Arts     float64
Mecha            float64
Military         float64
Music            float64
Mystery          float64
Parody           float64
Police           float64
Psychological    float64
Romance          float64
Samurai          float64
School           float64
Sci-Fi           float64


In [118]:
# TODO: Episodes
# Columns must be created with item assignment / assign(): `animes.episode = ...`
# and `animes.int_rating = ...` only set Python attributes on the DataFrame
# object and never add a column, so later column lookups such as
# drop([... "int_rating"]) cannot find them.

# Numeric episode count; "Unknown" is encoded as -1.
animes["episode"] = animes["episodes"].apply(lambda x: int(x) if x.strip() != "Unknown" else -1)

# Remove rows with any missing value (e.g. animes without an average rating).
animes = animes.dropna()

# Classification target: the average rating rounded to the nearest integer.
animes = animes.assign(int_rating=animes["rating"].apply(lambda x: round(x)))

In [125]:
# Predict ratings within animes
def predict_on_animes_int_rating(animes, clf, test_size=0.2, random_state=None):
    """Fit ``clf`` to predict the integer rating of an anime and score it.

    Args:
        animes: DataFrame holding the feature columns plus ``anime_id``,
            ``name``, ``rating`` and ``episodes``. If it has an ``int_rating``
            column that column is the target; otherwise the target is
            ``rating`` rounded to the nearest integer.
        clf: estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        test_size: share of the rows held out for scoring.
        random_state: optional seed forwarded to ``train_test_split`` so the
            split can be reproduced; ``None`` keeps the split random.

    Returns:
        Whatever ``clf.score`` returns on the held-out rows.
    """
    non_features = ['anime_id', 'name', 'rating', 'episodes']
    if 'int_rating' in animes.columns:
        y = animes['int_rating']
        non_features.append('int_rating')
    else:
        # Derive the target on the fly when the column has not been materialised.
        y = animes['rating'].round().astype(int)
    # `axis` by keyword: recent pandas no longer accepts it positionally.
    X = animes.drop(non_features, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)

In [132]:
# Repeat the experiment five times, each with its own random train/test split.
# A comprehension creates five separate estimators; `[RandomForestClassifier()]*5`
# would put the same instance in the list five times.
classifiers = [RandomForestClassifier() for _ in range(5)]
for clf in classifiers:
    print(predict_on_animes_int_rating(animes, clf))

0.517612929963
0.538334024036
0.54206382097
0.531703273933
0.529216742644
