In [38]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation, neighbors, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
DIR_DATA = "data"
DIR_PROCESSED = "processed"
LABEL = "rating"
NON_PREDICTORS = [LABEL]#, "name","anime_id"]
CV_FOLDS = 5


In [39]:
# Anime.csv

# anime_id - myanimelist.net's unique id identifying an anime.
# name - full name of anime.
# genre - comma separated list of genres for this anime.
# type - movie, TV, OVA, etc.
# rating - average rating out of 10 for this anime.
# members - number of community members that are in this anime's "group".


# Rating.csv

# user_id - non identifiable randomly generated user id.
# anime_id - the anime that this user has rated.
# rating - rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).


In [40]:
animes = pd.read_csv(DIR_DATA + '/anime.csv')
ratings = pd.read_csv(DIR_DATA + '/rating.csv')

label = animes.rating

# TODO: Turn type, genre into class
# TODO: Get things with similar names classified together

labeled_ratings = ratings[ratings["rating"] != -1][:1000]


In [41]:
animes.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [42]:
ratings.tail()

Unnamed: 0,user_id,anime_id,rating
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9
7813736,73516,8074,9


In [43]:
# TODO: Get user_id, previous rankings

In [44]:
def predict_naive(ratings, animes):
  prediction = []
  for anime_id in ratings[ratings["rating"] != -1].anime_id:
    prediction.append(round(animes[animes.anime_id == anime_id].rating.item()))
  return prediction

In [45]:
predictions = predict_naive(labeled_ratings,animes)

In [46]:
animes[animes.anime_id == 8074].rating

1709    7.46
Name: rating, dtype: float64

In [47]:
print(mean_squared_error(predictions, labeled_ratings.rating.tolist()))


7.518


In [48]:
animes.type = animes.type.astype('category')


In [49]:
animes["genre_list"] = animes["genre"].apply(lambda x: str(x).split(','))

In [75]:
# enc = preprocessing.LabelBinarizer()
# # print(animes.type.cat.codes)
# enc.fit_transform(animes.type.cat.codes)

animes = pd.concat([animes,pd.get_dummies(animes.type)], axis=1).drop(["type"],1)

In [77]:
pd.get_dummies(animes.genre_list)

TypeError: unhashable type: 'list'

In [13]:
g_list = animes.genre_list.tolist()
g_set = set()
for g in g_list:
    for x in g:
        g_set.add(x.strip())
#         print(type(x))



In [15]:
hot_labels_genres = pd.get_dummies(list(g_set))
def hot_label_sum(genres, hot_labels_genres = hot_labels_genres):
    hot_labels = np.zeros(len(hot_labels_genres["Action"]))
    for g in genres:
        hot_labels += hot_labels_genres[g.strip()]
    return hot_labels


In [16]:
animes["genre_hot_labels"] = animes.genre_list.apply(lambda x: hot_label_sum(x))

In [17]:
print(hot_labels_genres.keys())

Index(['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama',
       'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror',
       'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music',
       'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai',
       'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen',
       'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power',
       'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri', 'nan'],
      dtype='object')


In [120]:
list(g_set)

['Thriller',
 'Psychological',
 'Harem',
 'Fantasy',
 'Horror',
 'Shounen',
 'Ecchi',
 'Shoujo',
 'Demons',
 'Yaoi',
 'Romance',
 'Police',
 'Shoujo Ai',
 'Kids',
 'School',
 'Parody',
 'Mecha',
 'Yuri',
 'Military',
 'Samurai',
 'Cars',
 'Adventure',
 'Mystery',
 'Game',
 'Magic',
 'Drama',
 'Sports',
 'Action',
 'Dementia',
 'Comedy',
 'Sci-Fi',
 'Hentai',
 'nan',
 'Shounen Ai',
 'Josei',
 'Supernatural',
 'Space',
 'Music',
 'Martial Arts',
 'Historical',
 'Seinen',
 'Slice of Life',
 'Vampire',
 'Super Power']

In [21]:
animes = animes.drop(["genre","genre_list"],1)
animes.head()

Unnamed: 0,anime_id,name,type,episodes,rating,members,genre_hot_labels
0,32281,Kimi no Na wa.,Movie,1,9.37,200630,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,5114,Fullmetal Alchemist: Brotherhood,TV,64,9.26,793665,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, ..."
2,28977,Gintama°,TV,51,9.25,114262,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
3,9253,Steins;Gate,TV,24,9.17,673572,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,9969,Gintama&#039;,TV,51,9.16,151266,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [34]:
# Predict ratings within animes
def predict_on_animes(animes, test_size = 0.2):
    X = animes.drop(['anime_id','name','rating'],1)
    y = animes["rating"]
    X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=test_size)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)

In [35]:
print(predict_on_animes(animes))

ValueError: setting an array element with a sequence.