In [1]:
from pathlib import Path

import pandas as pd

### Load dataset

In [5]:
# Load the two raw tables from the local archive/ folder.
anime_data = pd.read_csv(filepath_or_buffer='archive/anime.csv')
# low_memory=False: parse the large ratings file in one pass instead of in
# chunks, trading higher peak memory for consistent column type inference.
user_data = pd.read_csv(
    filepath_or_buffer='archive/rating_complete.csv',
    low_memory=False,
)

In [6]:
# Print the (rows, columns) tuple, then show the first five rows as the cell output.
print(anime_data.shape)
anime_data.head(n=5)

(17562, 35)


Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [7]:
# Print the (rows, columns) tuple, then show the first five rows as the cell output.
print(user_data.shape)
user_data.head(n=5)

(57633278, 3)


Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


### Data rearrangement

In [8]:
# Re-index anime: give every distinct MAL_ID a dense integer code
# (0..n-1, in order of first appearance) and keep both lookup directions.
anime_ids = anime_data['MAL_ID'].unique().tolist()
anime2_encoded = {x: i for i, x in enumerate(anime_ids)}
encoded2_anime = dict(enumerate(anime_ids))
anime_data["anime"] = anime_data["MAL_ID"].map(anime2_encoded)

# Re-index users the same way, keyed on user_id.
user_ids = user_data["user_id"].unique().tolist()
user2_encoded = {x: i for i, x in enumerate(user_ids)}
encoded2_user = dict(enumerate(user_ids))
user_data["user"] = user_data["user_id"].map(user2_encoded)

# Attach the dense anime code to every rating row.
# Only the join key and the code are taken from anime_data, so the wide
# metadata columns are never copied across the (very long) ratings table.
# Restricting the left side to its four base columns also lets this cell be
# re-run without the 'anime' column colliding on both sides of the merge.
# how='inner' drops any rating whose anime_id has no row in anime_data.
user_data = user_data[['user_id', 'anime_id', 'rating', 'user']].merge(
    anime_data[['MAL_ID', 'anime']],
    left_on="anime_id",
    right_on="MAL_ID",
    how='inner',
)
user_data = user_data[['user_id', 'anime_id', 'rating', 'user', 'anime']]
user_data.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
0,0,430,9,0,403
1,6,430,8,6,403
2,18,430,10,17,403
3,19,430,8,18,403
4,33,430,4,31,403


In [10]:
# Report the size of the full data set: distinct users, distinct anime, rating rows.
for _label, _items in (
    ("The number of users:", user_ids),
    ("The number of animes:", anime_ids),
    ("The number of rated blocks:", user_data),
):
    print(_label, len(_items))

The number of users: 310059
The number of animes: 17562
The number of rated blocks: 57633278


In [22]:
# Order the ratings by encoded user, then by encoded anime within each user
# (ascending on both keys, which is the sort_values default).
user_data = user_data.sort_values(by=['user', 'anime'])
user_data.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
542393,0,68,6,0,49
270748,0,121,9,0,100
586737,0,164,8,0,142
825298,0,169,7,0,147
973222,0,174,4,0,151


In [23]:
# Interaction table for all ratings: one (user, anime) row per rating,
# with a constant click flag of 1 appended as the last column.
user_data_1 = user_data[['user', 'anime']].assign(click=1)
user_data_1.head()

Unnamed: 0,user,anime,click
542393,0,49,1
270748,0,100,1
586737,0,142,1
825298,0,147,1
973222,0,151,1


In [24]:
# Density = interaction rows divided by the size of the full user x anime grid.
density1 = user_data_1.shape[0] / (len(user_ids) * len(anime_ids))
print("Density of original data:", density1)

Density of original data: 0.010584126132491855


In [25]:
# Interaction table restricted to high ratings (rating >= 7, i.e. 7 is included):
# one (user, anime) row per kept rating, with a constant click flag of 1.
user_data_2 = user_data.loc[user_data['rating'] >= 7, ['user', 'anime']].assign(click=1)
user_data_2.head()

Unnamed: 0,user,anime,click
270748,0,100,1
586737,0,142,1
825298,0,147,1
845691,0,176,1
368377,0,332,1


In [9]:
# Down-sample: keep the leading 1/20 of the high-rating rows, selected by position.
# NOTE(review): the cut is positional, so the last user kept may retain only
# part of their rows — confirm this is acceptable.
# NOTE(review): this rebinds user_data to the reduced click table; earlier cells
# that expect the full rating table need a fresh run from the load step.
m = len(user_data_2) // 20
print(m)
user_data = user_data_2.head(m)

1480667


In [3]:
# Recompute the id lists from the reduced table (encoded ids, in order of
# first appearance) and report the new sizes.
user_ids = user_data["user"].unique().tolist()
anime_ids = user_data["anime"].unique().tolist()
for _label, _items in (
    ("The number of users:", user_ids),
    ("The number of animes:", anime_ids),
    ("The number of rated blocks:", user_data),
):
    print(_label, len(_items))

The number of users: 10371
The number of animes: 11503
The number of rated blocks: 1480667


In [10]:
# Density of the reduced table: rows divided by (distinct users x distinct anime).
density2 = user_data.shape[0] / (len(user_ids) * len(anime_ids))
print("Density of final data:", density2)

Density of final data: 0.012411539198189993


### Split and save

In [None]:
# Split each user's interactions 80/20 by position: the first int(0.8 * n) rows
# of a user go to train and the remainder to test. A user with a single row
# therefore appears in test only.
#
# The split is done in one grouped pass. The previous per-user loop filtered the
# whole frame once per user and grew the outputs with DataFrame.append, which is
# quadratic and no longer exists in pandas >= 2.0.
interactions = user_data[['user', 'anime', 'click']]
per_user = interactions.groupby('user', sort=False)
row_position = per_user.cumcount()  # 0-based position of each row within its user
# Same arithmetic as int(len(rows) * 0.8), broadcast to every row of the user.
train_quota = (per_user['user'].transform('size') * 0.8).astype(int)
in_train = row_position < train_quota

# Rows keep the order they have in user_data.
train = interactions[in_train].reset_index(drop=True)
test = interactions[~in_train].reset_index(drop=True)
assert len(train) + len(test) == len(interactions)

# Write space-separated "user anime click" lines with no header and no index.
# The output folder is created first so to_csv does not fail on a fresh checkout.
Path("./anime").mkdir(parents=True, exist_ok=True)
train.to_csv("./anime/train.txt", header=False, index=False, sep=' ', mode='w')
test.to_csv("./anime/test.txt", header=False, index=False, sep=' ', mode='w')