<h1>Explore anime dataset from Kaggle<h1>

57M ratings on over 17k anime

In [134]:
import numpy as np
import pandas as pd
import pickle

In [55]:
df_items = pd.read_csv("/home/diego/chat-reranking/dataset/anime/anime.csv")
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

In [40]:
df_ratings = pd.read_csv("/home/diego/chat-reranking/dataset/anime/rating_complete.csv")
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57633278 entries, 0 to 57633277
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 1.3 GB


<h3>Filtering the dataset<h3>

Items:
- Remove duplicates
- Remove items where "English name" is unknown
- Remove recent items (older than 2020). This is because language models aren't updated with fresh info (chatgpt up to Sept 2021)
- Remove items with unknown premiered

Ratings:
- Remove users with less than 70 and more than 300 ratings
- Remove users with less than 30 positive ratings
- Map ratings into 1-5 stars

In [56]:
df_items.sort_values(by="Popularity", ascending=True, inplace=True)
df_items.drop_duplicates(subset=["English name"], keep="first", inplace=True)
len(df_items)

6831

In [57]:
df_items = df_items[df_items["English name"] != "Unknown"]
len(df_items)

6830

In [58]:
df_items = df_items[df_items["Premiered"] != "Unknown"]
len(df_items)

2762

In [59]:
df_items["prem_year"] = df_items.apply(lambda x:int(x["Premiered"][-4:]), axis=1)
df_items.head(1)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,prem_year
1393,1535,Death Note,8.63,"Mystery, Police, Psychological, Supernatural, ...",Death Note,デスノート,TV,37,"Oct 4, 2006 to Jun 27, 2007",Fall 2006,...,535252.0,415890.0,201522.0,68577.0,28048.0,10462.0,3692.0,2256.0,3586.0,2006


In [60]:
df_items = df_items[df_items["prem_year"] < 2021]
len(df_items)

2711

In [61]:
df_items = df_items[["MAL_ID", "English name", "Popularity", "Genres", "prem_year"]]
df_items.columns

Index(['MAL_ID', 'English name', 'Popularity', 'Genres', 'prem_year'], dtype='object')

In [62]:
df_items = df_items.rename(columns={"MAL_ID": "item_id", "English name": "item_name", "Popularity": "pop_score"})
df_items.columns

Index(['item_id', 'item_name', 'pop_score', 'Genres', 'prem_year'], dtype='object')

In [63]:
df_items["genres"] = df_items.apply(lambda x: x["Genres"].split(", "), axis=1)
del df_items["Genres"]
df_items.head(1)

Unnamed: 0,item_id,item_name,pop_score,prem_year,genres
1393,1535,Death Note,1,2006,"[Mystery, Police, Psychological, Supernatural,..."


In [64]:
len(df_items["item_id"].unique())

2711

In [105]:
out_dir = "/home/diego/chat-reranking/experiments/anime/"

In [106]:
df_items.to_csv(f"{out_dir}df_items.csv", sep=",", index=False)

In [107]:
itemid_to_name = {}
for i, n in zip(df_items["item_id"].values, df_items["item_name"].values):
    itemid_to_name[i] = n
itemname_to_id = {v: k for k, v in itemid_to_name.items()}
with open(f"{out_dir}itemid_to_name.pkl", 'wb') as fp:
    pickle.dump(itemid_to_name, fp)
with open(f"{out_dir}itemname_to_id.pkl", 'wb') as fp:
    pickle.dump(itemname_to_id, fp)

In [108]:
# prepare genres file
out_string = ""
for i, row in df_items.iterrows():
    for genre in row["genres"]:
        if row['item_name'] in itemname_to_id:
            out_string += f"{itemname_to_id[row['item_name']]}\t{genre}\n"
        else:
            print(row)

with open(f"{out_dir}genres_file.txt", "w") as text_file:
    text_file.write(out_string)

In [65]:
genres = set()
for g in df_items["genres"].values:
    genres = genres.union(g)
print(len(genres))
genres

40


{'Action',
 'Adventure',
 'Cars',
 'Comedy',
 'Dementia',
 'Demons',
 'Drama',
 'Ecchi',
 'Fantasy',
 'Game',
 'Harem',
 'Historical',
 'Horror',
 'Josei',
 'Kids',
 'Magic',
 'Martial Arts',
 'Mecha',
 'Military',
 'Music',
 'Mystery',
 'Parody',
 'Police',
 'Psychological',
 'Romance',
 'Samurai',
 'School',
 'Sci-Fi',
 'Seinen',
 'Shoujo',
 'Shoujo Ai',
 'Shounen',
 'Shounen Ai',
 'Slice of Life',
 'Space',
 'Sports',
 'Super Power',
 'Supernatural',
 'Thriller',
 'Vampire'}

In [122]:
itemid_to_namegenres = {}
itemnamegenres_to_id = {}
for i, row in df_items.iterrows():
    item_id = row["item_id"]
    genres_str = ""
    item_name = row["item_name"]
    for g in row["genres"]:
        genres_str += f"{g}, "
    itemid_to_namegenres[item_id] = f"{item_name} ({genres_str[:-2]})"
    
    # to convert item names to id, the dict accepts both names only or names with genres
    itemnamegenres_to_id[f"{item_name} ({genres_str[:-2]})"] = item_id
    itemnamegenres_to_id[item_name] = item_id

In [123]:
out_dir = "/home/diego/chat-reranking/experiments/anime/"
with open(f"{out_dir}itemid_to_namegenres.pkl", 'wb') as fp:
    pickle.dump(itemid_to_namegenres, fp)
with open(f"{out_dir}itemnamegenres_to_id.pkl", 'wb') as fp:
    pickle.dump(itemnamegenres_to_id, fp)

In [125]:
with open('/home/diego/chat-reranking/experiments/anime/itemnamegenres_to_id.pkl', 'rb') as f:
    x = pickle.load(f)
x["Attack on Titan Season 3 (Action, Military, Mystery, Super Power, Drama, Fantasy, Shounen)"]

35760

In [112]:
df_ratings.rename(columns={"anime_id": "item_id"}, inplace=True)

In [56]:
df_ratings = df_ratings[df_ratings["item_id"].isin(df_items["item_id"].values)]
len(df_ratings)

35080540

In [57]:
upper_bound = 300
lower_bound = 70
to_keep = df_ratings['user_id'].value_counts()[lambda x: (x>lower_bound) & (x<upper_bound)].index.to_list()

In [58]:
df_ratings = df_ratings[df_ratings["user_id"].isin(to_keep)]
len(df_ratings)

20093374

In [59]:
ratings_mapper = {1: 1, 2:1, 3: 2, 4:2, 5: 3, 6:3, 7: 4, 8:1, 9: 5, 10:5}
df_ratings["rating"] = df_ratings.apply(lambda x:ratings_mapper[x["rating"]], axis=1)
df_ratings['rating'].value_counts()

5    6507912
1    5631614
4    4407961
3    2958053
2     587834
Name: rating, dtype: int64

In [60]:
df_ratings["is_pos"] = df_ratings.apply(lambda x:x["rating"] > 3, axis=1)
df_ratings['is_pos'].value_counts()

True     10915873
False     9177501
Name: is_pos, dtype: int64

In [61]:
df_ratings = df_ratings.groupby(["user_id", "is_pos"]).filter(lambda x: x["is_pos"].count() > 50)
len(df_ratings)

17040914

In [62]:
print(len(df_ratings["item_id"].unique()))
print(len(df_ratings["user_id"].unique()))

2627
118614


In [63]:
df_items.sort_values("prem_year", ascending=False, inplace=True)
df_items.head(10)

Unnamed: 0,item_id,item_name,pop_score,prem_year,genres
15947,40059,Golden Kamuy Season 3,1736,2020,"[Action, Adventure, Historical, Seinen]"
16968,42414,Eternity:Sweet Love Story,7027,2020,"[Ecchi, Romance]"
16256,40610,Healin' Good Pretty Cure,5931,2020,"[Action, Magic, Fantasy, Shoujo]"
16625,41380,"I'm standing on 1,000,000 lives.",1118,2020,"[Action, Game, Drama, Fantasy, Shounen]"
15418,39184,A3! Season Spring & Summer,3660,2020,"[Slice of Life, Drama]"
16245,40591,Kaguya-sama:Love is War Season 2,151,2020,"[Comedy, Psychological, Romance, School, Seinen]"
15130,38790,"BOFURI:I Don't Want to Get Hurt, so I'll Max O...",516,2020,"[Action, Game, Sci-Fi, Adventure, Comedy, Fant..."
16732,41638,Bite-Choicar,16027,2020,"[Cars, Kids]"
16426,40902,Food Wars! The Fifth Plate,524,2020,"[Ecchi, School, Shounen]"
15614,39463,Gleipnir,533,2020,"[Action, Mystery, Supernatural, Ecchi, Seinen]"


In [64]:
df_items.sort_values("pop_score", ascending=False, inplace=True)
df_items.head(10)

Unnamed: 0,item_id,item_name,pop_score,prem_year,genres
16854,42144,Jing-Ju Cats 2,17353,2017,"[Action, Adventure, Comedy, Kids, Fantasy]"
17062,42660,Robocar Poli 4,17258,2015,"[Cars, Comedy, Kids]"
17058,42654,Robocar Poli 3,17229,2014,"[Cars, Comedy, Kids]"
16856,42146,Jing-Ju Cats 3,17213,2018,"[Action, Adventure, Comedy, Kids, Martial Arts..."
16059,40273,Fuwa,17171,2007,[Historical]
17086,42740,Screechers Wild!,17160,2016,"[Action, Adventure, Cars]"
16740,41667,MONKART,17143,2017,"[Action, Adventure, Fantasy, Kids]"
16731,41635,GG Bond Season 16:Racing,17111,2019,"[Sci-Fi, Cars, Space, Super Power, Kids]"
17207,42998,Seven Lucky Gods,17039,2020,"[Comedy, Kids, Supernatural]"
14581,37941,Cocomong,16903,2008,[Kids]


In [65]:
out_dir = "/home/diego/chat-reranking/dataset/anime/"
df_ratings.to_csv(f"{out_dir}ratings17M.csv", index=False)

In [126]:
df_ratings["is_pos"].value_counts()

True     9706303
False    7334611
Name: is_pos, dtype: int64

DATASET STATISTICS:
- 17M ratings
- 2627 items
- 118614 users
- 143 ratings per user
- 6486 ratings per item
- 40 genres
- 94.5% sparsity 

In [141]:
a = pd.read_csv("/home/diego/chat-reranking/experiments/anime/fold_0/train_data.csv", sep="\t", names=["userid", "items_id", "rating"])
print(len(a))
a["rating"].value_counts()

13601564


True     11013277
False     2588287
Name: rating, dtype: int64

In [140]:
a = pd.read_csv("/home/diego/chat-reranking/experiments/anime/fold_0/train_val_data.csv", sep="\t", names=["userid", "items_id", "rating"])
print(len(a))
a["rating"].value_counts()

10881672


True     8810991
False    2070681
Name: rating, dtype: int64

In [32]:
rat = pd.read_csv("/home/diego/chat-reranking/dataset/anime/ratings17M.csv", sep=",")
print(max(rat["user_id"].unique()))
print(max(rat["item_id"].unique()))

353392
42883


<h2>Build genres from chatgpt<h2>

In [85]:
from ast import literal_eval

In [86]:
df_items = pd.read_csv("/home/diego/chat-reranking/experiments/anime/df_items.csv")
df_items["genres"] = df_items["genres"].apply(literal_eval)
df_items.columns

Index(['item_id', 'item_name', 'pop_score', 'prem_year', 'genres'], dtype='object')

In [4]:
item_names = df_items["item_name"].values

In [5]:
start = 0
end = len(item_names) 
step = 50
queries = []
for i in range(start, end, step): 
    x = i 
    queries.append(item_names[x:x+step])
print(f"# of queries: {len(queries)}")

# of queries: 55


In [24]:
import openai
import time

def query_chatgpt(user_prompt: str) -> str:
    messages = [{"role": "user", "content": user_prompt}]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        messages=messages,
        temperature=0,  # this is the degree of randomness of the model's output
    )
    output = response.choices[0].message["content"]
    return output

In [13]:
template_prompt = "You are given a list of 50 anime, delimited by triple backticks. Your task is to provide the genres for each anime in this list. Provide the output in the following format <anime name> {genres}\n\n```\n"
prompt = template_prompt
list_prompts = []
for query in queries:
    prompt = template_prompt
    for name in query:
        prompt += f"{name}\n"
    prompt += "```"
    list_prompts.append(prompt)

In [14]:
# print(list_prompts[54])

In [89]:
openai.api_key = ""
list_answers = []
with open('output_gpt.txt', 'a') as f:
    i = 0
    for p in list_prompts[29:]:
        if i % 3 == 0:
            time.sleep(120)
        out = query_chatgpt(p)
        list_answers.append(out)
        f.write(f"{out}\n")
        print(i)
        i += 1
        # time.sleep(20)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [22]:
print(f"len prompts: {len(list_prompts)}")

len prompts: 55


In [122]:
found_names = []
found_genres = []
gpt_dict = {}
with (open('raw_gpt_genres.txt', 'r') as f):
    for line in f.readlines():
        splitted = line.split('{')
        if len(splitted) > 1:  # the line contains a valid record
            i_name = splitted[0][:-1]
            i_genres = splitted[1][:-2].split(", ")  # this is a list of genres
            found_names.append(i_name)
            found_genres.append(i_genres)
            gpt_dict[i_name] = i_genres
print(f"# of names found: {len(found_names)}")
print(f"# of genres found: {len(found_genres)}")

# of names found: 2807
# of genres found: 2807


In [123]:
# now we have to check whether original item names overlap with the returned ones
print(f"# of orginal item names: {len(item_names)}")
retrieved_item_names = set(item_names).intersection(found_names)
print(f"# of found item names: {len(found_names)}")

# of orginal item names: 2711
# of found item names: 2807


In [124]:
gpt_genres = set()
for g in found_genres:
    gpt_genres = gpt_genres.union(g)
print(len(gpt_genres))

47


In [125]:
dataset_genres = set()
for g in df_items["genres"].values:
    # print(g)
    dataset_genres = dataset_genres.union(set(g))
    # print(dataset_genres)
print(len(dataset_genres))

40


In [126]:
# check the overlap between dataset genres and gpt genres
print(f"# of dataset genres: {len(dataset_genres)}")
print(f"# of gpt genres: {len(gpt_genres)}")
print(f"# of overlapping genres: {len(set(gpt_genres).intersection(dataset_genres))}")

# of dataset genres: 40
# of gpt genres: 47
# of overlapping genres: 40


In [127]:
# print the chatgpt genres
g_diff = set(gpt_genres).difference(dataset_genres)

In [129]:
g_diff

{'Family', 'Hentai', 'Medical', 'Slice of Lif', 'Unknown', 'Western', 'Yaoi'}

In [130]:
len(dataset_genres)

40

In total, GPT identifies 5 more genres = {'Family', 'Hentai', 'Medical', 'Western', 'Yaoi'}, (and an unknown one)

For each item, what is the overlap between gpt-based and dataset-based genres?

In [131]:
# load gen
dataset_dict = {}
for i, row in df_items.iterrows():
    dataset_dict[row["item_name"]] = row["genres"]
print(len(dataset_dict))

2711


In [132]:
common_names = set(df_items["item_name"].values).intersection(gpt_dict.keys())
print(f"# of common items identified: {len(common_names)}")

# of common items identified: 2711


In [133]:
import numpy as np

avg_diff = []
avg_inters = []
avg_gpt = []
avg_dataset = []
for name in common_names:
    avg_gpt.append(len(gpt_dict[name]))
    avg_dataset.append(len(dataset_dict[name]))
    avg_inters.append(len(set(dataset_dict[name]).intersection(gpt_dict[name])))
    avg_diff.append(len(set(dataset_dict[name]).union(gpt_dict[name])) - len(set(dataset_dict[name]).intersection(gpt_dict[name])))

print(f"average # of genres per item in dataset: {np.mean(avg_dataset)}")
print(f"average # of genres per item in gpt: {np.mean(avg_gpt)}")
print()
print(f"average intersection is: {np.mean(avg_inters)}")
print(f"average difference is {np.mean(avg_diff)}")

average # of genres per item in dataset: 3.968646255994098
average # of genres per item in gpt: 3.684618222058281

average intersection is: 3.102176318701586
average difference is 1.448911840649207


In [137]:
out_dir = "/home/diego/chat-reranking/experiments/anime/"
with open(f"{out_dir}itemname_to_id.pkl", 'rb') as fp:
    itemname_to_id = pickle.load(fp)

In [140]:
# prepare genres file for gpt
out_string = ""
i = 0
for name in gpt_dict:
    if name in itemname_to_id:
        iid = itemname_to_id[name]
        gs = gpt_dict[name]
        for genre in gs:
            if genre == "Slice of Lif":
                out_string += f"{iid}\tSlice of Life\n"
            else:
                out_string += f"{iid}\t{genre}\n"
            
        i+=1
print(f"# of items written: {i}")
with open(f"{out_dir}gpt_genres_file.txt", "w") as text_file:
    text_file.write(out_string)

# of items written: 2711


<h4>Some items are not successfully identified by chatgpt<h4>

In [113]:
different_names = set(df_items["item_name"].values).difference(gpt_dict.keys())
len(different_names)

296

In [119]:
different_names = list(different_names)
start = 0
end = len(different_names) 
step = 50
queries = []
for i in range(start, end, step): 
    x = i 
    queries.append(different_names[x:x+step])
print(f"# of queries: {len(queries)}")

# of queries: 6


In [120]:
template_prompt = "You are given a list of 50 anime, delimited by triple backticks. Your task is to provide the genres for each anime in this list. Provide the output in the following format <anime name> {genres}\n\n```\n"
prompt = template_prompt
list_prompts = []
for query in queries:
    prompt = template_prompt
    for name in query:
        prompt += f"{name}\n"
    prompt += "```"
    list_prompts.append(prompt)

In [121]:
openai.api_key = ""
list_answers = []
with open('output_gpt_2.txt', 'a') as f:
    i = 0
    for p in list_prompts:
        if i % 3 == 0:
            time.sleep(120)
        out = query_chatgpt(p)
        list_answers.append(out)
        f.write(f"{out}\n")
        print(i)
        i += 1
        # time.sleep(20)

0
1
2
3
4
5
