## Data Exploration

In [176]:
import pandas as pd
import xml.etree.ElementTree as ET

I exported my own MAL data around the middle of August 2022. The default format is XML.

In [177]:
path = 'animelist_1660694258_-_14855571.xml'

In [178]:
tree = ET.parse(path)
tree

<xml.etree.ElementTree.ElementTree at 0x7f9f99746ee0>

In [179]:
root = tree.getroot()

The XML is structured as follows:
- myanimelist (root)
    - myinfo
    - anime
    - anime
    - ...
    - anime
    
So we want the anime nodes

In [180]:
# see this stackoverflow answer: https://stackoverflow.com/questions/7684333/converting-xml-to-dictionary-using-elementtree

from collections import defaultdict


def etree_to_dict(t):
    d = {t.tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in map(etree_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        d = {t.tag: {k: v[0] if len(v) == 1 else v
                     for k, v in dd.items()}}
    if t.attrib:
        d[t.tag].update(('@' + k, v)
                        for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            if text:
                d[t.tag]['#text'] = text
        else:
            d[t.tag] = text
    return d

In [181]:
d = etree_to_dict(root)

In [182]:
len(d['myanimelist']['anime'])

318

In [183]:
a = d['myanimelist']['anime'][0]
a

{'series_animedb_id': '48',
 'series_title': '.hack//Sign',
 'series_type': 'TV',
 'series_episodes': '26',
 'my_id': '0',
 'my_watched_episodes': '0',
 'my_start_date': '0000-00-00',
 'my_finish_date': '0000-00-00',
 'my_rated': None,
 'my_score': '0',
 'my_storage': None,
 'my_storage_value': '0.00',
 'my_status': 'Plan to Watch',
 'my_comments': None,
 'my_times_watched': '0',
 'my_rewatch_value': None,
 'my_priority': 'LOW',
 'my_tags': None,
 'my_rewatching': '0',
 'my_rewatching_ep': '0',
 'my_discuss': '1',
 'my_sns': 'default',
 'update_on_import': '0'}

In [184]:
anime = d['myanimelist']['anime']

In [185]:
df = pd.DataFrame(anime)

In [186]:
df

Unnamed: 0,series_animedb_id,series_title,series_type,series_episodes,my_id,my_watched_episodes,my_start_date,my_finish_date,my_rated,my_score,...,my_comments,my_times_watched,my_rewatch_value,my_priority,my_tags,my_rewatching,my_rewatching_ep,my_discuss,my_sns,update_on_import
0,48,.hack//Sign,TV,26,0,0,0000-00-00,0000-00-00,,0,...,,0,,LOW,,0,0,1,default,0
1,31646,3-gatsu no Lion,TV,22,0,22,0000-00-00,0000-00-00,,8,...,,0,,LOW,,0,0,1,default,0
2,35180,3-gatsu no Lion 2nd Season,TV,22,0,22,0000-00-00,0000-00-00,,8,...,,0,,LOW,,0,0,1,default,0
3,36793,3D Kanojo: Real Girl,TV,12,0,2,0000-00-00,0000-00-00,,4,...,,0,,LOW,,0,0,1,default,0
4,38101,5-toubun no Hanayome,TV,12,0,12,0000-00-00,0000-00-00,,5,...,,0,,LOW,,0,0,1,default,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,35507,Youkoso Jitsuryoku Shijou Shugi no Kyoushitsu e,TV,12,0,12,0000-00-00,0000-00-00,,8,...,,0,,LOW,,0,0,1,default,0
314,51096,Youkoso Jitsuryoku Shijou Shugi no Kyoushitsu ...,TV,13,0,0,0000-00-00,0000-00-00,,0,...,,0,,LOW,,0,0,1,default,0
315,40911,Yuukoku no Moriarty,TV,11,0,3,0000-00-00,0000-00-00,,5,...,,0,,LOW,,0,0,1,default,0
316,23283,Zankyou no Terror,TV,11,0,0,0000-00-00,0000-00-00,,0,...,,0,,LOW,,0,0,1,default,0


In [187]:
# massage the data a little
df.my_score = pd.to_numeric(df.my_score)

MAL assigns a score of 0 to anime that I have not rated

In [188]:
df[df.my_score > 0].sort_values(by='my_score', ascending=False)

Unnamed: 0,series_animedb_id,series_title,series_type,series_episodes,my_id,my_watched_episodes,my_start_date,my_finish_date,my_rated,my_score,...,my_comments,my_times_watched,my_rewatch_value,my_priority,my_tags,my_rewatching,my_rewatching_ep,my_discuss,my_sns,update_on_import
74,5114,Fullmetal Alchemist: Brotherhood,TV,64,0,64,0000-00-00,0000-00-00,,10,...,,0,,LOW,,0,0,1,default,0
145,36098,Kimi no Suizou wo Tabetai,Movie,1,0,1,0000-00-00,0000-00-00,,10,...,,0,,LOW,,0,0,1,default,0
58,1535,Death Note,TV,37,0,37,0000-00-00,0000-00-00,,10,...,,0,,LOW,,0,0,1,default,0
144,32281,Kimi no Na wa.,Movie,1,0,1,0000-00-00,0000-00-00,,10,...,,0,,LOW,,0,0,1,default,0
6,41457,86,TV,11,0,11,0000-00-00,0000-00-00,,10,...,,0,,LOW,,0,0,1,default,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,4224,Toradora!,TV,25,0,11,0000-00-00,0000-00-00,,1,...,,0,,LOW,,0,0,1,default,0
84,245,Great Teacher Onizuka,TV,43,0,0,0000-00-00,0000-00-00,,1,...,,0,,LOW,,0,0,1,default,0
143,6045,Kimi ni Todoke,TV,25,0,0,0000-00-00,0000-00-00,,1,...,,0,,LOW,,0,0,1,default,0
268,14289,Sukitte Ii na yo.,TV,13,0,0,0000-00-00,0000-00-00,,1,...,,0,,LOW,,0,0,1,default,0


In [220]:
df.columns

Index(['series_animedb_id', 'series_title', 'series_type', 'series_episodes',
       'my_id', 'my_watched_episodes', 'my_start_date', 'my_finish_date',
       'my_rated', 'my_score', 'my_storage', 'my_storage_value', 'my_status',
       'my_comments', 'my_times_watched', 'my_rewatch_value', 'my_priority',
       'my_tags', 'my_rewatching', 'my_rewatching_ep', 'my_discuss', 'my_sns',
       'update_on_import'],
      dtype='object')

So I've added just over 300 anime and rated 200 of them

## Using the MAL API

The MAL API is public for public data such as info about shows. See this forum post: https://myanimelist.net/forum/?topicid=1973077
The header value can be found here: https://github.com/SuperMarcus/myanimelist-api-specification

In [190]:
import requests

In [191]:
def query_mal(show_id):
    """The show's ID will be a string representing an integer"""
    
    # Android app
    MAL_CLIENT_ID = '6114d00ca681b7701d1e15fe11a4987e'
    
     # 'https://api.myanimelist.net/v2/anime/30230?fields=id,title,main_picture,alternative_titles,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,my_list_status,num_episodes,start_season,broadcast,source,average_episode_duration,rating,pictures,background,related_anime,related_manga,recommendations,studios,statistics' \
    url = f'https://api.myanimelist.net/v2/anime/{show_id}?fields=id,title,genres,alternative_titles,start_date,end_date,mean,studios,rank,popularity,num_list_users,num_scoring_users,nsfw,status,my_list_status,num_episodes,start_season'
    print(url)
    r = requests.get(url, headers={
        'X-MAL-Client-ID': MAL_CLIENT_ID,
    })
    return r.json()

# query_mal(23273)

In [192]:
!mkdir data

mkdir: data: File exists


In [196]:
!mkdir data/anime-info

mkdir: data/anime-info: File exists


In [197]:
import json
from collections import Counter
import os

In [195]:
# there are 318 anime in my watch list
# I can probably just get the data for all of them
df.shape

(318, 23)

Grab every conceivable interesting bit from MAL

In [198]:
def read_anime_stats(id):
    fname = f'data/anime-info/{id}.json'
    stats = {}
    with open(fname) as fp:
        stats = json.load(fp)
    return stats

def save_anime_stats(id, stats):
    fname = f'data/anime-info/{id}.json'
    with open(fname, 'w') as fp:
        json.dump(stats, fp, indent=4, sort_keys=True)

In [202]:
def crawl_mal(df):
    """The parameter is a dataframe which must have a row called series_animedb_id"""
    
    anime_info = {}
    
    for i, row in df.iterrows():
        id = row['series_animedb_id']
        fname = f'data/anime-info/{id}.json'
        if os.path.exists(fname):
            # print(f'Already collected info for anime {id}')
            v = read_anime_stats(id)
        else:
            v = query_mal(id)
            save_anime_stats(id, v)

        anime_info[id] = v
            
    return anime_info

anime_info = crawl_mal(df)
len(anime_info)

318

In [203]:
k = list(anime_info.keys())[1]
anime_info[k]

{'alternative_titles': {'en': 'March Comes In Like a Lion',
  'ja': '3月のライオン',
  'synonyms': ['Sangatsu no Lion']},
 'end_date': '2017-03-18',
 'genres': [{'id': 53, 'name': 'Childcare'},
  {'id': 8, 'name': 'Drama'},
  {'id': 63, 'name': 'Iyashikei'},
  {'id': 42, 'name': 'Seinen'},
  {'id': 36, 'name': 'Slice of Life'},
  {'id': 11, 'name': 'Strategy Game'}],
 'id': 31646,
 'main_picture': {'large': 'https://api-cdn.myanimelist.net/images/anime/6/82898l.jpg',
  'medium': 'https://api-cdn.myanimelist.net/images/anime/6/82898.jpg'},
 'mean': 8.39,
 'nsfw': 'white',
 'num_episodes': 22,
 'num_list_users': 616594,
 'num_scoring_users': 256533,
 'popularity': 279,
 'rank': 169,
 'start_date': '2016-10-08',
 'start_season': {'season': 'fall', 'year': 2016},
 'status': 'finished_airing',
 'studios': [{'id': 44, 'name': 'Shaft'}],
 'title': '3-gatsu no Lion'}

In [204]:
df[df.series_animedb_id == '31646'].my_score

1    8
Name: my_score, dtype: int64

In [205]:
# compute the frequency of each tag in the list

def get_genre_freq(anime_info: dict, subset = None):
    # anime info should map the anime's ID to its information from MAL
    # the subset should be a set of IDs (or None)
    assert isinstance(anime_info, dict)
    
    genre_counts = Counter()

    for id, v in anime_info.items():
        if subset and id not in subset:
            continue
        # count each genre here once
        for genre in v['genres']:
            name = genre['name']
            genre_counts.setdefault(name, 0)
            genre_counts[name] += 1

    return genre_counts
            
genre_counts = get_genre_freq(anime_info)
genre_counts.most_common(20)

[('Romance', 154),
 ('Drama', 131),
 ('School', 128),
 ('Comedy', 117),
 ('Shounen', 69),
 ('Action', 59),
 ('Slice of Life', 57),
 ('Fantasy', 53),
 ('Supernatural', 53),
 ('Mystery', 34),
 ('Sci-Fi', 33),
 ('Seinen', 33),
 ('Psychological', 33),
 ('Adventure', 28),
 ('Love Polygon', 28),
 ('Ecchi', 26),
 ('Adult Cast', 21),
 ('Shoujo', 20),
 ('Harem', 20),
 ('Suspense', 19)]

In [206]:
# in my dataset, Romance and Drama are over-represented
# let's say "good" for me is 8 or higher

good_anime = df[df.my_score >= 8]
len(good_anime)

51

In [207]:
good_anime_ids = set(good_anime.series_animedb_id.values)
# good_anime_ids
good_genre_counts = get_genre_freq(anime_info, good_anime_ids)
len(good_genre_counts)

41

In [208]:
good_genre_counts.most_common(20)

[('Drama', 28),
 ('Romance', 20),
 ('School', 16),
 ('Slice of Life', 13),
 ('Supernatural', 11),
 ('Fantasy', 9),
 ('Comedy', 9),
 ('Action', 8),
 ('Shounen', 8),
 ('Mystery', 7),
 ('Seinen', 6),
 ('Adventure', 5),
 ('Iyashikei', 4),
 ('Sci-Fi', 4),
 ('Vampire', 4),
 ('Psychological', 4),
 ('Love Polygon', 4),
 ('Childcare', 3),
 ('Military', 3),
 ('Visual Arts', 3)]

In [209]:
g = pd.DataFrame(data = genre_counts.items(), columns=['genre', 'all_anime_freq']).sort_values('genre').set_index('genre')
g

Unnamed: 0_level_0,all_anime_freq
genre,Unnamed: 1_level_1
Action,59
Adult Cast,21
Adventure,28
Anthropomorphic,2
Avant Garde,4
...,...
Time Travel,13
Vampire,7
Video Game,4
Visual Arts,10


In [224]:
g2 = pd.DataFrame(data=good_genre_counts.items(), columns=['genre', 'good_anime_freq']).sort_values('genre').set_index('genre')
g2.sort_values(by='good_anime_freq', ascending=False).head(10)

Unnamed: 0_level_0,good_anime_freq
genre,Unnamed: 1_level_1
Drama,28
Romance,20
School,16
Slice of Life,13
Supernatural,11
Comedy,9
Fantasy,9
Action,8
Shounen,8
Mystery,7


I'm surprised there are so few Mystery anime here

In [222]:
g2.shape

(41, 1)

In [211]:
g3 = g.join(g2).fillna(0)
g3

Unnamed: 0_level_0,all_anime_freq,good_anime_freq
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Action,59,8.0
Adult Cast,21,0.0
Adventure,28,5.0
Anthropomorphic,2,1.0
Avant Garde,4,0.0
...,...,...
Time Travel,13,2.0
Vampire,7,4.0
Video Game,4,0.0
Visual Arts,10,3.0


In [215]:

g3['good_ratio'] = g3.good_anime_freq / g3.all_anime_freq
# add 1 to denominator to avoid small values dominating results
g3['good_ratio_adj'] = g3.good_anime_freq / (g3.all_anime_freq + 2)


g3.sort_values('good_ratio_adj', ascending=False).head(7)

Unnamed: 0_level_0,all_anime_freq,good_anime_freq,good_ratio,good_ratio_adj
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Vampire,7,4.0,0.571429,0.444444
Award Winning,1,1.0,1.0,0.333333
Iyashikei,11,4.0,0.363636,0.307692
Childcare,9,3.0,0.333333,0.272727
Anthropomorphic,2,1.0,0.5,0.25
Visual Arts,10,3.0,0.3,0.25
Strategy Game,7,2.0,0.285714,0.222222


So these are my top anime tags?

In [226]:
def find_all_with_theme(theme):
    for id, v in anime_info.items():
        for genre in v['genres']:
            if genre['name'] == theme:
                yield (v['title'], get_english_title(id))
    
adult_cast = list(find_all_with_theme('Adult Cast'))
adult_cast

[('Black Lagoon', 'Black Lagoon'),
 ('Danna ga Nani wo Itteiru ka Wakaranai Ken',
  "I Can't Understand What My Husband Is Saying"),
 ('Death Parade', 'Death Parade'),
 ('Gokushufudou', 'The Way of the Househusband'),
 ('Jin-Rou', 'Jin-Roh: The Wolf Brigade'),
 ('Koukaku Kidoutai: Stand Alone Complex',
  'Ghost in the Shell: Stand Alone Complex'),
 ('Log Horizon', 'Log Horizon'),
 ('Monster', 'Monster'),
 ('Nana', 'Nana'),
 ('Natsuyuki Rendezvous', 'Natsuyuki Rendezvous'),
 ('Net-juu no Susume', 'Recovery of an MMO Junkie'),
 ('Ookami to Koushinryou', 'Spice and Wolf'),
 ('Paprika', 'Paprika'),
 ('Paripi Koumei', 'Ya Boy Kongming!'),
 ('Perfect Blue', 'Perfect Blue'),
 ('Psycho-Pass', 'Psycho-Pass'),
 ('Seikaisuru Kado', 'KADO: The Right Answer'),
 ('Shirobako', 'Shirobako'),
 ('Shouwa Genroku Rakugo Shinjuu: Sukeroku Futatabi-hen',
  'Descending Stories: Showa Genroku Rakugo Shinju'),
 ('Uchuu Kyoudai', 'Space Brothers'),
 ('Wotaku ni Koi wa Muzukashii', 'Wotakoi: Love is Hard for Ota

For some reason, Golden Time doesn't have the "Adult Cast" tag. Also Bakuman season 3. Also Golden Time, and the other university anime. And anime where the characters age in, like The Wind Rises.

In [217]:
list(find_all_with_theme('Award Winning'))

[('Sen to Chihiro no Kamikakushi', 'Spirited Away')]

Spirited Away is the only award winning anime on MAL?

In [218]:
list(find_all_with_theme('Vampire'))

[('Bakemonogatari', 'Bakemonogatari'),
 ('Kizumonogatari I: Tekketsu-hen', 'Kizumonogatari Part 1: Iron-Blooded'),
 ('Kizumonogatari II: Nekketsu-hen', 'Kizumonogatari Part 2: Hot-Blooded'),
 ('Kizumonogatari III: Reiketsu-hen', 'Kizumonogatari Part 3: Cold-Blooded'),
 ('Monogatari Series: Second Season', 'Monogatari Series: Second Season'),
 ('Owarimonogatari', 'Owarimonogatari'),
 ('Vanitas no Karte', 'The Case Study of Vanitas')]

It makes sense why I like Vampire films - basically every vampire anime on my list is from Monogatari

In [219]:
list(find_all_with_theme('Iyashikei'))

[('3-gatsu no Lion', 'March Comes In Like a Lion'),
 ('3-gatsu no Lion 2nd Season', 'March Comes In Like A Lion 2nd Season'),
 ('Acchi Kocchi', 'Place to Place'),
 ('Aharen-san wa Hakarenai', 'Aharen-san wa Hakarenai'),
 ('Amaama to Inazuma', 'Sweetness & Lightning'),
 ('Barakamon', 'Barakamon'),
 ('Flying Witch', 'Flying Witch'),
 ('Honobono Log', ''),
 ('Karakai Jouzu no Takagi-san 2', 'Teasing Master Takagi-san 2'),
 ('Tanaka-kun wa Itsumo Kedaruge', 'Tanaka-kun is Always Listless'),
 ('Usagi Drop', 'Bunny Drop')]

How the f is 3-gatsu a healing anime??

In [225]:
list(find_all_with_theme('Mystery'))

[('.hack//Sign', '.hack//Sign'),
 ('Ajin', 'Ajin: Demi-Human'),
 ('Another', 'Another'),
 ('Bakemonogatari', 'Bakemonogatari'),
 ('Boku dake ga Inai Machi', 'ERASED'),
 ('ef: A Tale of Memories.', 'ef - a tale of memories.'),
 ('Gosick', ''),
 ('Great Pretender', ''),
 ('Higashi no Eden', 'Eden of The East'),
 ('Houseki no Kuni', 'Land of the Lustrous'),
 ('Hyouka', 'Hyouka'),
 ('Kaiba', 'Kaiba'),
 ('Kakegurui', 'Kakegurui'),
 ('Kizumonogatari II: Nekketsu-hen', 'Kizumonogatari Part 2: Hot-Blooded'),
 ('Kizumonogatari III: Reiketsu-hen', 'Kizumonogatari Part 3: Cold-Blooded'),
 ('Monogatari Series: Second Season', 'Monogatari Series: Second Season'),
 ('Monster', 'Monster'),
 ('Nisemonogatari', 'Nisemonogatari'),
 ('Odd Taxi', 'Odd Taxi'),
 ('Owarimonogatari', 'Owarimonogatari'),
 ('Paprika', 'Paprika'),
 ('Sakurada Reset', 'Sagrada Reset'),
 ('Serial Experiments Lain', 'Serial Experiments Lain'),
 ('Shinsekai yori', 'From the New World'),
 ('Summertime Render', 'Summer Time Rendering'

In [227]:
# how is this not a mystery?
anime_info['37450']

{'alternative_titles': {'en': 'Rascal Does Not Dream of Bunny Girl Senpai',
  'ja': '青春ブタ野郎はバニーガール先輩の夢を見ない',
  'synonyms': ['AoButa']},
 'end_date': '2018-12-27',
 'genres': [{'id': 4, 'name': 'Comedy'},
  {'id': 8, 'name': 'Drama'},
  {'id': 22, 'name': 'Romance'},
  {'id': 23, 'name': 'School'},
  {'id': 37, 'name': 'Supernatural'}],
 'id': 37450,
 'main_picture': {'large': 'https://api-cdn.myanimelist.net/images/anime/1301/93586l.jpg',
  'medium': 'https://api-cdn.myanimelist.net/images/anime/1301/93586.jpg'},
 'mean': 8.26,
 'nsfw': 'white',
 'num_episodes': 13,
 'num_list_users': 1502209,
 'num_scoring_users': 961555,
 'popularity': 54,
 'rank': 267,
 'start_date': '2018-10-04',
 'start_season': {'season': 'fall', 'year': 2018},
 'status': 'finished_airing',
 'studios': [{'id': 1835, 'name': 'CloverWorks'}],
 'title': 'Seishun Buta Yarou wa Bunny Girl Senpai no Yume wo Minai'}

## Biggest Controversial Picks

What anime do I like that others dislike?

In [105]:
# for every anime for which I have a rating, extract that rating
rated_df = df[df.my_score > 0].copy()
rated_df.shape

(200, 23)

In [106]:
# OK we have a rating for 200 anime
pd.unique(rated_df.my_status)

array(['Completed', 'Dropped', 'On-Hold', 'Watching'], dtype=object)

I rated a few anime that I haven't finished watching. Let's see what those are.

In [107]:
rated_df[rated_df.my_status == 'Watching']

Unnamed: 0,series_animedb_id,series_title,series_type,series_episodes,my_id,my_watched_episodes,my_start_date,my_finish_date,my_rated,my_score,...,my_comments,my_times_watched,my_rewatch_value,my_priority,my_tags,my_rewatching,my_rewatching_ep,my_discuss,my_sns,update_on_import
269,47194,Summertime Render,TV,25,0,14,0000-00-00,0000-00-00,,9,...,,0,,LOW,,0,0,1,default,0


In [108]:
rated_df[rated_df.my_status == 'On-Hold']

Unnamed: 0,series_animedb_id,series_title,series_type,series_episodes,my_id,my_watched_episodes,my_start_date,my_finish_date,my_rated,my_score,...,my_comments,my_times_watched,my_rewatch_value,my_priority,my_tags,my_rewatching,my_rewatching_ep,my_discuss,my_sns,update_on_import
234,34102,Sakurada Reset,TV,24,0,4,0000-00-00,0000-00-00,,5,...,,0,,LOW,,0,0,0,default,0


I think that's fine

In [109]:
mean_rating = rated_df.series_animedb_id.apply(lambda id: anime_info[id]['mean'])
mean_rating

1      8.39
2      8.95
3      6.84
4      7.68
5      8.11
       ... 
308    8.04
310    7.57
313    7.86
315    8.15
317    7.23
Name: series_animedb_id, Length: 200, dtype: float64

In [110]:
rated_df['mean_rating'] = mean_rating

In [111]:
rated_df['rating_diff'] = rated_df.my_score - rated_df.mean_rating

These are the ones I most overvalue relative to actual rating.

No surprise that these are my 9s and 10s

In [121]:
rated_df.sort_values('rating_diff', ascending=False).head(20)\
[['series_animedb_id', 'series_title', 'my_score', 'mean_rating', 'rating_diff']]

Unnamed: 0,series_animedb_id,series_title,my_score,mean_rating,rating_diff
6,41457,86,10,8.25,1.75
26,38753,Araburu Kisetsu no Otome-domo yo.,9,7.37,1.63
100,578,Hotaru no Haka,10,8.5,1.5
145,36098,Kimi no Suizou wo Tabetai,10,8.56,1.44
58,1535,Death Note,10,8.62,1.38
300,18245,White Album 2,9,7.63,1.37
244,23273,Shigatsu wa Kimi no Uso,10,8.66,1.34
241,199,Sen to Chihiro no Kamikakushi,10,8.78,1.22
144,32281,Kimi no Na wa.,10,8.86,1.14
105,12189,Hyouka,9,8.09,0.91


The better question is what do I think is decent that MAL really doesn't like?

In [122]:
rated_df[(rated_df.my_score >= 7) & (rated_df.mean_rating < 7)]

Unnamed: 0,series_animedb_id,series_title,series_type,series_episodes,my_id,my_watched_episodes,my_start_date,my_finish_date,my_rated,my_score,...,my_rewatch_value,my_priority,my_tags,my_rewatching,my_rewatching_ep,my_discuss,my_sns,update_on_import,mean_rating,rating_diff


Absolutely nothing

In [123]:
rated_df[(rated_df.my_score >= 8) & (rated_df.mean_rating < 7.5)]

Unnamed: 0,series_animedb_id,series_title,series_type,series_episodes,my_id,my_watched_episodes,my_start_date,my_finish_date,my_rated,my_score,...,my_rewatch_value,my_priority,my_tags,my_rewatching,my_rewatching_ep,my_discuss,my_sns,update_on_import,mean_rating,rating_diff
26,38753,Araburu Kisetsu no Otome-domo yo.,TV,12,0,12,0000-00-00,0000-00-00,,9,...,,LOW,,0,0,1,default,0,7.37,1.63
97,33241,Honobono Log,TV,10,0,10,0000-00-00,0000-00-00,,8,...,,LOW,,0,0,1,default,0,7.35,0.65
227,38992,Rikei ga Koi ni Ochita no de Shoumei shitemita.,TV,12,0,12,0000-00-00,0000-00-00,,8,...,,LOW,,0,0,1,default,0,7.37,0.63
277,47159,Tensai Ouji no Akaji Kokka Saisei Jutsu,TV,12,0,12,0000-00-00,0000-00-00,,8,...,,LOW,,0,0,1,default,0,7.41,0.59


In [127]:
def get_english_title(id):
    return anime_info[id]['alternative_titles']['en']

# I'm not good with the Japanese titles
get_english_title('38753')

'O Maidens in Your Savage Season'

In [128]:
get_english_title('47159')

"The Genius Prince's Guide to Raising a Nation Out of Debt"

In [129]:
get_english_title('38992')

'Science Fell in Love, So I Tried to Prove It'

### most trollish opinions

Now for the other direction. I think this is going to be a bunch of Tsundere bullshit

In [175]:
rated_df.sort_values('rating_diff').head(20)\
[['series_animedb_id', 'series_title', 'my_score', 'mean_rating', 'rating_diff']]

Unnamed: 0,series_animedb_id,series_title,my_score,mean_rating,rating_diff
84,245,Great Teacher Onizuka,1,8.69,-7.69
287,4224,Toradora!,1,8.1,-7.1
126,7054,Kaichou wa Maid-sama!,1,8.01,-7.01
143,6045,Kimi ni Todoke,1,8.0,-7.0
130,8525,Kami nomi zo Shiru Sekai,1,7.67,-6.67
268,14289,Sukitte Ii na yo.,1,7.43,-6.43
186,41619,Munou na Nana,1,7.2,-6.2
284,14227,Tonari no Kaibutsu-kun,2,7.49,-5.49
229,23277,Saenai Heroine no Sodatekata,2,7.49,-5.49
142,147,Kimi ga Nozomu Eien,2,7.2,-5.2


In [None]:
rated_df['english_title'] = rated_df.series_animedb_id.apply(get_english_title)

## other people's MAL

There may be some bias in my MAL (list). So to normalize let's get some other people's lists.

In [268]:
!mkdir data/other-peoples-lists

In [269]:
def save_anime_list(username: str, d: dict):
    with open(f'data/other-peoples-lists/{username}.json', 'w') as fp:
        json.dump(d, fp, indent=4, sort_keys=True)
        
def read_anime_list(username: str) -> dict:
    d = {}
    with open(f'data/other-peoples-lists/{username}.json') as fp:
        d = json.load(fp)
    return d

def fetch_other_user_list(username, limit=400):

    MAL_CLIENT_ID = '6114d00ca681b7701d1e15fe11a4987e'
    # url = f'https://api.myanimelist.net/v2/users/{username}/animelist?fields=list_status&limit={limit}&status=completed'
    # only fetch completed anime
    url = f'https://api.myanimelist.net/v2/users/{username}/animelist?fields=list_status&limit={limit}'
    print(url)
    r = requests.get(url, headers={
        'X-MAL-Client-ID': MAL_CLIENT_ID,
    })
    anime_list = r.json()
    save_anime_list(username, anime_list)
    return anime_list

In [270]:
# random guy on Reddit
l = fetch_other_user_list('chiliehead')
len(l)

https://api.myanimelist.net/v2/users/chiliehead/animelist?fields=list_status&limit=400


2

In [271]:
# another random guy on reddit
l = fetch_other_user_list('RiverSorcerer')
len(l)

https://api.myanimelist.net/v2/users/RiverSorcerer/animelist?fields=list_status&limit=400


2

In [272]:
# me
l = fetch_other_user_list('mister_baseball')
len(l)

https://api.myanimelist.net/v2/users/mister_baseball/animelist?fields=list_status&limit=400


2

In [273]:
l.keys()

dict_keys(['data', 'paging'])

In [274]:
len(l['data'])

315

In [275]:
l = fetch_other_user_list('Nomar_95')
len(l)

https://api.myanimelist.net/v2/users/Nomar_95/animelist?fields=list_status&limit=400


2

In [276]:
# few more usernames
# LeonKevlar does stitches
for username in ['awspear', 'Aochhi', '20thcbnow', 'aniMayor', 'Blueserphant', 'Kendots', 'scot911', 'LeonKevlar', 'marcopolos059']:
    fetch_other_user_list(username)

https://api.myanimelist.net/v2/users/awspear/animelist?fields=list_status&limit=400
https://api.myanimelist.net/v2/users/Aochhi/animelist?fields=list_status&limit=400
https://api.myanimelist.net/v2/users/20thcbnow/animelist?fields=list_status&limit=400
https://api.myanimelist.net/v2/users/aniMayor/animelist?fields=list_status&limit=400
https://api.myanimelist.net/v2/users/Blueserphant/animelist?fields=list_status&limit=400
https://api.myanimelist.net/v2/users/Kendots/animelist?fields=list_status&limit=400
https://api.myanimelist.net/v2/users/scot911/animelist?fields=list_status&limit=400
https://api.myanimelist.net/v2/users/LeonKevlar/animelist?fields=list_status&limit=400
https://api.myanimelist.net/v2/users/marcopolos059/animelist?fields=list_status&limit=400


In [280]:
other_lists = {}

for fname in os.listdir('data/other-peoples-lists'):
    username, ext = os.path.splitext(fname)
    if username != 'mister_baseball':
        other_lists[username] = read_anime_list(username)
    print(username)
    
len(other_lists)

Nomar_95
RiverSorcerer
LeonKevlar
chiliehead
marcopolos059
mister_baseball
Aochhi
aniMayor
20thcbnow
Blueserphant
Kendots
awspear
scot911


12

The result has these main fields

- data -> a list
    - each entry has elements list_status and node
        - node has information about the actual anime. We care about the id
- paging

In [285]:
other_lists['LeonKevlar']['data'][0]['node']

{'id': 41380,
 'main_picture': {'large': 'https://api-cdn.myanimelist.net/images/anime/1506/117717l.jpg',
  'medium': 'https://api-cdn.myanimelist.net/images/anime/1506/117717.jpg'},
 'title': '100-man no Inochi no Ue ni Ore wa Tatteiru'}

In [287]:
all_ids = Counter()

for username, other_list in other_lists.items():
    for anime in other_list['data']:
        id = anime['node']['id']
        all_ids.setdefault(id, 0)
        all_ids[id] += 1

# for each item
all_ids.most_common(20)

[(8676, 11),
 (6547, 11),
 (24833, 11),
 (2251, 11),
 (889, 11),
 (31964, 11),
 (33486, 11),
 (1575, 11),
 (1, 11),
 (10087, 11),
 (31646, 10),
 (41457, 10),
 (41433, 10),
 (477, 10),
 (5081, 10),
 (31043, 10),
 (2167, 10),
 (790, 10),
 (11741, 10),
 (5114, 10)]

In [290]:
all_ids

Counter({3914: 3,
         31646: 10,
         35180: 6,
         28789: 2,
         38101: 7,
         39783: 3,
         48548: 2,
         41457: 10,
         48569: 6,
         32998: 9,
         49: 2,
         33337: 7,
         39610: 1,
         11759: 7,
         311: 4,
         313: 2,
         314: 2,
         312: 2,
         39790: 8,
         39093: 1,
         1292: 4,
         32977: 2,
         36904: 3,
         38815: 1,
         34881: 8,
         101: 6,
         31580: 2,
         33253: 1,
         30123: 8,
         31173: 5,
         283: 4,
         22199: 9,
         25013: 7,
         47: 7,
         40054: 3,
         36864: 6,
         16201: 5,
         41433: 10,
         19429: 2,
         22729: 5,
         27655: 2,
         1177: 7,
         17901: 2,
         8676: 11,
         11235: 5,
         9925: 6,
         22147: 7,
         2356: 2,
         24531: 2,
         6547: 11,
         10067: 2,
         5251: 4,
         3080: 3,
         9989: 

In [293]:
other_anime_df = pd.DataFrame(data=all_ids.items(), columns=['series_animedb_id', 'count'])
other_anime_df

Unnamed: 0,series_animedb_id,count
0,3914,3
1,31646,10
2,35180,6
3,28789,2
4,38101,7
...,...,...
2299,14653,1
2300,16982,1
2301,9958,1
2302,23153,1


In [294]:
crawl_mal(other_anime_df)

https://api.myanimelist.net/v2/anime/3914?fields=id,title,genres,alternative_titles,start_date,end_date,mean,studios,rank,popularity,num_list_users,num_scoring_users,nsfw,status,my_list_status,num_episodes,start_season
https://api.myanimelist.net/v2/anime/28789?fields=id,title,genres,alternative_titles,start_date,end_date,mean,studios,rank,popularity,num_list_users,num_scoring_users,nsfw,status,my_list_status,num_episodes,start_season
https://api.myanimelist.net/v2/anime/48548?fields=id,title,genres,alternative_titles,start_date,end_date,mean,studios,rank,popularity,num_list_users,num_scoring_users,nsfw,status,my_list_status,num_episodes,start_season
https://api.myanimelist.net/v2/anime/49?fields=id,title,genres,alternative_titles,start_date,end_date,mean,studios,rank,popularity,num_list_users,num_scoring_users,nsfw,status,my_list_status,num_episodes,start_season
https://api.myanimelist.net/v2/anime/33337?fields=id,title,genres,alternative_titles,start_date,end_date,mean,studios,rank,p

KeyboardInterrupt: 