In [63]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import datetime
from tqdm import tqdm
from copy import deepcopy

%config InlineBackend.figure_format = 'svg'
%matplotlib inline
sns.set(color_codes=True)
plt.style.use('seaborn-colorblind')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 2)

## Data loading/pickling ##

In [20]:
# combined_animelist contains all 120,000 user reviews, but many of them were not scraped
# properly or they have no reviews at all.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you created yourself.
with open('../pickles/combined_animelist.pkl', 'rb') as to_read:
    combined_animelists = pickle.load(to_read)

In [68]:
# Load the pickled top-anime table (presumably a pandas DataFrame -- its 'title_main' column
# is read later in this notebook).
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you created yourself.
with open('../pickles/top_anime_data_1000_df.pkl', 'rb') as to_read:
    top_anime_df = pickle.load(to_read)

In [83]:
# Persist the per-user score records that cover every anime title.
# NOTE: user_score_dicts is only built further down in this notebook, so on a fresh kernel
# this cell has to be run after the cell that creates it.
with open('../pickles/user_score_dicts_all_anime.pkl', 'wb') as to_write:
    pickle.dump(user_score_dicts, to_write)

In [91]:
# Persist the per-user score records restricted to the top 1,000 anime titles.
# NOTE: user_score_dicts_top_1000_anime is only built further down in this notebook, so on a
# fresh kernel this cell has to be run after the cell that creates it.
with open('../pickles/user_score_dicts_top_1000_anime.pkl', 'wb') as to_write:
    pickle.dump(user_score_dicts_top_1000_anime, to_write)

In [80]:
# Titles from the 'title_main' column of top_anime_df, as a plain Python list
top_1000_anime_titles = top_anime_df['title_main'].tolist()

## Data cleaning ##
1. Filter out any invalid animelists (where there are null entries or where the web scraping didn't work).
2. For any animelist where the anime titles were scraped but the scores were not, fill in the scores with 0s.
3. Convert all scores that are dashes (`-`) into 0s.

In [23]:
# Keep only the animelists that hold real scraped titles: drop records whose title list is
# empty/None and records that still contain the un-rendered '${ item.anime_title }' placeholder.
valid_animelists = [
    scraped
    for scraped in combined_animelists
    if scraped['animelist_titles']
    and scraped['animelist_titles'] != ['${ item.anime_title }']
]

In [26]:
# Collect the user_id of every valid animelist whose number of titles differs from its
# number of scores; those records need their scores filled in with 0s.
user_ids_lens_not_matching = [
    scraped['user_id']
    for scraped in valid_animelists
    if len(scraped['animelist_titles']) != len(scraped['animelist_scores'])
]

In [28]:
# Number of animelists whose title count and score count differ
len(user_ids_lens_not_matching)

348

In [33]:
# Peek at the first valid animelist record to see the structure of one entry
valid_animelists[0]

{'user_id': 'cindia',
 'animelist_url': 'https://myanimelist.net/animelist/cindia',
 'animelist_titles': ['Koisuru Tenshi Angelique: Kokoro no Mezameru Toki',
  'Kiniro no Corda: Primo Passo'],
 'animelist_scores': ['-', '9']}

In [38]:
# Pick one of the mismatched user_ids (index 20) to inspect by hand
user_ids_lens_not_matching[20]

'Gecata_'

In [39]:
# Look up the full record of one mismatched user. The lookup is the cell's last expression
# so that Jupyter displays it: a bare expression nested inside a `for` loop is not shown
# under IPython's default `last_expr` interactivity setting. Evaluates to None if no record
# has this user_id.
next(
    (animelist for animelist in valid_animelists if animelist['user_id'] == 'Gecata_'),
    None,
)

{'user_id': 'Gecata_',
 'animelist_url': 'https://myanimelist.net/animelist/Gecata_',
 'animelist_titles': ['Berserk',
  'Devilman: Crybaby',
  'Naruto: Shippuuden',
  'One Piece',
  'Accel World',
  'Accel World EX',
  'Afro Samurai',
  'Afro Samurai: Resurrection',
  'Air Gear',
  'Air Gear Special',
  'Air Gear: Kuro no Hane to Nemuri no Mori - Break on the Sky',
  'Akira',
  'Angel Beats!',
  'Angel Beats!: Another Epilogue',
  'Aoi Bungaku Series',
  'Appleseed',
  'Appleseed (Movie)',
  'Appleseed Saga Ex Machina',
  'Arakawa Under the Bridge',
  'Arakawa Under the Bridge x Bridge',
  'Bakemonogatari',
  'Bakuman.',
  'Bakuman. 2nd Season',
  'Bakuman. 3rd Season',
  'Basilisk: Kouga Ninpou Chou',
  'Bastard!!: Ankoku no Hakaishin',
  'Batman: Gotham Knight',
  'Berserk: Ougon Jidai-hen I - Haou no Tamago',
  'Berserk: Ougon Jidai-hen II - Doldrey Kouryaku',
  'Berserk: Ougon Jidai-hen III - Kourin',
  'Biohazard: Degeneration',
  'Black Lagoon',
  "Black Lagoon: Roberta's Blood 

In [34]:
# For any animelist where the titles were scraped but the scores were not (so the two lists
# differ in length), replace the scores with one 0 per title.
# The mismatch is checked on each record directly instead of looking its user_id up in
# user_ids_lens_not_matching: that avoids a linear list scan per record, and it cannot wipe
# the scores of a correctly scraped list that merely shares a user_id with a broken one.
# The fill uses plain Python ints (not numpy integers), matching the literal 0 used for
# missing scores elsewhere in this notebook.
for animelist in valid_animelists:
    title_count = len(animelist['animelist_titles'])
    if len(animelist['animelist_scores']) != title_count:
        animelist['animelist_scores'] = [0] * title_count

In [54]:
# Sanity check: count the animelists whose titles and scores still differ in length
sum(1 for animelist in valid_animelists
    if len(animelist['animelist_titles']) != len(animelist['animelist_scores']))

0

In [19]:
# Scratch check of what list(np.zeros(n, dtype=int)) produces: a list of n integer zeros
list(np.zeros(10, dtype=int))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [41]:
# Scratch example: pair titles with scores by position to build a title -> score dict
list1 = ['Pokemon']
list2 = ['5']
dict1 = {title: score for title, score in zip(list1, list2)}
dict1

{'Pokemon': '5'}

In [52]:
# Scratch check (intentionally raises): indexing a dict with a missing key raises KeyError
# rather than returning a falsy value, so key presence has to be tested with `in` or `.get`.
if dict1['charmander']:
    print(True)

KeyError: 'charmander'

# Preparing user-review matrix

In [50]:
# Collect every distinct anime title that appears in any of the valid animelists
all_unique_anime_titles = set()
for animelist in valid_animelists:
    all_unique_anime_titles.update(animelist['animelist_titles'])

In [57]:
def _normalize_score(raw_score):
    """Convert one scraped score into an integer where possible.

    The scraped scores are strings such as '9', with '-' standing in for a missing score.
    A dash becomes 0 and a string of decimal digits becomes the corresponding int, so the
    user-score records do not end up mixing '-' / '9' strings with integer 0s.
    Any other value (including scores that are already numeric) is returned unchanged.
    """
    if isinstance(raw_score, str):
        stripped_score = raw_score.strip()
        if stripped_score == '-':
            return 0
        if stripped_score.isdecimal():
            return int(stripped_score)
    return raw_score


# Create augmented valid_animelists where each animelist dict also gets one key per anime
# title, mapped to the user's (normalized) score for that title. Work on a deep copy so that
# valid_animelists itself is left untouched.
# NOTE(review): a title equal to one of the existing keys ('user_id', 'animelist_url',
# 'animelist_titles', 'animelist_scores') would overwrite that key -- presumably no such title exists.
augmented_valid_animelists = deepcopy(valid_animelists)
for animelist in augmented_valid_animelists:
    for anime_title, anime_score in zip(animelist['animelist_titles'], animelist['animelist_scores']):
        animelist[anime_title] = _normalize_score(anime_score)

In [64]:
# Build one flat record per user: the user_id, the animelist_url, and one entry for every
# title in all_unique_anime_titles. A title that is not a key of the user's augmented
# animelist gets 0, which means the user did not have the anime in the animelist or the
# user did not score the anime.
user_score_dicts = []
for user_animelist in tqdm(augmented_valid_animelists):
    user_row = {
        'user_id': user_animelist['user_id'],
        'animelist_url': user_animelist['animelist_url'],
    }
    # dict.get returns the user's stored score when the title is present, otherwise 0
    user_row.update(
        (title, user_animelist.get(title, 0)) for title in all_unique_anime_titles
    )
    user_score_dicts.append(user_row)

100%|██████████| 32987/32987 [01:36<00:00, 342.43it/s]


In [84]:
# Build one flat record per user restricted to the titles in top_1000_anime_titles: the
# user_id, the animelist_url, and one entry per listed title. A title that is not a key of
# the user's augmented animelist gets 0, which means the user did not have the anime in the
# animelist or the user did not score the anime.
user_score_dicts_top_1000_anime = []
for user_animelist in tqdm(augmented_valid_animelists):
    user_row = {
        'user_id': user_animelist['user_id'],
        'animelist_url': user_animelist['animelist_url'],
    }
    # dict.get returns the user's stored score when the title is present, otherwise 0
    user_row.update(
        (title, user_animelist.get(title, 0)) for title in top_1000_anime_titles
    )
    user_score_dicts_top_1000_anime.append(user_row)

100%|██████████| 32987/32987 [00:07<00:00, 4629.12it/s]


In [89]:
# Inspect the top-1,000 score record of a single user (index 23)
user_score_dicts_top_1000_anime[23]

{'user_id': 'Arbalest270',
 'animelist_url': 'https://myanimelist.net/animelist/Arbalest270',
 'Fullmetal Alchemist: Brotherhood': 0,
 'Steins;Gate': 0,
 'Hunter x Hunter (2011)': 0,
 'Ginga Eiyuu Densetsu': 0,
 'Gintama°': 0,
 "Gintama'": 0,
 'Shingeki no Kyojin Season 3 Part 2': 0,
 "Gintama': Enchousen": 0,
 '3-gatsu no Lion 2nd Season': 0,
 'Kimi no Na wa.': 0,
 'Koe no Katachi': 0,
 'Gintama.': 0,
 'Clannad: After Story': 0,
 'Gintama': 0,
 'Gintama Movie 2: Kanketsu-hen - Yorozuya yo Eien Nare': 0,
 'Owarimonogatari 2nd Season': 0,
 'Code Geass: Hangyaku no Lelouch R2': 0,
 'Haikyuu!!: Karasuno Koukou vs. Shiratorizawa Gakuen Koukou': 0,
 'Mob Psycho 100 II': 0,
 'Sen to Chihiro no Kamikakushi': 0,
 'Gintama.: Shirogane no Tamashii-hen - Kouhan-sen': 0,
 'Kaguya-sama wa Kokurasetai?: Tensai-tachi no Renai Zunousen': 0,
 'Kizumonogatari III: Reiketsu-hen': 0,
 'Shouwa Genroku Rakugo Shinjuu: Sukeroku Futatabi-hen': 0,
 'Shigatsu wa Kimi no Uso': 0,
 'Cowboy Bebop': 0,
 'Gintama.: 

In [90]:
# Confirm that this title is present as a key in a user's top-1,000 score record
'Hunter x Hunter (2011)' in user_score_dicts_top_1000_anime[1]

True