# Recommender system

Carlos Pinto Pérez
## Scraping

In [None]:
# Install jikanpy
!pip install --user git+git://github.com/AWConant/jikanpy.git

In [2]:
from jikanpy import Jikan
import pandas as pd
import numpy as np
import time
import datetime as dt

j = Jikan()

Scrap a manga list using rankings

In [2]:
top_manga_ids, top_manga_names = [], []
start_time, pag_100_time = time.time(), time.time()
page = 1
size = 20000
while len(top_manga_ids) <= size:
    current_page = j.top(type='manga', page=page)['top']
    for manga in current_page:
        try:
            top_manga_ids.append(manga['mal_id'])
            top_manga_names.append(manga['title'])
        except:
            None
    page = page+1
    if page % 100 == 0:
        prev_pag_100_time = pag_100_time
        pag_100_time = time.time()
        pag_100_time_format = str(dt.timedelta(seconds=int(pag_100_time-prev_pag_100_time)))
        total_time_format = str(dt.timedelta(seconds=int(pag_100_time-start_time)))
        print(f'Reached page {page}, manga count: {len(top_manga_ids)}.\tTime: {pag_100_time_format}. Total time: {total_time_format}')
end_time = time.time()
end_time_format = str(dt.timedelta(seconds=int(end_time-start_time)))
print(f'Total time: {end_time_format}')

df_mangas = pd.DataFrame(data={'manga_id': top_manga_ids, 'manga_name': top_manga_names}).iloc[0:size,:]
df_mangas['manga_rank'] = df_mangas.index +1
df_mangas.to_csv('mangas.csv', index=False)
print(f'Ended at {time.asctime(time.localtime(end_time))}.')
df_mangas

Reached page 100, manga count: 4950.	Time: 0:01:14. Total time: 0:01:14
Reached page 200, manga count: 9950.	Time: 0:01:15. Total time: 0:02:29
Reached page 300, manga count: 14950.	Time: 0:01:21. Total time: 0:03:51
Reached page 400, manga count: 19950.	Time: 0:01:14. Total time: 0:05:06
Total time: 0:05:08
Ended at Wed Apr 10 18:44:19 2019.


Unnamed: 0,manga_id,manga_name,manga_rank
0,2,Berserk,1
1,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,2
2,25,Fullmetal Alchemist,3
3,13,One Piece,4
4,1,Monster,5
5,4632,Oyasumi Punpun,6
6,70345,Grand Blue,7
7,656,Vagabond,8
8,16765,Kingdom,9
9,51,Slam Dunk,10


In [3]:
df_mangas = pd.read_csv('mangas.csv')
df_mangas.head()

Unnamed: 0,manga_id,manga_name,manga_rank
0,2,Berserk,1
1,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,2
2,25,Fullmetal Alchemist,3
3,13,One Piece,4
4,1,Monster,5


Scrap users and review scores of top 20.000 mangas. This is an intermediate step: review scores are valuable but there are a little amount of them. By doing this step I collect information of prolific users.

In [3]:
users_review_scores = []
start_time, each_100_mangas_time = time.time(), time.time()
review_count, keystone_gap = 0, 500
keystone = keystone_gap
for manga_id in df_mangas['manga_id']:
    current_page = 1
    try:
        current_reviews = j.manga(manga_id, extension='reviews', page=1)['reviews']
    except:
        print(f'Problem at reviews of manga {manga_id}.')
        current_reviews = []
    while len(current_reviews) > 0:
        for review in current_reviews:
            review_count = review_count+1
            user = review['reviewer']['username']
            score = review['reviewer']['scores']['overall']
            weight = review['helpful_count']
            data = {'user': user, 'manga_id': manga_id, 'score': score, 'weight': weight}
            users_review_scores.append(data)
        current_page = current_page+1
        try:
            current_reviews = j.manga(manga_id, extension='reviews', page=current_page)['reviews']
        except:
            print(f'Problem at reviews of manga {manga_id}, page {current_page}.')
            current_reviews = []
    if review_count > keystone:
        ant_each_100_mangas_time = each_100_mangas_time
        each_100_mangas_time = time.time()
        each_100_mangas_time_format = str(dt.timedelta(seconds=int(each_100_mangas_time-ant_each_100_mangas_time)))
        total_time_format = str(dt.timedelta(seconds=int(each_100_mangas_time-start_time)))
        print(f'Reached {keystone} reviews.\tTime: {each_100_mangas_time_format}. Total time: {total_time_format}')
        keystone = keystone + keystone_gap
end_time = time.time()
end_time_format = str(dt.timedelta(seconds=int(end_time-start_time)))
print(f'Total time: {end_time_format}')

df_reviews_scores = pd.DataFrame(data=users_review_scores)
df_reviews_scores.to_csv('reviews_scores.csv', index=False)
print(f'Ended at {time.asctime(time.localtime(end_time))}.')
df_reviews_scores

Reached 500 reviews.	Time: 0:01:26. Total time: 0:01:26
Reached 1000 reviews.	Time: 0:01:24. Total time: 0:02:50
Reached 1500 reviews.	Time: 0:01:50. Total time: 0:04:41
Reached 2000 reviews.	Time: 0:01:40. Total time: 0:06:21
Reached 2500 reviews.	Time: 0:01:23. Total time: 0:07:44
Reached 3000 reviews.	Time: 0:01:49. Total time: 0:09:34
Reached 3500 reviews.	Time: 0:02:24. Total time: 0:11:59
Reached 4000 reviews.	Time: 0:02:11. Total time: 0:14:10
Reached 4500 reviews.	Time: 0:02:56. Total time: 0:17:06
Reached 5000 reviews.	Time: 0:03:26. Total time: 0:20:33
Reached 5500 reviews.	Time: 0:03:01. Total time: 0:23:34
Reached 6000 reviews.	Time: 0:03:37. Total time: 0:27:11
Reached 6500 reviews.	Time: 0:03:48. Total time: 0:31:00
Reached 7000 reviews.	Time: 0:03:31. Total time: 0:34:32
Reached 7500 reviews.	Time: 0:02:54. Total time: 0:37:26
Reached 8000 reviews.	Time: 0:03:53. Total time: 0:41:19
Reached 8500 reviews.	Time: 0:04:36. Total time: 0:45:56
Reached 9000 reviews.	Time: 0:04

Unnamed: 0,manga_id,score,user,weight
0,2,10,TheCriticsClub,1315
1,2,7,Polyphemus,771
2,2,10,qrdel,704
3,2,10,Cobbles,322
4,2,10,Aja,127
5,2,6,Tumerking,115
6,2,7,aindah,48
7,2,9,infinity,43
8,2,3,hardcase,29
9,2,10,IronBerserk,40


In [4]:
df_reviews_scores = pd.read_csv('reviews_scores.csv')
df_reviews_scores.head()

Unnamed: 0,manga_id,score,user,weight
0,2,10,TheCriticsClub,1315
1,2,7,Polyphemus,771
2,2,10,qrdel,704
3,2,10,Cobbles,322
4,2,10,Aja,127


Scrap the manga scores of the last users collected. Manga scores are the key of the recommender system.

Attention: It seems that the default score of a non-rated manga is 0. So I only collect scores if they are greater than 0.

In [5]:
users_checked, scores, bad_users = [], [], []
start_time, each_keystone_time = time.time(), time.time()
scores_count, keystone_gap = 0, 20000
keystone = keystone_gap
for user in df_reviews_scores['user']:
    if user not in users_checked and user not in bad_users:
        current_page = 1
        try:
            current_mangas = j.user(username=user, request='mangalist', argument='all', page=current_page)['manga']
        except:
            bad_users.append(user)
            current_mangas = []
        if user not in bad_users:
            users_checked.append(user)
        while len(current_mangas) > 0:
            for manga in current_mangas:
                manga_id = manga['mal_id']
                score = manga['score']
                if manga_id in df_mangas['manga_id'] and score > 0:
                    scores_count = scores_count+1
                    data = {'user': user, 'manga_id': manga_id, 'score': score}
                    scores.append(data)
            current_page = current_page+1
            try:
                current_mangas = j.user(username=user, request='mangalist', argument='all', page=current_page)['manga']
            except:
                print(f'Problem viewing mangas of {user}, page {current_page}.')
                print('User actually saved in unique_users.csv because a previous page has given information.')
                current_mangas = []
        if scores_count > keystone:
            ant_each_keystone_time = each_keystone_time
            each_keystone_time = time.time()
            each_keystone_time_format = str(dt.timedelta(seconds=int(each_keystone_time-ant_each_keystone_time)))
            total_time_format = str(dt.timedelta(seconds=int(each_keystone_time-start_time)))
            print(f'Reached {keystone//1000}k scores through {len(users_checked)} users.' +
                  f'\tTime: {each_keystone_time_format}. Total time: {total_time_format}')
            keystone = keystone + keystone_gap
end_time = time.time()
end_time_format = str(dt.timedelta(seconds=int(end_time-start_time)))
print(f'Total time: {end_time_format}')

df_scores = pd.DataFrame(data=scores)
df_scores.to_csv('scores.csv', index=False)

df_validated_users = pd.DataFrame(data=users_checked)
df_validated_users.to_csv('good_users.csv', index=False)

df_bad_users = pd.DataFrame(data=bad_users)
df_bad_users.to_csv('bad_users.csv', index=False)

print(f'Ended at {time.asctime(time.localtime(end_time))}.')
df_scores

Reached 20k scores through 470 users.	Time: 0:19:24. Total time: 0:19:24
Reached 40k scores through 893 users.	Time: 0:20:57. Total time: 0:40:21
Reached 60k scores through 1211 users.	Time: 0:14:47. Total time: 0:55:08
Reached 80k scores through 1623 users.	Time: 0:16:58. Total time: 1:12:07
Reached 100k scores through 2057 users.	Time: 0:18:18. Total time: 1:30:25
Reached 120k scores through 2316 users.	Time: 0:12:38. Total time: 1:43:03
Reached 140k scores through 2601 users.	Time: 0:13:47. Total time: 1:56:51
Reached 160k scores through 2996 users.	Time: 0:16:40. Total time: 2:13:31
Reached 180k scores through 3353 users.	Time: 0:15:12. Total time: 2:28:44
Reached 200k scores through 3678 users.	Time: 0:15:00. Total time: 2:43:44
Reached 220k scores through 4029 users.	Time: 0:14:30. Total time: 2:58:15
Reached 240k scores through 4289 users.	Time: 0:13:19. Total time: 3:11:35
Reached 260k scores through 4683 users.	Time: 0:16:50. Total time: 3:28:26
Reached 280k scores through 504

Unnamed: 0,manga_id,score,user
0,2,10,TheCriticsClub
1,4,10,TheCriticsClub
2,1067,4,Polyphemus
3,13,8,Polyphemus
4,682,2,Polyphemus
5,38,5,Polyphemus
6,17645,3,Polyphemus
7,1402,3,Polyphemus
8,5280,2,Polyphemus
9,11133,3,Polyphemus


---