In [1]:
import pickle
import numpy as np
import pandas as pd
import json
import sqlalchemy as db
from sqlalchemy import create_engine
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from io import StringIO
from urllib.parse import quote
from concurrent.futures import ThreadPoolExecutor

In [2]:
with open('../tools/credentials.json') as file:
    credentials = json.load(file)
    
username = credentials["dblogin"]["username"]
password = credentials["dblogin"]["password"]

In [3]:
db_string = f"postgresql://{username}:{password}@localhost:5432/animeplanet"
db = create_engine(db_string)

In [4]:
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [5]:
with db.connect() as con:
    urls = pd.read_sql('SELECT url FROM web_scrape;', con)['url']

### Extracting usernames

In [6]:
usernames = urls.str.extract(r'/users/(?P<username>[^/]*)')['username']

In [7]:
usernames = usernames.loc[usernames.notnull()].unique()

In [8]:
usernames

array(['Minahirion', 'Abii', 'MUn10n', ..., 'ShadowStormtrooper', 'Turba',
       'kittykawa'], dtype=object)

### Scrape User Anime Lists

In [9]:
def scrapeTable(url, attempt=1):
    if attempt == 4:
        return pd.DataFrame(columns=['title', 'type', 'year', 'avg', 'status', 
                                     'eps', 'times_watched', 'rating'])
    
    resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
    if resp.text != '':
        try:
            soup = BeautifulSoup(resp.text, 'html.parser')
            table = soup.find('table')
            chunk = pd.read_html(StringIO(str(table)))[0]
            chunk.columns = ['title', 'type', 'year', 'avg', 'status', 'eps', 'times_watched', 'rating']
            chunk['url'] = [np.where(tag.has_attr('href'), 
                               'https://www.anime-planet.com' + tag.get('href'), 
                               'no link') for tag in [td.find('a') for td in table.find_all('td', attrs={'class':'tableTitle'})]]
            chunk['url'] = chunk['url'].astype('string')
            return chunk
        except:
            return scrapeTable(url, attempt+1)
    else:
        return scrapeTable(url, attempt+1)

In [10]:
def scrapeUserWatched(username, attempt=1):
    if attempt == 4:
        return pd.DataFrame(columns=['title', 'type', 'year', 'avg', 'status', 
                                     'eps', 'times_watched', 'rating', 'username'])
    
    url = f'https://www.anime-planet.com/users/{username}/anime/watched?sort=title&mylist_view=list'
    resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
    if resp.text != '':
        try:
            soup = BeautifulSoup(resp.text, 'html.parser')
            ul = soup.find('ul', attrs={'class':'nav'})
            page_nums = []
            for tag in ul.find_all('a'):
                try:
                    page_nums.append(int(tag.text))
                except:
                    continue
            num_pages = max(page_nums)

            urls = [f'{url}&page={i}' for i in range(1, num_pages+1)]

            with ThreadPoolExecutor(max_workers=10) as executor:
                df_list = list(executor.map(scrapeTable, urls))

            df = pd.concat(df_list)

            df['username'] = username
            
            with db.connect() as con:
                df.to_sql('user', con, index=False, if_exists='append', method='multi')
            
        except:
            scrapeUserWatched(username, attempt+1)
        
    else:
        scrapeUserWatched(username, attempt+1)

In [10]:
for username in tqdm(usernames):
    scrapeUserWatched(username)

  0%|          | 20/204498 [01:53<322:05:40,  5.67s/it]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/alancmathew/miniconda3/envs/anime-recommendation-engine/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-91f7ff8e9b95>", line 2, in <module>
    scrapeUserWatched(username)
  File "<ipython-input-10-a1752916605c>", line 33, in scrapeUserWatched
    scrapeUserWatched(username, attempt+1)
  File "<ipython-input-10-a1752916605c>", line 7, in scrapeUserWatched
    resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
  File "/home/alancmathew/miniconda3/envs/anime-recommendation-engine/lib/python3.9/site-packages/requests/api.py", line 76, in get
    return request('get', url, params=params, **kwargs)
  File "/home/alancmathew/miniconda3/envs/anime-recommendation-engine/lib/python3.9/site-packages/requests/api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  Fi

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/alancmathew/miniconda3/envs/anime-recommendation-engine/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-91f7ff8e9b95>", line 2, in <module>
    scrapeUserWatched(username)
  File "<ipython-input-10-a1752916605c>", line 33, in scrapeUserWatched
    scrapeUserWatched(username, attempt+1)
  File "<ipython-input-10-a1752916605c>", line 7, in scrapeUserWatched
    resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
  File "/home/alancmathew/miniconda3/envs/anime-recommendation-engine/lib/python3.9/site-packages/requests/api.py", line 76, in get
    return request('get', url, params=params, **kwargs)
  File "/home/alancmathew/miniconda3/envs/anime-recommendation-engine/lib/python3.9/site-packages/requests/api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  Fi

TypeError: object of type 'NoneType' has no len()