In [7]:
from lxml import html
from json import dump,loads
from requests import get
import json
from re import sub
from dateutil import parser as dateparser
from time import sleep
import pandas as pd
import numpy as np

In [8]:
def explode(df, lst_cols, fill_value=''):
    """Unnest list-valued columns so that every list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing one or more columns whose cells hold lists.
    lst_cols : str or list of str
        Name(s) of the list-valued column(s). A single name is wrapped in a
        list. Repeat counts are taken from the first listed column.
    fill_value : scalar, default ''
        Replacement for missing values in the result when at least one list
        is empty. It is applied with ``fillna`` to the whole result, not just
        to the list columns.

    Returns
    -------
    pandas.DataFrame
        Expanded frame with the same column order as ``df``. Rows whose list
        was empty are placed after the expanded rows and keep their original
        index label.
    """
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    # Repeat every scalar cell once per list element and flatten the lists.
    expanded = pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols
    }).assign(**{col: np.concatenate(df[col].values) for col in lst_cols})

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return expanded.loc[:, df.columns]

    # At least one list is empty: np.repeat dropped those rows, so add them
    # back. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    leftovers = df.loc[lens == 0, idx_cols]
    return (pd.concat([expanded, leftovers])
              .fillna(fill_value)
              .loc[:, df.columns])

In [9]:
def get_home_info(url):
    """Download a show's landing page and collect its metadata.

    Parameters
    ----------
    url : str
        Site-relative show path (e.g. ``"/07-ghost"``); it is appended to
        ``https://www.crunchyroll.com``.

    Returns
    -------
    dict
        On success, a dict with the keys ``"similar"``, ``"desc"``,
        ``"rating"``, ``"tags"`` and ``"num_eps"``. If the server answers
        404, ``{"url": <requested url>, "error": "page not found"}``.
    """
    new_url = "https://www.crunchyroll.com" + url
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) '
               'Chrome/9.0.597.0 Safari/534.13'}
    # NOTE(review): verify=False turns off TLS certificate verification.
    response = get(new_url, headers=headers, verify=False, timeout=1000)
    if response.status_code == 404:
        # Report the URL that was actually requested (the previous code
        # referenced an undefined name here and raised NameError).
        return {"url": new_url, "error": "page not found"}

    # NUL characters are stripped from the markup before it is parsed.
    cleaned_response = response.text.replace('\x00', '')
    home = html.fromstring(cleaned_response)

    return {
        "similar": get_similar_anime(home),
        "desc": get_show_desc(home),
        "rating": get_ratings(home),
        "tags": get_tags(home),
        "num_eps": get_number_of_eps(home),
    }

In [10]:
def get_parser(url):
    """Fetch ``url`` and parse the response body into an lxml HTML tree.

    Parameters
    ----------
    url : str
        Absolute URL to download.

    Returns
    -------
    lxml element or dict
        The parsed document root. If the server answers 404, the dict
        ``{"url": url, "error": "page not found"}`` is returned instead, so
        callers must check the result before calling ``.xpath`` on it.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) '
               'Chrome/9.0.597.0 Safari/534.13'}
    # NOTE(review): verify=False turns off TLS certificate verification.
    response = get(url, headers=headers, verify=False, timeout=1000)
    if response.status_code == 404:
        # The previous code referenced an undefined name here (NameError).
        return {"url": url, "error": "page not found"}

    # NUL characters are stripped from the markup before it is parsed.
    cleaned_response = response.text.replace('\x00', '')
    return html.fromstring(cleaned_response)

def get_similar_anime(home):
    """Return the ``alt`` text of each portrait image in the "other series" box.

    ``home`` is the parsed show page. The first "other-series" div and the
    first portrait grid inside it are used; an IndexError is raised if either
    is missing, or if a portrait link lacks its image holder or image.
    """
    other_series = home.xpath('.//div[@class="other-series large-margin-top"]')[0]
    portrait_grid = other_series.xpath(".//ul[@class='portrait-grid cf']")[0]

    titles = []
    for link in portrait_grid.xpath(".//a[@token='shows-portraits']"):
        holder = link.xpath('.//span[@class="img-holder"]')[0]
        image = holder.xpath(".//img")[0]
        titles.append(image.attrib["alt"])
    return titles

def get_show_desc(home):
    """Return the show description text from the sidebar of a show page.

    The description is looked up in the second, then the third
    ``large-margin-bottom`` sidebar item. Within each item the ``more`` span
    is preferred over the ``trunc-desc`` span. The first text node of the
    first match is returned.

    Raises
    ------
    IndexError
        If the sidebar is missing or none of the candidate locations holds
        any text.
    """
    sidebar = home.xpath('.//ul[@id="sidebar_elements"]')[0]
    items = sidebar.xpath(".//li[@class='large-margin-bottom']")

    # Candidates are tried in priority order. Only a missing node or text
    # (IndexError) moves on to the next one; anything else, including
    # KeyboardInterrupt, now propagates instead of being swallowed.
    for position in (1, 2):
        for span_class in ("more", "trunc-desc"):
            try:
                return items[position].xpath(
                    ".//span[@class='{0}']//text()".format(span_class))[0]
            except IndexError:
                continue
    raise IndexError("no show description found in sidebar")

def get_ratings(home):
    """Collect the aggregate rating and the per-star counts of a show.

    The data is read from the second-to-last ``large-margin-bottom`` item of
    the sidebar.

    Returns
    -------
    dict
        ``"agg_review"`` maps to the ``content`` attribute of the rating
        widget. The integer keys ``1`` to ``5`` map to the text found in the
        matching star row, or to the integer ``0`` when that row or its text
        is absent.

    Raises
    ------
    IndexError
        If the sidebar, the rating item or the rating widget is missing.
    """
    sidebar = home.xpath('.//ul[@id="sidebar_elements"]')[0]
    # Looked up once; every query below is relative to this item.
    rating_item = sidebar.xpath(".//li[@class='large-margin-bottom']")[-2]

    ratings = {}
    ratings["agg_review"] = (
        rating_item.xpath(".//span[@id='showview_about_rate_widget']")[0].attrib["content"]
    )

    for stars in range(1, 6):
        lookup = ".//li[@class ='{0}-star cf']".format(stars)
        try:
            count = (rating_item.xpath(lookup)[0].
                     xpath(".//div[@class='left']//text()")[0])
        except IndexError:
            # Star row or its text is absent. Other errors are no longer
            # swallowed.
            count = 0
        ratings[stars] = count
    return ratings

def get_tags(home):
    """Return the text of every ``text-link`` anchor in the last sidebar item.

    ``home`` is the parsed show page. The last ``large-margin-bottom`` item of
    the first ``sidebar_elements`` list is the one that is read.
    """
    sidebar = home.xpath('.//ul[@id="sidebar_elements"]')[0]
    tag_item = sidebar.xpath(".//li[@class='large-margin-bottom']")[-1]

    tags = []
    for link in tag_item.xpath(".//a[@class='text-link']"):
        tags.append(link.text_content())
    return tags

def get_number_of_eps(home):
    """Count the ``hover-bubble group-item`` entries of the first season list.

    Presumably each such entry is one episode (verify against the page
    markup). Raises IndexError when the page has no ``list-of-seasons cf``
    element.
    """
    season_list = home.xpath('.//ul[@class="list-of-seasons cf"]')[0]
    episode_items = season_list.xpath(".//li[@class='hover-bubble group-item']")
    return len(episode_items)

def get_review_info(reviews_parser):
    """Extract every review found on one parsed reviews page.

    Parameters
    ----------
    reviews_parser : lxml element
        Parsed reviews page.

    Returns
    -------
    list of dict
        One dict per review with the keys ``"useful"`` (list of text nodes),
        ``"summary"``, ``"review"``, ``"rating"`` and ``"datetime"``. An empty
        list is returned when the page carries the "no reviews" marker or
        holds no review items (previously ``None`` was returned for the
        marker case, which broke ``len()`` in callers).

    Raises
    ------
    IndexError
        If a review item has no ``dtreviewed`` element.
    """
    # NOTE(review): this XPath has no leading ".//", so it only matches direct
    # children of the node it is evaluated on - confirm that is intended.
    if reviews_parser.xpath("div[@class='no_reviews_notification']"):
        return []

    reviews = []
    reviews_holder = reviews_parser.xpath(".//li[@class='white-wrapper container-shadow medium-margin-bottom']")

    for review in reviews_holder:
        review_info = {}
        ## Found Useful
        review_info["useful"] = review.xpath(".//div[@class='helpful-count small-data xsmall-margin-bottom']//text()")

        ## Summary
        summary = review.xpath(".//div[@itemprop='summary']//text()")
        review_info["summary"] = summary[0] if summary else "No Summary"

        ## Review: prefer the full text and fall back to the truncated one.
        ## The old code queried an undefined name here, so the resulting
        ## NameError was swallowed and the full text was never used.
        body = (review.xpath(".//div[@class='more']//text()")
                or review.xpath(".//div[@class='trunc']//text()"))
        review_info["review"] = " ".join(body)

        ## Rating
        try:
            review_info["rating"] = review.xpath(".//span[@class='rating-widget-static-large']")[0].attrib["content"]
        except (IndexError, KeyError):
            review_info["rating"] = "No Rating"

        ## datetime
        review_info["datetime"] = review.xpath(".//span[@itemprop='dtreviewed']")[0].text_content()

        reviews.append(review_info)
    return reviews

def get_reviews(url):
    """Collect the reviews of a show, walking the paginated "helpful" listing.

    Pages are requested in order starting at page 1. The walk stops at the
    first page that yields no reviews.

    Parameters
    ----------
    url : str
        Site-relative show path (e.g. ``"/07-ghost"``).

    Returns
    -------
    list of list of dict
        One inner list per page, each holding that page's review dicts as
        returned by ``get_review_info``.
    """
    reviews = []
    page = 1
    while True:
        reviews_parser = get_parser("https://www.crunchyroll.com{0}/reviews/helpful/page{1}".
                                    format(url, page))
        # get_parser reports a missing page as an error dict, not a tree.
        if isinstance(reviews_parser, dict):
            break
        val = get_review_info(reviews_parser)
        # An empty list and None both mean "nothing left"; the old
        # `len(val)` check raised TypeError on None.
        if not val:
            break
        reviews.append(val)
        page += 1
    return reviews

In [11]:
def get_crunchy_info():
    """Scrape reviews and landing-page metadata for every listed show.

    The alphabetical catalogue page is downloaded, the ``href`` of every
    ``shows-portraits`` link on it is collected, and each show is scraped in
    turn. Each show path is printed as it is processed.

    Returns
    -------
    list of dict or dict
        One dict per show with the keys ``"name"`` (the show path),
        ``"reviews"`` and ``"home_info"``. If the catalogue page answers 404,
        the dict ``{"url": <catalogue url>, "error": "page not found"}`` is
        returned instead.
    """
    crunchy_url = "https://www.crunchyroll.com/videos/anime/alpha?group=all"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) '
               'Chrome/9.0.597.0 Safari/534.13'}
    # NOTE(review): verify=False turns off TLS certificate verification.
    response = get(crunchy_url, headers=headers, verify=False, timeout=1000)

    if response.status_code == 404:
        # The previous code referenced an undefined name here (NameError).
        return {"url": crunchy_url, "error": "page not found"}

    # NUL characters are stripped from the markup before it is parsed.
    cleaned_response = response.text.replace('\x00', '')
    parser = html.fromstring(cleaned_response)

    urls = [i.attrib["href"] for i in parser.xpath('.//a[@token="shows-portraits"]')]
    shows = []
    for url in urls:
        show_info = {}
        show_info["name"] = url
        show_info["reviews"] = get_reviews(url)
        print(url)
        show_info["home_info"] = get_home_info(url)
        shows.append(show_info)
    return shows

In [12]:
import warnings

def fxn():
    """Emit a single DeprecationWarning whose message is "deprecated"."""
    warnings.warn(message="deprecated", category=DeprecationWarning)

# Run the full scrape with every warning suppressed for the duration of the
# `with` block; the previous warning filters are restored on exit.
# NOTE(review): the blanket "ignore" is presumably here to hide the warnings
# that requests emits for verify=False calls - confirm, and consider a
# narrower filter.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Emits a DeprecationWarning, which the filter above discards.
    fxn()
    # Long-running: downloads the catalogue plus the review and landing pages
    # of every show. The result is kept in `crunchyroll` for later cells.
    crunchyroll = get_crunchy_info()

/07-ghost
/100-teacher-pascal
/11eyes
/5-centimeters-per-second
/91-days
/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi
/a-centaurs-life
/a-certain-magical-index
/a-dark-rabbit-has-seven-lives
/a-lull-in-the-sea-nagi-asu-nagi-no-asukara
/a-place-further-than-the-universe
/a-sisters-all-you-need
/a-talk-on-titan
/a-town-where-you-live
/a-wind-named-amnesia
/aachi-ssipak
/acca-13-territory-inspection-dept
/ace-attorney
/ace-of-the-diamond
/action-heroine-cheer-fruits
/active-raid
/actually-i-am
/ah-my-buddha
/aho-girl
/ai-mai-mi
/ai-mai-mi-mousou-catastrophe
/ai-mai-mi-surgical-friends
/air-master
/aiura
/akagi
/akame-ga-kill
/akashic-records-of-bastard-magic-instructor
/akb0048
/alderamin-on-the-sky
/aldnoahzero
/alice-zoroku
/all-out
/amagi-brilliant-park
/amanchu
/amnesia
/ange-vierge
/angels-3piece
/angels-of-death
/angolmois-record-of-mongol-invasion
/anima-yell
/anime-academy
/anime-crimes-division
/anime-de-training-ex
/anime-recap
/anime-gataris
/anisava
/anisong-station


/jin-roh
/jingai-san-no-yome
/jk-meshi
/jojos-bizarre-adventure
/joker
/joker-game
/jubei-chan-the-ninja-girl
/jungle-emperor-leo
/juni-taisenzodiac-war
/junji-ito-collection
/junjo-romantica
/kaasan-moms-life
/kado-the-right-answer
/kaede-new-town
/kagewani
/kaguya-sama-love-is-war
/kaiji
/kaiju-girls
/kakuriyo-bed-breakfast-for-spirits-
/kamigami-no-asobi
/kamisama-dolls
/kanamemo
/kancolle
/kanojo-ga-flag-wo-oraretara-if-her-flag-breaks
/karakai-jozu-no-takagi-san
/katana-maidens-mini-toji
/katana-maidens-toji-no-miko
/katsugeki-touken-ranbu
/keijo
/kemono-friends
/kenka-bancho-otome-girl-beats-boys-
/key-the-metal-idol
/kids-on-the-slope
/kiitaros-yokai-picture-diary
/kill-la-kill
/kimi-ni-todoke-from-me-to-you
/kings-game
/kinmoza
/kinos-journey-the-beautiful-world-the-animated-series
/kiss-him-not-me
/kite-liberator
/kiznaiver
/knights-magic
/knuspercast
/koi-koi-seven
/koihime-musou
/kokoro-connect
/komori-san-cant-decline
/kono-aozora-ni-yakusoku-wo
/konohana-kitan
/konosuba-go

/star-driver
/starlight-promises
/starmyu-season-2
/steinsgate-0
/stella-womens-academy-high-school-division-class-c3
/straight-title-robot-anime
/strange
/street-fighter-ii-the-animated-movie
/street-fighter-ii-the-animated-series
/strike-the-blood
/sunday-without-god
/super-gals
/super-robot-wars-original-generation
/sweet-blue-flowers
/sweetness-lightning
/sword-art-online
/sword-art-online-alternative-gun-gale-online
/symphogear
/tabimachi-lateshow
/taboo-tattoo
/taisho-mebiusline-chicchaisan
/takamiya-nasuno-desu
/tamayura-hitotose
/tanaka-kun-is-always-listless
/tantei-team-kz-jiken-note
/tari-tari
/teekyu
/tegami-bachi-letter-bee
/tenjho-tenge
/terraformars
/tesagure-bukatsumono
/that-time-i-got-reincarnated-as-a-slime
/the-ambition-of-oda-nobuna
/the-ancient-magus-bride
/the-asterisk-war
/the-comic-artist-and-his-assistants
/the-crunchyroll-anime-awards
/the-diary-of-ochibi-motion-manga
/the-dragon-dentist
/the-eccentric-family
/the-eccentric-family-2
/the-file-of-young-kindaic

In [13]:
import pickle

# Persist the scrape result. The context manager closes the file handle,
# which the bare open() call left to the garbage collector.
with open("crunchy.pkl", "wb") as pickle_file:
    pickle.dump(crunchyroll, pickle_file)