In [50]:
# %%writefile imdb_spider.py
import requests,re,os,sqlite3,sys,time,json
from bs4 import BeautifulSoup
import pandas as pd
from imdb_config import *
from urllib.parse import urljoin
sys.path.append('/home/ubuntu/pyproject/scrapy_toolv2')
from html_downloader import html_downloader

class imdb_spider():
    """Scraper for IMDb genre listing pages and individual title pages.

    Films found on listing pages are de-duplicated by ``ttid`` and appended
    to a SQLite table. The URLs of listing pages already scraped are kept in
    a comma-separated text file so an interrupted crawl can be resumed.
    """

    def __init__(self,dbpath=FILEPATH_DATABASE, hd=None):
        """Open the database and load the crawl state.

        Args:
            dbpath: Path of the SQLite database file.
            hd: Downloader object exposing ``request_proxy(url)``. When it is
                omitted (or falsy) a new ``html_downloader(world=True)`` is
                created.
        """
        self.conn = sqlite3.connect(dbpath)
        self.cur = self.conn.cursor()
        self.hd = hd if hd else html_downloader(world=True)
        self.filmlist_used_ttid()
        self.used_url_li_tt = self.used_url_gen(FILEPATH_USEDURL_LI_TT)

    def used_url_gen(self, filename):
        """Return the list of URLs recorded in the comma-separated *filename*.

        A missing file means nothing has been scraped yet and yields an empty
        list. Empty fields (such as the one produced by the trailing comma
        that ``used_url_add`` writes) are dropped, so the last element is
        always the most recently recorded URL.
        """
        try:
            with open(filename, 'r') as f:
                raw = f.read()
        except FileNotFoundError:
            # First run: used_url_add will create the file on first success.
            return []
        return [url for url in raw.split(',') if url]

    def used_url_add(self, url, used_list, filename):
        """Record *url* as scraped, both in *filename* and in *used_list*."""
        with open(filename, 'a+') as f:
            f.write('{},'.format(url))
        used_list.append(url)
        print('SUCCESS: scrape {}'.format(url))

    def filmlist_used_ttid(self):
        """Load the ttids already stored in the film table into a set.

        Any failure to read the table (for example, it does not exist yet in
        a fresh database) leaves ``self.used_filmlist`` as an empty set.
        """
        try:
            tempdf = pd.read_sql('select ttid from {}'.format(TABLENAME_FILMLIST), self.conn)
            self.used_filmlist = set(tempdf['ttid'].values.tolist())
        except Exception:
            # Best effort: start with no known films, but do not swallow
            # KeyboardInterrupt / SystemExit the way a bare except would.
            self.used_filmlist = set()

    def parse_li_tt(self, res_content):
        """Extract the films and the next-page link from a listing page.

        Args:
            res_content: HTML of a listing page.

        Returns:
            A tuple ``(list_film_id, np_url)``. ``list_film_id`` is a list of
            dicts with keys ``ttid``, ``name``, ``year``, ``genre`` and
            ``url``. ``np_url`` is the href of the "next page" link, or
            ``False`` when the page has none.

        Items without a title link or without a ``tt<digits>`` id in that
        link are skipped; a missing genre or year becomes an empty string.
        """
        list_film_id = []
        soup = BeautifulSoup(res_content, 'html.parser')
        for item in soup.find_all('div', {'class':'lister-item-content'}):
            item_header = item.find('h3', {'class':'lister-item-header'})
            link = item_header.a if item_header is not None else None
            if link is None or not link.get('href'):
                continue
            film_url = link['href']
            s_film_id = re.search(r'tt\d+', film_url)
            if s_film_id is None:
                continue

            item_genre = item.find('span', {'class':'genre'})
            item_year = item.find('span', {'class':'lister-item-year'})
            # Normalise: strip all whitespace from the genre list and keep
            # only the digits of the year text.
            film_genre = re.sub(r'\s', '', item_genre.get_text()) if item_genre is not None else ''
            film_year = re.sub(r'\D', '', item_year.get_text()) if item_year is not None else ''

            list_film_id.append({
                'ttid': s_film_id.group(),
                'name': link.get_text(),
                'year': film_year,
                'genre': film_genre,
                'url': film_url,
            })

        item_next_page = soup.find('a', {'class':'lister-page-next'})
        np_url = item_next_page['href'] if item_next_page is not None else False
        return list_film_id,np_url

    def save_li_tt(self, list_film_id, to_db=True, table_name=TABLENAME_FILMLIST):
        """Store the films not seen before and return them as a DataFrame.

        Args:
            list_film_id: Film dicts as produced by ``parse_li_tt``.
            to_db: When true, append the new rows to *table_name*.
            table_name: Target table in the SQLite database.

        Returns:
            DataFrame with columns ``ttid``, ``name``, ``year``, ``genre``
            and ``url`` holding only the films whose ttid was not yet in
            ``self.used_filmlist``.
        """
        columns = ['ttid', 'name', 'year', 'genre', 'url']
        new_rows = []
        for film in list_film_id:
            ttid = film['ttid']
            if ttid in self.used_filmlist:
                continue
            new_rows.append({key: film.get(key) for key in columns})
            # Mark immediately so duplicates inside the same page are dropped.
            self.used_filmlist.add(ttid)
        df_list_film = pd.DataFrame(new_rows, columns=columns)

        if to_db:
            df_list_film.to_sql(name=table_name,con=self.conn,if_exists='append',index=False)
        return df_list_film

    def scrapy_li_tt(self, genre_url,teststop=-1):
        """Crawl a genre listing page by page, following "next page" links.

        Args:
            genre_url: URL of the first listing page to fetch.
            teststop: Maximum number of pages to fetch before stopping with
                the message ``test end``; a negative value means no limit.

        If the URL about to be fetched is already recorded as scraped, the
        crawl restarts from the most recently recorded URL instead. Pages are
        walked in a loop rather than by recursion, so long listings cannot
        exhaust Python's recursion limit.
        """
        while True:
            if teststop ==0:
                print('test end')
                return
            if teststop > 0:
                teststop = teststop-1
            if genre_url in self.used_url_li_tt:
                # The list holds no empty entries, so [-1] is the last page
                # scraped, whether it came from the file or from this run.
                genre_url = self.used_url_li_tt[-1]
                print('ATTENTION: start from {}'.format(genre_url))
            response = self.hd.request_proxy(genre_url)
            if not response:
                print('ERROR: scrapy interrupt!!!')
                return
            list_film_id,np_url = self.parse_li_tt(response.content)
            df_list_film = self.save_li_tt(list_film_id)
            # Keep a per-page CSV copy named after the current Unix time.
            df_list_film.to_csv(os.path.join(PATH_FILMLIST_TEMP,'{}.csv'.format(int(time.time()))), index=False)
            self.used_url_add(genre_url,self.used_url_li_tt,FILEPATH_USEDURL_LI_TT)
            if not np_url:
                return
            genre_url = urljoin(domain_url, np_url)
            time.sleep(1)

    def scrapy_li_tt_all(self):
        """Crawl every listing URL in ``genre_url_list``."""
        for i in genre_url_list:
            self.scrapy_li_tt(i)

    def parse_title(self, res_content):
        """Return the JSON-LD metadata embedded in a title page as a dict.

        Args:
            res_content: HTML of a title page.

        Returns:
            The decoded content of the first
            ``<script type="application/ld+json">`` element, or an empty dict
            when the page contains no such element.
        """
        soup = BeautifulSoup(res_content, 'html.parser')
        script_data = soup.find('script', {'type':'application/ld+json'})
        if script_data is None:
            return {}
        return self.parse_title_json(script_data.get_text())

    def parse_title_json(self, json_str):
        """Decode the JSON text *json_str* and return the resulting object."""
        return json.loads(json_str)
        

            
# if __name__ == "__main__":
#     sp = imdb_spider()
#     sp.scrapy_li_tt_all()

In [23]:
# Build one downloader up front; the spiders created in later cells reuse it via hd=hd.
hd = html_downloader(world=True)

In [10]:
# Smoke test against a scratch database: crawl the first genre listing,
# with teststop=2 limiting the run to two page fetches before "test end".
url = genre_url_list[0]
t1 = imdb_spider(dbpath='test.db',hd=hd)
t1.scrapy_li_tt(url,teststop=2)

ATTENTION: start from https://www.imdb.com/search/title/?title_type=feature&genres=action&start=451&explore=genres&ref_=adv_nxt
SUCCESS: scrape https://www.imdb.com/search/title/?title_type=feature&genres=action&start=451&explore=genres&ref_=adv_nxt
SUCCESS: scrape https://www.imdb.com/search/title/?title_type=feature&genres=action&start=501&explore=genres&ref_=adv_nxt
test end


In [None]:
# Size of the downloader's ip_pool (presumably its proxy addresses -- confirm in html_downloader).
print(len(t1.hd.ip_pool))

In [2]:
# Open the database at FILEPATH_DATABASE directly for ad-hoc inspection in the cells below.
conn = sqlite3.connect(FILEPATH_DATABASE)
cur = conn.cursor()

In [26]:
# Load the whole film table into a DataFrame and report its (rows, columns) shape.
df1 = pd.read_sql('select * from {}'.format(TABLENAME_FILMLIST), con = conn)
print(df1.shape)

(47785, 5)


In [22]:
# Spot-check one scraped film: select the row(s) of df1 in which some cell equals this title.
df1[df1.values=='Mahiwagang kris']

Unnamed: 0,ttid,name,year,genre,url
37154,tt0487699,Mahiwagang kris,1975,Action,/title/tt0487699/?ref_=adv_li_tt


In [24]:
# Fetch a single title page through the shared downloader to try out title parsing.
title_url = 'https://www.imdb.com/title/tt0120737/?ref_=adv_li_tt'
r2 = hd.request_proxy(title_url)

In [25]:
# Check the HTTP status of the title-page response before parsing it.
r2.status_code

200

In [51]:
# Parse the fetched title page; d2 is the dict decoded from its JSON-LD script element.
t2 = imdb_spider(dbpath='test.db',hd=hd)
d2 = t2.parse_title(r2.content)

In [56]:
# Dump every top-level field of the parsed title metadata, one "key : value" per line.
for k,v in d2.items():
    print(k,":",v)

@context : http://schema.org
@type : Movie
url : /title/tt0120737/
name : The Lord of the Rings: The Fellowship of the Ring
image : https://m.media-amazon.com/images/M/MV5BN2EyZjM3NzUtNWUzMi00MTgxLWI0NTctMzY4M2VlOTdjZWRiXkEyXkFqcGdeQXVyNDUzOTQ5MjY@._V1_.jpg
genre : ['Action', 'Adventure', 'Drama', 'Fantasy']
contentRating : PG-13
actor : [{'@type': 'Person', 'url': '/name/nm0000704/', 'name': 'Elijah Wood'}, {'@type': 'Person', 'url': '/name/nm0005212/', 'name': 'Ian McKellen'}, {'@type': 'Person', 'url': '/name/nm0089217/', 'name': 'Orlando Bloom'}, {'@type': 'Person', 'url': '/name/nm0000293/', 'name': 'Sean Bean'}]
director : {'@type': 'Person', 'url': '/name/nm0001392/', 'name': 'Peter Jackson'}
creator : [{'@type': 'Person', 'url': '/name/nm0866058/', 'name': 'J.R.R. Tolkien'}, {'@type': 'Person', 'url': '/name/nm0909638/', 'name': 'Fran Walsh'}, {'@type': 'Person', 'url': '/name/nm0101991/', 'name': 'Philippa Boyens'}, {'@type': 'Person', 'url': '/name/nm0001392/', 'name': 'Peter