In [33]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests
from imdb import IMDbDataAccessError
from bs4 import BeautifulSoup
from SN_help import get_movie_from_imdb, get_movie_keywords, get_movie_comment, get_person_from_imdb, build_bipartite
import igraph as ig
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

In [1]:
# Locations of the parquet inputs handed to build_bipartite below.
movie_path = './netflix/movie.parquet'
actor_path = './netflix/actor.parquet'
link_path = './netflix/movie_actor.parquet'
# save_path and tag are defined here but not used by any cell shown in this notebook section.
save_path = './netflix/network'
tag = 'netflix'
# build_bipartite returns four values; only the last two are kept.
# NOTE(review): judging by the log it prints, mg / ag look like the movie and
# actor projections of the bipartite network (possibly reduced to their giant
# connected component) -- confirm against SN_help.build_bipartite.
_, _, mg, ag = build_bipartite(movie_path, actor_path, link_path)

[31m>>>>[0m Adding 4129 movies and 6021 actors to network.
[31m>>>>[0m Adding 9239 movie-actor links to network.
[31m>>>>[0m Projecting bipartite nework to movie nodes and actor nodes.
[31m>>>>[0m Cleanning useless attributes for Movie Network.
		['bipartite', 'birthYear', 'deathYear', 'nconst', 'primaryName']
[31m>>>>[0m Cleanning useless attributes for Actor Network.
		['bipartite', 'description', 'genres', 'averageRating', 'numVotes', 'primaryTitle', 'runtimeMinutes', 'startYear', 'tconst']
[31m>>>>[0m Getting the GCC of Movie Network and Actor Network.
		Movie: (N4129, L9076)-->(N1923, L8346)
		Actor: (N6021, L8686)-->(N2392, L4676)
[31m>>>>[0m Summary:
IGRAPH U-W- 4129 9076 -- 
+ attr: averageRating (v), country (v), description (v), genres (v), numVotes (v), primaryTitle (v), runtimeMinutes (v), startYear (v), tconst (v), weight (e)
IGRAPH U-W- 6021 8686 -- 
+ attr: birthYear (v), country (v), deathYear (v), nconst (v), primaryName (v), weight (e)
[31m>>>>[0m Done

In [35]:
# The 'tconst' vertex attribute of every node in the movie graph; these ids
# drive all movie download loops.
tconst_list = mg.vs['tconst']

# Empty result tables. The download loops add one row per id, using the id as
# the row label.
m_attr = pd.DataFrame(columns=['cast', 'budget', 'boxOffice', 'plotOutline', 'plot', 'synopsis'])
m_keyword = pd.DataFrame(columns=['keywords'])
m_review = pd.DataFrame(columns=['reviews'])

### 下载电影数据

In [9]:
# Download the attributes of every movie in `tconst_list` into `m_attr`,
# one row per id.
# `i` is the start offset into `tconst_list`; raise it to skip ids that were
# already downloaded when re-running the cell after an interruption.
i = 0
MAX_ATTEMPTS = 3  # total tries per id before giving up on it
failed_attr_tconst = []  # ids that raised IMDbDataAccessError on every attempt
for tconst in tqdm(tconst_list[i:]):
    for _attempt in range(MAX_ATTEMPTS):
        try:
            m_attr.loc[tconst] = pd.Series(get_movie_from_imdb(tconst))
        except IMDbDataAccessError:
            # Access error (e.g. a timeout): try the same id again. The retry
            # is bounded so that one unreachable title cannot abort the run.
            continue
        else:
            break  # row stored; move on to the next id
    else:
        # All attempts failed: remember the id instead of raising.
        failed_attr_tconst.append(tconst)
# Show the ids that still need to be downloaded (empty list when all succeeded).
failed_attr_tconst

  return asarray(a).ndim
100%|██████████| 965/965 [43:10<00:00,  2.68s/it]


In [10]:
# Save the downloaded movie attributes; index=True writes the row labels
# (the ids used in the download loop) as the first CSV column.
m_attr.to_csv('./netflix/download/movie.attr.csv', index=True)

### 下载电影关键字

In [31]:
# Download the keywords of every movie in `tconst_list` into `m_keyword`.
# `i` is the start offset into `tconst_list`; raise it to resume an
# interrupted download without re-fetching the ids before it.
i = 0
MAX_ATTEMPTS = 3  # total tries per id before giving up on it
failed_keyword_tconst = []  # ids that raised IMDbDataAccessError on every attempt
for tconst in tqdm(tconst_list[i:]):
    for _attempt in range(MAX_ATTEMPTS):
        try:
            m_keyword.loc[tconst, 'keywords'] = get_movie_keywords(tconst)
        except IMDbDataAccessError:
            # Access error (e.g. a timeout): try the same id again. The retry
            # is bounded so that one unreachable title cannot abort the run.
            continue
        else:
            break  # value stored; move on to the next id
    else:
        # All attempts failed: remember the id instead of raising.
        failed_keyword_tconst.append(tconst)
# Show the ids that still need to be downloaded (empty list when all succeeded).
failed_keyword_tconst

  arr_value = np.asarray(value)
100%|██████████| 1923/1923 [34:14<00:00,  1.07s/it]


In [32]:
# Save the downloaded keywords; index=True writes the row labels (the ids used
# in the download loop) as the first CSV column.
m_keyword.to_csv('./netflix/download/movie.keyword.csv', index=True)

### 下载电影评论

In [36]:
# Download the reviews of every movie in `tconst_list` into `m_review`.
# `i` is the start offset into `tconst_list`; raise it to resume an
# interrupted download without re-fetching the ids before it.
i = 0
MAX_ATTEMPTS = 3  # total tries per id before giving up on it
error_tconst = []  # ids whose first attempt raised (they were retried)
failed_review_tconst = []  # ids that raised on every attempt
for tconst in tqdm(tconst_list[i:]):
    for attempt in range(MAX_ATTEMPTS):
        try:
            m_review.loc[tconst, 'reviews'] = get_movie_comment(tconst)
        except Exception:
            # `Exception` rather than a bare `except:` so that Ctrl-C
            # (KeyboardInterrupt) still stops this multi-hour loop instead of
            # being treated as a download error.
            if attempt == 0:
                error_tconst.append(tconst)
        else:
            break  # value stored; move on to the next id
    else:
        # All attempts failed: remember the id instead of raising.
        failed_review_tconst.append(tconst)
if failed_review_tconst:
    print('reviews still missing after retries:', failed_review_tconst)
error_tconst

100%|██████████| 1923/1923 [2:38:46<00:00,  4.95s/it]      


['tt3137630']

In [39]:
# Re-download the reviews of the one title reported by the review download
# loop and store them in that title's own row.  The row label has to be the id
# itself: the loop variable `tconst` still holds the last id the loop visited,
# so indexing with it would overwrite a different movie's reviews.
retry_tconst = 'tt3137630'
m_review.loc[retry_tconst, 'reviews'] = get_movie_comment(retry_tconst)
display(m_review.loc[retry_tconst])

# Save the review table; index=True writes the row labels (title ids) as the
# first CSV column.
m_review.to_csv('./netflix/download/movie.review.csv', index=True)

reviews    [I'm a big fan of Ricky Gervais' work and cons...
Name: tt3137630, dtype: object

### 下载演员

In [40]:
# Empty result table for the actor download; the loop adds one row per id.
# NOTE(review): 'bigoraphy' looks like a typo of 'biography'. It is left as is
# because it becomes a column name in the saved CSV -- confirm nothing
# downstream depends on the current spelling before renaming it.
a_attr = pd.DataFrame(columns=['name', 'height', 'birthday', 'country', 'bigoraphy', 'trivia'])
# The 'nconst' vertex attribute of every node in the actor graph; these ids
# drive the actor download loop.
nconst_list = ag.vs['nconst']

In [49]:
# Download the attributes of every actor in `nconst_list` into `a_attr`,
# one row per id.
# `i` is the start offset into `nconst_list`; raise it to resume an
# interrupted download without re-fetching the ids before it.
i = 0
MAX_ATTEMPTS = 3  # total tries per id before giving up on it
error_nconst = []  # ids whose first attempt raised (they were retried)
failed_nconst = []  # ids that raised IMDbDataAccessError on every attempt
for nconst in tqdm(nconst_list[i:]):
    for attempt in range(MAX_ATTEMPTS):
        try:
            a_attr.loc[nconst] = get_person_from_imdb(nconst)
        except IMDbDataAccessError:
            # Access error (e.g. a read timeout): try the same id again. The
            # retry is bounded so that one unreachable page cannot abort the
            # run part-way through.
            if attempt == 0:
                error_nconst.append(nconst)
        else:
            break  # row stored; move on to the next id
    else:
        # All attempts failed: remember the id instead of raising.
        failed_nconst.append(nconst)
if failed_nconst:
    print('actors still missing after retries:', failed_nconst)
error_nconst

 31%|███       | 741/2392 [55:53<1:53:08,  4.11s/it] 2022-12-04 21:21:00,977 CRITICAL [imdbpy] /Users/baixianger/opt/miniconda3/envs/sn/lib/python3.8/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/name/nm0117412/bio', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/baixianger/opt/miniconda3/envs/sn/lib/python3.8/site-packages/imdb/parser/http/__init__.py", line 221, in retrieve_unicode
    response = uopener.open(url)
  File "/Users/baixianger/opt/miniconda3/envs/sn/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
  File "/Users/baixianger/opt/miniconda3/envs/sn/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "/Users/baixianger/opt/miniconda3/envs/sn/

['nm0117412', 'nm0672667', 'nm0000513']

In [54]:
# Save the downloaded actor attributes; index=True writes the row labels (the
# ids used in the download loop) as the first CSV column.
a_attr.to_csv('./netflix/download/actor.attr.csv', index=True)