In [1]:
import pandas as pd
import requests
from datetime import datetime
import json
import os
from dateutil import parser
import collections

The following code determines whether new data has been added to the API database. If so, we pull the new movies' data, store it in our database, and update the tracking parameters.

In [2]:
# Fetch two sample pages of the movie listing (50 movies per page).
# NOTE(review): 821 is presumably the final page at the time of writing; it is
# hard-coded, so it stops being the last page as the catalogue grows — confirm.
# A timeout is set so a stalled connection raises instead of hanging the kernel.
last_page = requests.get('https://yts.torrentbay.to/api/v2/list_movies.json?page=821&limit=50', timeout=30).json()
first_page = requests.get('https://yts.torrentbay.to/api/v2/list_movies.json?page=1&limit=50', timeout=30).json()

In [3]:
# Take the second entry (index 1) of the movies list in the last_page response as a sample record.
movie_a = last_page['data']['movies'][1]

In [4]:
# Display the full sample record to inspect the fields the API returns.
movie_a

{'id': 1393,
 'url': 'https://yts.torrentbay.to/movies/hamlet-1990',
 'imdb_code': 'tt0099726',
 'title': 'Hamlet',
 'title_english': 'Hamlet',
 'title_long': 'Hamlet (1990)',
 'slug': 'hamlet-1990',
 'year': 1990,
 'rating': 6.7,
 'runtime': 135,
 'genres': ['Action', 'Drama'],
 'summary': 'Hamlet&#39;s father, the King of Denmark was murdered by Hamlet&#39;s uncle, Claudius. Gertrude (Hamlet&#39;s mother) married Claudius shortly after her husband, the king&#39;s death. The ghost of Hamlet&#39;s father comes back and tells Hamlet to kill his uncle for murdering him and to leave his mother alone because she is innocent. Hamlet is dating Ophelia until her father, Polonius persuades her to end their relationship.',
 'description_full': 'Hamlet&#39;s father, the King of Denmark was murdered by Hamlet&#39;s uncle, Claudius. Gertrude (Hamlet&#39;s mother) married Claudius shortly after her husband, the king&#39;s death. The ghost of Hamlet&#39;s father comes back and tells Hamlet to kill h

In [None]:
# Display the genres list of the sample record.
movie_a['genres']

In [None]:
# Reference movie ids passed to check_new_movies below.
# NOTE(review): presumably the ids recorded at the previous sync — confirm where they come from.
ids = [41763,41713]
ids

In [None]:
def check_new_movies(ids:list):
    """Return the titles of movies added to the API since the last sync.

    Pages of ``list_movies.json`` are fetched starting from page 1 until a
    page contains a movie that is already known, or until the API returns a
    page without movies.

    Parameters
    ----------
    ids : list
        Movie ids recorded at the previous sync. Only the largest value is
        used as the "newest known id"; an empty list means nothing is known
        yet, so every movie is treated as new.

    Returns
    -------
    list
        Titles of every movie whose id is greater than ``max(ids)``, in the
        order the API returned them.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Assumes the API lists movies newest-first and that ids grow over time —
    TODO confirm against the API's sort options.
    """
    newest_known_id = max(ids) if ids else float('-inf')

    page_number = 1
    movies_list = []
    while True:
        response = requests.get(
            f'https://yts.torrentbay.to/api/v2/list_movies.json?page={page_number}&limit=50',
            timeout=30,  # fail instead of hanging on a stalled connection
        )
        response.raise_for_status()

        # A page past the end of the catalogue presumably has no 'movies'
        # entry; a missing or empty list is treated as "no more data".
        page_movies = response.json().get('data', {}).get('movies') or []

        # Compare ids individually: comparing [first, last] lists directly is
        # lexicographic and would also pull in movies that are already known.
        new_titles = [movie['title'] for movie in page_movies if movie['id'] > newest_known_id]
        movies_list.extend(new_titles)

        # Stop on an empty page or as soon as a page holds a known movie.
        if not page_movies or len(new_titles) < len(page_movies):
            break

        page_number = page_number + 1

    return movies_list

In [None]:
# Collect the list of titles that check_new_movies returns for the reference ids.
mov_list = check_new_movies(ids)

In [None]:
# Print every movie dict on the fetched first page, one per line.
# NOTE(review): the page was requested with limit=50, so this prints up to 50 full records.
for movie in first_page['data']['movies']:
    print(movie)

The code below extracts a movie's cast based on its IMDb code.

In [18]:
import requests
from bs4 import BeautifulSoup
import re
import time

## Method to get the cast list for a given IMDb movie ID

In [19]:
def get_cast(imdb_id : str):
    """Scrape the cast names listed on a title's IMDb "full credits" page.

    Parameters
    ----------
    imdb_id : str
        IMDb title code, e.g. ``'tt0099726'``.

    Returns
    -------
    list of str
        Every ``title="..."`` attribute value found inside the anchors of the
        page's ``cast_list`` table, in document order. Empty when the page
        has no such table.

    Raises
    ------
    requests.HTTPError
        If IMDb answers with an error status.
    """
    url = f'https://www.imdb.com/title/{imdb_id}/fullcredits'
    page = requests.get(url, timeout=30)  # fail instead of hanging on a stalled connection
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', attrs={'class' : 'cast_list'})
    if table is None:
        # No cast table on the page: report "no cast" rather than crashing
        # with AttributeError on None.
        return []

    cast = table.find_all('a')

    # The title attributes are pulled out of the anchors' serialized markup.
    return re.findall(r'title="(.*?)"', str(cast))

###  The following data needs to be pulled from the API response and put in the warehouse
#### : imdb_code -> unique movie code given by IMDB
#### : title_long -> full movie title with year
#### : year -> released year
#### : rating -> movie rating (1-10)
#### : runtime -> in minutes
#### : genres -> can be one or more
#### : summary -> summary of movie
#### : cast -> need to scrape from IMDB
#### : mpa_rating -> movie censorship rating
#### : language -> can be one or more

In [6]:
# Display, as one tuple, the movie_a fields listed above (cast and summary are not included here).
movie_a['imdb_code'], movie_a['title_long'], movie_a['year'], movie_a['rating'], movie_a['runtime'], (movie_a['genres']), movie_a['mpa_rating'], movie_a['language']

('tt0082477',
 "Gregory's Girl (1980)",
 1980,
 7.1,
 91,
 ['Action', 'Comedy', 'Drama', 'Romance'],
 '',
 'en')

Here is the main problem: since a movie can have multiple genres, we need to figure out how to insert genres without repetition.
First we insert "imdb_code" and "movie_title" into the movies table. Then we check whether each genre already exists in the genre table; if it does we skip it, otherwise we insert it.

In [5]:
import pyodbc

In [6]:
# Open a pyodbc connection to the yts_warehouse database through the "SQL Server"
# ODBC driver, using a trusted connection (no user name / password in the string).
# NOTE(review): the server name is a specific machine name, so this cell only
# works where that host is reachable — other environments need their own value.
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DESKTOP-K8VV6KV;'
                      'Database=yts_warehouse;'
                      'Trusted_Connection=yes;')

In [7]:
# Cursor used by the cells below to run statements on the open connection.
cursor = conn.cursor()

### inserting into movie table

In [51]:
# Insert the sample record movie_a into the movies table; the eight values are
# bound to the ? placeholders in column order (title_long fills the title column).
# NOTE(review): no conn.commit() is visible in this part of the notebook. pyodbc
# connections presumably run with autocommit off unless configured otherwise, in
# which case the row is not persisted until a commit — confirm one follows.
cursor.execute("""
insert into movies (imdb_id, title, year, rating, runtime, mpa_rating, language, date_uploaded)
VALUES (?,?,?,?,?,?,?,?)
""",
movie_a['imdb_code'], movie_a['title_long'], movie_a['year'],movie_a['rating'], movie_a['runtime'], movie_a['mpa_rating'], movie_a['language'], movie_a['date_uploaded']              
              )

<pyodbc.Cursor at 0x24abc82bab0>

### inserting into genre table

In [21]:
# For each genre of movie_b, insert it into the genre table only when no row
# with that genre_title exists yet. The genre is bound twice: once for the
# existence check and once for the insert.
# NOTE(review): movie_b is assigned in a cell that appears further down in this
# notebook, so this cell raises NameError on a fresh top-to-bottom run — confirm
# the intended cell order.
# NOTE(review): the insert names no columns; presumably the table's id column is
# generated by the database — confirm against the schema.
for genre in movie_b['genres']:
    cursor.execute("""
            if not exists (
            select * from genre
            where genre_title = ?
            )
            begin
                insert into genre values(?)
            end      
    """, genre, genre)

In [22]:
# Query every row of the genre table to check what has been inserted.
cursor.execute('select *from genre')

<pyodbc.Cursor at 0x202535deab0>

In [23]:
# Iterate over the cursor and print each row of its most recent query.
for i in cursor:
    print(i)

(1, 'Action')
(2, 'Drama')
(3, 'Adventure')
(4, 'History')
(5, 'Horror')


In [14]:
# Display the genres of movie_a for comparison with the genre table contents.
movie_a['genres']

['Action', 'Drama']

In [18]:
# Take the first entry (index 0) of the movies list in the last_page response as a second sample record.
movie_b = last_page['data']['movies'][0]

In [20]:
# Display the genres list of the second sample record.
movie_b['genres']

['Action', 'Adventure', 'History', 'Horror']