In [1]:
import pandas as pd
import requests
from datetime import datetime
import json
import os
from dateutil import parser
import collections

The following code determines whether new movies have been added to the API's database. If so, we pull the new movies' data, insert it into our database, and update the tracking parameters.

In [2]:
# Fetch page 821 of the movie listing (50 movies per page) and decode the JSON body.
# NOTE(review): the page number is hard-coded; presumably 821 was the last page
# when this was written — confirm before relying on it.
last_page =  requests.get('https://yts.torrentbay.to/api/v2/list_movies.json?page=821&limit=50').json()
# first_page = requests.get('https://yts.torrentbay.to/api/v2/list_movies.json?page=1&limit=50').json()

In [24]:
# Use the third movie on the fetched page as the sample record for the cells below.
movie_a = last_page['data']['movies'][2]

In [25]:
# IMDb code of the sample movie.
movie_a['imdb_code']

'tt0082509'

In [5]:
# Genres of the sample movie.
movie_a['genres']

['Action', 'Adventure', 'Comedy', 'Drama']

In [None]:
# Tracking ids handed to check_new_movies below.
# NOTE(review): presumably the movie ids seen at the previous check — confirm.
ids = [41763,41713]
ids

In [None]:
def check_new_movies(ids:list, fetch_page=None):
    """Return the titles of movies that are newer than the tracked ids.

    Listing pages are walked starting at page 1. A movie counts as new when
    its ``id`` is greater than every id in ``ids``. The walk stops at the
    first page that contains an already-known movie, or when the API returns
    a page without movies.

    NOTE(review): this assumes the endpoint lists movies newest-first with
    increasing ids — confirm against the API's default sort order.

    Parameters
    ----------
    ids : list
        Movie ids recorded at the previous check. With an empty list every
        movie is treated as new and all pages are walked.
    fetch_page : callable, optional
        ``fetch_page(page_number)`` returning the decoded JSON of one listing
        page. Defaults to an HTTP GET against the YTS ``list_movies``
        endpoint with 50 movies per page.

    Returns
    -------
    list
        Titles of the new movies, in the order the API returned them.

    Raises
    ------
    requests.HTTPError
        If the default fetcher receives an error status from the API.
    """
    if fetch_page is None:
        def fetch_page(page_number):
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(
                f'https://yts.torrentbay.to/api/v2/list_movies.json?page={page_number}&limit=50',
                timeout=30,
            )
            response.raise_for_status()
            return response.json()

    # Compare plain ids against a single watermark. The previous version
    # compared two-element lists, which Python orders lexicographically, and
    # then kept every title on the page, including already-known movies.
    newest_known_id = max(ids, default=float('-inf'))

    movies_list = []
    page_number = 1
    while True:
        page = fetch_page(page_number)
        # Pages past the end of the listing carry no 'movies' entry.
        movies = (page.get('data') or {}).get('movies') or []
        if not movies:
            break

        new_titles = [movie['title'] for movie in movies if movie['id'] > newest_known_id]
        movies_list.extend(new_titles)

        # A known movie on this page means later pages hold nothing new.
        if len(new_titles) < len(movies):
            break
        page_number += 1

    return movies_list

In [None]:
# Collect the titles that check_new_movies reports for the tracked ids.
mov_list = check_new_movies(ids)

In [None]:
# `first_page` is not assigned anywhere above (its request is commented out),
# so this cell raised NameError on a fresh kernel; fetch page 1 here.
first_page = requests.get('https://yts.torrentbay.to/api/v2/list_movies.json?page=1&limit=50').json()

# Print every movie record on the first page.
for movie in first_page['data']['movies']:
    print(movie)

The code below extracts a movie's cast from IMDb using its "IMDB code".

In [17]:
import requests
from bs4 import BeautifulSoup
import re
import time

## method to get cast list of given imdb movie id

In [18]:
def get_cast(imdb_id : str):
    """Scrape the cast names listed on a title's IMDb full-credits page.

    Parameters
    ----------
    imdb_id : str
        IMDb title code, e.g. ``'tt0082509'``.

    Returns
    -------
    list of str
        Every ``title="..."`` attribute value found inside the links of the
        ``cast_list`` table, in page order. Empty when the page has no such
        table.

    Raises
    ------
    requests.HTTPError
        If IMDb answers with an error status.
    """
    url = f'https://www.imdb.com/title/{imdb_id}/fullcredits'
    # A timeout keeps a stalled connection from hanging the notebook.
    page = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', attrs={'class' : 'cast_list'})
    if table is None:
        # No cast table on the page; previously this crashed with an
        # AttributeError on None.
        return []

    cast = table.find_all('a')
    return re.findall(r'title="(.*?)"', str(cast))

###  The following data needs to be pulled from the API response and put in the warehouse
#### : imdb_code -> unique movie code given by IMDB
#### : title_long -> full movie title with year
#### : year -> released year
#### : rating -> movie rating (1-10)
#### : runtime -> in minutes
#### : genres -> can be one or more
#### : summary -> summary of movie
#### : cast -> need to scrape from IMDB
#### : mpa_rating -> movie censorship rating
#### : language -> can be one or more

In [18]:
# Preview the sample movie's fields that are destined for the warehouse.
movie_a['imdb_code'], movie_a['title_long'], movie_a['year'], movie_a['rating'], movie_a['runtime'], (movie_a['genres']), movie_a['mpa_rating'], movie_a['language']

('tt1289406',
 'Harry Brown (2009)',
 2009,
 7.2,
 103,
 ['Action', 'Crime', 'Drama', 'Thriller'],
 'R',
 'en')

Here is the main problem: because a movie can have multiple genres, we need to figure out how to insert genres without repetition.
First we insert "imdb_code" and "movie_title" into the movies table. Then we check whether each genre already exists in the genre table; if it does we skip it, otherwise we insert it.

In [6]:
import pyodbc

In [7]:
# Open the warehouse connection. `autocommit` is a keyword argument of
# pyodbc.connect, not an ODBC connection-string attribute: inside the string
# it never reached pyodbc, so the connection stayed in manual-commit mode and
# inserts would not be committed unless conn.commit() is called.
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DESKTOP-K8VV6KV;'
                      'Database=yts_warehouse;'
                      'Trusted_Connection=yes;',
                      autocommit=True)

In [8]:
# Cursor used for the SQL statements in the cells below.
cursor = conn.cursor()

In [9]:
# Switch the session to the yts_warehouse database; the trailing `;`
# suppresses the cell output.
cursor.execute("""
use yts_warehouse
""");

In [10]:
# Reseed the identity counter of the genre table to 0.
# NOTE(review): if genre already holds rows, re-running this cell may make newly
# generated ids collide with existing ones — confirm the table is empty first.
cursor.execute("""
DBCC CHECKIDENT ('[genre]', RESEED, 0);
""");

### inserting into movie table

In [26]:
# Insert the sample movie as one row of the movies table, binding the API
# fields to the placeholders in column order.
movie_columns = (
    'imdb_code', 'title_long', 'year', 'rating',
    'runtime', 'mpa_rating', 'language', 'date_uploaded',
)
insert_movie_sql = """
insert into movies (imdb_id, title, year, rating, runtime, mpa_rating, language, date_uploaded)
VALUES (?,?,?,?,?,?,?,?)
"""
cursor.execute(insert_movie_sql, *(movie_a[column] for column in movie_columns));

In [27]:
# Show every row currently stored in the movies table.
for movie_row in cursor.execute('select * from movies'):
    print(movie_row)

('tt0082509', 'Heavy Metal (1981)', 1981, 6.6, 86.0, '', 'en', datetime.datetime(2015, 10, 31, 23, 53, 46))
('tt1626146', 'Hector and the Search for Happiness (2014)', 2014, 6.9, 120.0, 'R', 'en', datetime.datetime(2022, 3, 29, 4, 36, 16))


### inserting into genre and movie_genre table

In [28]:
# Register each genre of the sample movie (only if it is not in the genre
# table yet) and link it to the movie in movie_genre.
link_genre_sql = """
                declare @temp_genre_id int;
                if not exists (
                                select * from genre
                                where genre_title = ?
                              )
                begin
                    insert into genre values(?)
                end
        select @temp_genre_id  = genre_id from genre where genre_title = ?
        insert into movie_genre
        values(?, @temp_genre_id)
    """
for genre in movie_a['genres']:
    # The statement has four placeholders: the genre title three times,
    # then the movie's IMDb code.
    genre_params = (genre,) * 3 + (movie_a['imdb_code'],)
    cursor.execute(link_genre_sql, *genre_params)

In [29]:
# Show every row currently stored in the genre table.
for genre_row in cursor.execute('select *from genre'):
    print(genre_row)

(1, 'Action')
(2, 'Adventure')
(3, 'Comedy')
(4, 'Drama')
(5, 'Animation')
(6, 'Fantasy')
(7, 'Horror')
(8, 'Sci-Fi')


In [30]:
# Show every movie-to-genre link currently stored in movie_genre.
for link_row in cursor.execute('select *from movie_genre'):
    print(link_row)

('tt1626146', 1)
('tt1626146', 2)
('tt1626146', 3)
('tt1626146', 4)
('tt0082509', 1)
('tt0082509', 2)
('tt0082509', 5)
('tt0082509', 6)
('tt0082509', 7)
('tt0082509', 8)


### inserting into cast and movie_cast table

In [31]:
# before that, we need to scrape cast from imdb_page using movie id
cast_list = get_cast(movie_a['imdb_code'])
cast_list

['Don Francks',
 'Caroline Semple',
 'Richard Romanus',
 'Susan Roman',
 'Al Waxman',
 'Harvey Atkin',
 'John Candy',
 'Glenis Wootton Gross',
 'Marilyn Lightstone',
 'Jackie Burroughs',
 'Martin Lavut',
 'August Schellenberg',
 'John Vernon',
 'Eugene Levy',
 'Joe Flaherty',
 'Rodger Bumpass',
 'Douglas Kenney',
 'George Touliatos',
 'Zal Yanovsky',
 'Patty Freedman',
 'Warren Munson',
 'Alice Playten',
 'Harold Ramis',
 'Vlasta Vrana',
 'Mavor Moore',
 'Thor Bishopric',
 'Len Doncheff',
 'Cedric Smith',
 'Joseph Golland',
 'Charles Jolliffe',
 'Ned Conlon',
 'Robby the Robot',
 'Percy Rodrigues']

In [32]:
# Register each scraped actor (only if not in the cast table yet) and link
# the actor to the sample movie in movie_cast.
link_actor_sql = """
                declare @temp_actor_id int;
                if not exists (
                                select * from cast
                                where actor_name = ?
                              )
                begin
                        insert into cast values(?)
                end
        select @temp_actor_id  = actor_id from cast where actor_name = ?
        insert into movie_cast
        values(?, @temp_actor_id)
    """
for actor in cast_list:
    # The statement has four placeholders: the actor name three times,
    # then the movie's IMDb code.
    actor_params = (actor,) * 3 + (movie_a['imdb_code'],)
    cursor.execute(link_actor_sql, *actor_params)

In [33]:
# Show every row currently stored in the cast table.
for actor_row in cursor.execute(" select *from cast"):
    print(actor_row)

(2, 'Simon Pegg')
(3, 'Rosamund Pike')
(4, 'Tracy-Ann Oberman')
(5, 'Jean Reno')
(6, 'Veronica Ferres')
(7, 'Barry Atsma')
(8, 'Ming Zhao')
(9, 'Togo Igawa')
(10, 'Christopher Plummer')
(11, 'Bruce Fontaine')
(12, 'Chad Willett')
(13, 'Bernard Cuffling')
(14, 'Jakob Davies')
(15, 'Chris Gauthier')
(16, 'Dean Paul Gibson')
(17, 'Michael Adamthwaite')
(18, 'Vincent Gale')
(19, 'Gabrielle Rose')
(20, 'Thorsten Wedekind')
(21, 'Eileen Barrett')
(22, 'Alexa Damián')
(23, 'Malcolm Boddington')
(24, 'Stellan Skarsgård')
(25, 'Gribunina Marina')
(26, 'Megan White')
(27, 'Ingrid Lin')
(28, 'Ana Trujic')
(29, 'Wang Xiuze')
(30, 'Neil Li')
(31, 'Aaron Le')
(32, 'Joo Kyeong Lee')
(33, "S'Thandiwe Kgoroge")
(34, 'Christine le Brocq')
(35, 'Anthony Oseyemi')
(36, 'Dalias Blake')
(37, 'Sebelethu Bonkolo')
(38, 'Hlubi Mboya')
(39, 'Mary Twala')
(40, 'Sam Medupe')
(41, 'Charles Baloyi')
(42, 'Senzo Vilakazi')
(43, 'Akin Omotoso')
(44, 'Walter Chukwu')
(45, "Abraham's Seed")
(46, 'Tessa Jubber')
(47, 'S

In [34]:
# Show every movie-to-actor link currently stored in movie_cast.
for cast_link_row in cursor.execute(" select *from movie_cast"):
    print(cast_link_row)

('tt1626146', 2)
('tt1626146', 3)
('tt1626146', 4)
('tt1626146', 5)
('tt1626146', 6)
('tt1626146', 7)
('tt1626146', 8)
('tt1626146', 9)
('tt1626146', 10)
('tt1626146', 11)
('tt1626146', 12)
('tt1626146', 13)
('tt1626146', 14)
('tt1626146', 15)
('tt1626146', 16)
('tt1626146', 17)
('tt1626146', 18)
('tt1626146', 19)
('tt1626146', 20)
('tt1626146', 21)
('tt1626146', 22)
('tt1626146', 23)
('tt1626146', 24)
('tt1626146', 25)
('tt1626146', 26)
('tt1626146', 27)
('tt1626146', 28)
('tt1626146', 29)
('tt1626146', 30)
('tt1626146', 31)
('tt1626146', 32)
('tt1626146', 33)
('tt1626146', 34)
('tt1626146', 35)
('tt1626146', 36)
('tt1626146', 37)
('tt1626146', 38)
('tt1626146', 39)
('tt1626146', 40)
('tt1626146', 41)
('tt1626146', 42)
('tt1626146', 43)
('tt1626146', 44)
('tt1626146', 45)
('tt1626146', 46)
('tt1626146', 47)
('tt1626146', 48)
('tt1626146', 49)
('tt1626146', 50)
('tt1626146', 51)
('tt1626146', 52)
('tt1626146', 53)
('tt1626146', 54)
('tt1626146', 55)
('tt1626146', 56)
('tt1626146', 57)


### inserting into summary table

In [36]:
# Summary text of the sample movie.
movie_a['summary']

'An astronaut brings home a glowing green orb for his daughter. However, the green orb wipes him out and corners the girl for its purposes. Claiming to embody ultimate evil, the malevolent sphere, known as the Loc-Nar, terrorizes the little girl by showing a series of bizarre and fantastic stories it has influenced. The first is &quot;Harry Canyon&quot;, a cynical taxi driver in a squalid futuristic New York who finds himself involved with a damsel in distress who is relentlessly pursued by murderous thugs who desire the Loc-Nar her archaeologist father found. The second is &quot;Den&quot;, which chronicles the adventures of a nerdish teenager who is thrown into the fantasy world of Neverwhere, where he is transformed into a handsome muscleman, desired by beautiful women, who must get involved in a conflict revolving around possession of the Loc-Nar. The third is &quot;Captain Sternn&quot;, where the title character is a handsome but irredeemable scoundrel who stands accused in a trial

In [None]:
# Store the sample movie's summary text together with its IMDb code.
insert_summary_sql = """

insert into summary (imdb_id, summary)
values (?, ?)

"""
summary_params = (movie_a['imdb_code'], movie_a['summary'])
cursor.execute(insert_summary_sql, *summary_params)