In [1]:
import datetime
import os

import requests
from bs4 import BeautifulSoup as bs
from peewee import *
from playhouse.sqlite_ext import *

from model.movie.Movie import Movie

In [2]:
# Connection settings for the MySQL database backing this notebook.
# Each value can be overridden through an environment variable; the fallbacks
# are the values this notebook originally hardcoded.
host = os.environ.get('MOVIES_DB_HOST', 'localhost')
user = os.environ.get('MOVIES_DB_USER', 'root')
# NOTE(review): the fallback password is only suitable for a throwaway local
# database; set MOVIES_DB_PASSWORD for anything else.
password = os.environ.get('MOVIES_DB_PASSWORD', 'password')
database = os.environ.get('MOVIES_DB_NAME', 'movies')

db = MySQLDatabase(database, host=host, user=user, password=password)

In [3]:
def scrape_movie_website(url, timeout=10):
    """Download a listing page and collect the (title, href) pair of its links.

    Only anchors nested in ``div.content`` elements inside the first
    ``div.page_wrapper`` of the page are considered.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``
            so a stalled connection cannot hang the notebook forever.

    Returns:
        A list of ``(title, href)`` tuples in document order. Anchors that
        lack a ``title`` or an ``href`` attribute are skipped.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
        ValueError: If the page contains no ``div.page_wrapper`` element.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }

    page = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them.
    page.raise_for_status()
    soup = bs(page.content, 'html.parser')

    wrapper = soup.find('div', class_='page_wrapper')
    if wrapper is None:
        # Raising (rather than returning []) keeps callers from treating an
        # unexpected page layout as "page scraped, no movies found".
        raise ValueError('No div.page_wrapper element found at ' + url)

    return [
        (a_tag['title'], a_tag['href'])
        for content_div in wrapper.find_all('div', class_='content')
        # title=True / href=True keep only anchors that carry both attributes.
        for a_tag in content_div.find_all('a', title=True, href=True)
    ]

In [4]:
class BaseModel(Model):
    """Common base class that binds the models in this notebook to ``db``."""
    class Meta:
        database = db

class Genre(BaseModel):
    """A genre label; ``name`` is declared unique."""
    name = CharField(unique=True)
    
class Movie(BaseModel):
    """A movie row, identified by its unique ``url``.

    NOTE(review): this definition shadows the ``Movie`` class imported from
    ``model.movie.Movie`` at the top of the notebook; after this cell runs,
    ``Movie`` refers to this peewee model.
    """
    name = CharField()
    url = CharField(unique=True)
    # The detail columns are nullable because add_movie() inserts a row with
    # only name and url; update_movie_details() is what supplies these values.
    # NOTE(review): a table created before this change presumably keeps its
    # old NOT NULL columns until it is migrated or recreated - confirm.
    description = TextField(null=True)
    certification = CharField(null=True)
    release = DateField(null=True)
    genres = ManyToManyField(Genre)
    
class Website(BaseModel):
    """A page URL together with a flag recording whether it has been visited."""
    url = CharField(unique=True)
    # Defaults to False; mark_website_visited() sets it to True.
    is_visited = BooleanField(default=False)

        
# Create the tables for all models, including the through (junction) table
# that backs the Movie.genres many-to-many field.
db.create_tables([Genre, Movie, Movie.genres.get_through_model(), Website])

In [22]:
def add_genre(name):
    """Create a genre row, or hand back the stored one when the name is taken.

    Args:
        name: Genre name; unique per the ``Genre`` model.

    Returns:
        The ``Genre`` instance for ``name``. ``None`` only when the insert
        failed with ``IntegrityError`` and no row with that name exists.
    """
    try:
        return Genre.create(name=name)
    except IntegrityError:
        print('Genre with the same name already exists.')
        # Return the existing row instead of an implicit None so callers that
        # collect genres (e.g. update_movie_details) always get usable rows.
        return Genre.get_or_none(Genre.name == name)

def add_movie(name, url):
    """Insert a movie row for a site-relative link.

    Args:
        name: Movie title.
        url: Path relative to ``https://www.themoviedb.org`` (the site root
            is prepended before storing).

    Returns:
        The created ``Movie`` instance, or ``None`` when the insert failed
        with ``IntegrityError``.
    """
    full_url = 'https://www.themoviedb.org' + url
    try:
        return Movie.create(name=name, url=full_url)
    except IntegrityError:
        # `url` is the only column declared unique on Movie, so a duplicate
        # URL - not a duplicate name - is the expected cause here.
        print('Movie with the same url already exists.')
        return None
        
def update_movie_details(url, description, certification, release, genres):
    """Fill in the detail columns and genre links of an already stored movie.

    Prints ``Movie not found.`` and changes nothing when no movie has the
    given URL.

    Args:
        url: Full movie URL as stored in ``Movie.url``.
        description: Text for the ``description`` column.
        certification: Value for the ``certification`` column.
        release: Release date as a string in ``MM/DD/YYYY`` form.
        genres: Iterable of genre names; missing genres are created and the
            movie's genre links are replaced with exactly this set.

    Raises:
        ValueError: If ``release`` does not match ``%m/%d/%Y``.
    """
    release_date = datetime.datetime.strptime(release, '%m/%d/%Y').date()

    # Look the row up explicitly: an UPDATE that matches nothing does not
    # raise, so catching IntegrityError cannot detect a missing movie.
    movie = Movie.get_or_none(Movie.url == url)
    if movie is None:
        print('Movie not found.')
        return

    # Scalar columns and genre links are written in one transaction.
    with db.atomic():
        movie.description = description
        movie.certification = certification
        movie.release = release_date
        movie.save()

        # A ManyToManyField is not a table column, so it cannot be assigned
        # through Movie.update(); links go through the relation's add().
        # dict.fromkeys drops repeated names while keeping their order.
        genre_rows = [
            Genre.get_or_create(name=genre_name)[0]
            for genre_name in dict.fromkeys(genres)
        ]
        movie.genres.add(genre_rows, clear_existing=True)

def add_website(url):
    """Insert ``url`` into the ``Website`` table.

    An ``IntegrityError`` from the insert is not propagated; it is reported
    on stdout as a duplicate URL.
    """
    try:
        Website.create(url=url)
    except IntegrityError:
        print('Website with same url already exists.')
        
def get_unvisited_websites():
    """Return a query selecting the ``Website`` rows whose ``is_visited`` is false."""
    # `== False` is intentional: on a peewee field it builds a query
    # expression rather than evaluating to a Python bool.
    not_yet_visited = (Website.is_visited == False)
    return Website.select().where(not_yet_visited)

def mark_website_visited(url):
    """Set ``is_visited`` to True on the ``Website`` row with the given URL.

    Prints ``Website does not exist`` and changes nothing when no row has
    that URL.

    Args:
        url: The page URL as stored in ``Website.url``.
    """
    # A failed lookup raises DoesNotExist, not IntegrityError, so the missing
    # row is detected explicitly instead of through an except clause.
    website = Website.get_or_none(Website.url == url)
    if website is None:
        print('Website does not exist')
        return

    website.is_visited = True
    website.save()

In [20]:
# Register listing pages 1 through 500 in the Website table.
website_url = 'https://www.themoviedb.org/movie?page='

for page_number in range(1, 501):
    add_website('{}{}'.format(website_url, page_number))

Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with

Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with same url already exists.
Website with

In [23]:
# Upper bound on listing pages scraped per run of this cell. It is kept at 1,
# matching the previous behaviour of stopping after the first page; raise it
# to process more of the queue in one run.
MAX_PAGES_PER_RUN = 1

unvisited_websites = get_unvisited_websites()

for index, page in enumerate(unvisited_websites):
    if index >= MAX_PAGES_PER_RUN:
        break

    movies = scrape_movie_website(page.url)

    for title, url in movies:
        add_movie(title, url)

    # Reached only after scraping succeeded, so a page whose scrape raised
    # stays unvisited.
    mark_website_visited(page.url)
    

In [10]:
# Print the stored URL of every movie in the database.
# NOTE(review): this cell previously iterated a `titles` list that is not
# defined at module level in the visible notebook, called `get_url()` on the
# `Movie` class imported from model.movie.Movie (shadowed by the peewee model
# defined in this notebook), and contained a stray `mar` token that raises
# NameError, so it could not run from a fresh kernel.
for movie in Movie.select():
    print(movie.url)

https://www.themoviedb.org/movie/385687
https://www.themoviedb.org/movie/603692
https://www.themoviedb.org/movie/502356
https://www.themoviedb.org/movie/569094
https://www.themoviedb.org/movie/667538
https://www.themoviedb.org/movie/1010581
https://www.themoviedb.org/movie/298618
https://www.themoviedb.org/movie/536437
https://www.themoviedb.org/movie/76600
https://www.themoviedb.org/movie/447277
https://www.themoviedb.org/movie/713704
https://www.themoviedb.org/movie/1074034
https://www.themoviedb.org/movie/447365
https://www.themoviedb.org/movie/890771
https://www.themoviedb.org/movie/882569
https://www.themoviedb.org/movie/840326
https://www.themoviedb.org/movie/640146
https://www.themoviedb.org/movie/758323
https://www.themoviedb.org/movie/697843
https://www.themoviedb.org/movie/594767
