<a href="https://colab.research.google.com/github/carolvieirav/Data-Analytics-Bootcamp/blob/master/blob/master/Ironhack-DTFT-2020-Jun/Projects/Cinetrash_Data_Scraping_and_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries 

In [1]:
import pandas as pd
import numpy as np
import re
import lxml
from bs4 import BeautifulSoup
from requests import get
from time import sleep
from random import randint
from tqdm import tqdm

In [33]:
from rake_nltk import Rake
from collections import Counter
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize

# Web Scraping

### Defining a Python Class to Extract IMDB's Keywords Pages

In [10]:
class IMDB(object):
    """
    
    This class operates like a universal IMDB's Keyword Page Extraction.
    
    """

    def __init__(self, url):
        
        '''
        
        This function initiate IMDB Class and gets the page links and content.
        
        '''
        
        super(IMDB, self).__init__()
        
        self.soups = []
        pages = str(np.arange(1,5))
        
        for page in tqdm(pages):
            
            page = url.replace('page=1', 'page='+(page))
            
            page = get(page)
            
            self.soup = BeautifulSoup(page.content, 'lxml')
            self.soups.append(self.soup)
            
            sleep(1)
            
        
    def articleTitle(self):
        
        '''
        
        This function gets the movie title from the page's articles.
        
        '''
        
        return self.soup.find("h1", class_="header").text.replace("\n","")

    def bodyContent(self, soup):
        
        '''
        
        This function gets the contents of one movie content.
        
        '''
        
        content = soup.find(id="main")
        return content.find_all("div", class_="lister-item mode-detail")

    def movieData(self):
        
        '''
        
        This function gets the title and data of the movie: 
        
        runtime, genre, rating, score, description, casts and directors.
        
        '''
        final = pd.DataFrame()
        
        
        for soup in self.soups:
            Frame = self.bodyContent(soup)
            Title = []
            Date = []
            RunTime = []
            Genre = []
            Description = []
            Director = []
            Stars = []
            Votes = []

            for movie in tqdm(Frame):
                FirstLine = movie.find("h3", class_="lister-item-header")

                # Get the Title
                Title.append(FirstLine.find("a").text)

                #  Get the Date
                Date.append(re.sub(r"[()]","", FirstLine.find_all("span")[-1].text))

                # Get RunTime
                try:
                    RunTime.append(movie.find("span", class_="runtime").text[:-4])
                except:
                    RunTime.append("unknown")

                #Get the Genres
                Genre.append(movie.find("span", class_="genre").text.rstrip().replace("\n","").split(","))

                #Get Description
                Description.append(movie.find("p", class_="").text.replace("\n","").strip())

                # Get movie Director and Stars
                movieCast = movie.find_all("p", class_="text-muted text-small")[1].text.lstrip()

                try:
                    casts = movieCast.replace("\n","").split('|')
                    casts = [x.strip() for x in casts]
                    casts = [casts[i].replace(j, "") for i,j in enumerate(["Director:", "Stars:"])]
                    Director.append(casts[0])
                    Stars.append([x.strip() for x in casts[1].split(",")])
                except:
                    casts = movieCast.replace("\n","").strip()
                    Director.append('unknown')
                    Stars.append([x.strip() for x in casts.split(",")])

                Numbers = movie.find_all("span", attrs={"name": "nv"})

                if len(Numbers) == 2:
                    Votes.append(Numbers[0].text)
                elif len(Numbers) == 1:
                    Votes.append(Numbers[0].text)
                else:
                    Votes.append("unknown")



            # Transforming all information lists in Panda Series and create a Dataframe.

            Frame = pd.Series(Frame)
            Title = pd.Series(Title)
            Date = pd.Series(Date)
            RunTime = pd.Series(RunTime)
            Genre = pd.Series(Genre)
            Description = pd.Series(Description)
            Director = pd.Series(Director)
            Stars = pd.Series(Stars)
            Votes = pd.Series(Votes)

            mdata = pd.DataFrame({'Title':Title, 
                                  'Date':Date, 
                                  'RunTime':RunTime, 
                                  'Genre':Genre, 
                                  'Description':Description, 
                                  'Director':Director, 
                                  'Stars':Stars, 
                                  'Votes':Votes, 
                                  })

            mdata.reset_index(drop=True)

            final = pd.concat([final, mdata])
        
        
        return final

### URLs that we'll be using

In [11]:
zmovie = "https://www.imdb.com/search/keyword/?keywords=z-movie&ref_=kw_ref_key&sort=moviemeter,asc&mode=detail&page=1"
camp = "https://www.imdb.com/search/keyword/?keywords=camp%2Ccampy%2Cb-movie&ref_=kw_ref_key&mode=detail&page=1&sort=moviemeter,asc"
exploitation = "https://www.imdb.com/search/keyword/?keywords=exploitation-film&ref_=kw_ref_key&sort=moviemeter,asc&mode=detail&page=1"
bmovie = "https://www.imdb.com/search/keyword/?keywords=b-movie&ref_=kw_ref_key&mode=detail&page=1&sort=moviemeter,asc"
trash = "https://www.imdb.com/search/keyword/?keywords=trash-movie&ref_=fn_kw_kw_1"
kungfu = "https://www.imdb.com/search/keyword/?keywords=kung-fu%2Cindependent-film&ref_=kw_ref_key&sort=user_rating,desc&mode=detail&page=1"
bwestern = "https://www.imdb.com/search/keyword/?keywords=b-western&ref_=fn_kw_kw_6"
kitsch = "https://www.imdb.com/search/keyword/?keywords=kitsch&ref_=fn_kw_kw_1"

keywords = [zmovie, camp, exploitation, bmovie, trash, kungfu, bwestern, kitsch]

In [12]:
Action = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Action&sort=moviemeter,asc'
Adventure = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Adventure&sort=moviemeter,asc'
Comedy = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_typ&sort=moviemeter,asc&mode=detail&page=1&title_type=movie&genres=Comedy'
Crime = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Crime&sort=moviemeter,asc'
Fantasy = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Fantasy&sort=moviemeter,asc'
Horror = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Horror&sort=moviemeter,asc'
Music = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Music&sort=moviemeter,asc'
Romance = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Romance&sort=moviemeter,asc'
SciFi = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Sci-Fi&sort=moviemeter,asc'
Western = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Western&sort=moviemeter,asc'
Drama = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Drama&sort=moviemeter,asc'
Biography = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Biography&sort=moviemeter,asc'
Thriller = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Thriller&sort=moviemeter,asc'
Sport = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Sport&sort=moviemeter,asc'
Family = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Family&sort=moviemeter,asc'
Mystery = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Mystery&sort=moviemeter,asc'
Animations = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Animation&sort=moviemeter,asc'
Documentary = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Documentary&sort=moviemeter,asc'
History = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=History&sort=moviemeter,asc'
War = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=War&sort=moviemeter,asc'
Musical = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_gnr&mode=detail&page=1&title_type=movie&genres=Musical&sort=moviemeter,asc'

genres = [Action, Adventure, Comedy, Crime, Fantasy, Horror, Music, Romance, SciFi, Western, Drama, Biography, Thriller, Sport, Family, Mystery, Animations, Documentary, History, War, Musical]

In [13]:
# Scraping movie data with IMDB class
imdb = [IMDB(x) for x in keywords]
mdata = [pd.DataFrame(x.movieData()) for x in imdb]

100%|██████████| 9/9 [00:19<00:00,  2.13s/it]
100%|██████████| 9/9 [00:21<00:00,  2.40s/it]
100%|██████████| 9/9 [00:20<00:00,  2.25s/it]
100%|██████████| 9/9 [00:20<00:00,  2.25s/it]
100%|██████████| 9/9 [00:18<00:00,  2.06s/it]
100%|██████████| 9/9 [00:42<00:00,  4.68s/it]
100%|██████████| 9/9 [00:20<00:00,  2.26s/it]
100%|██████████| 9/9 [00:19<00:00,  2.15s/it]
100%|██████████| 50/50 [00:00<00:00, 479.15it/s]
100%|██████████| 50/50 [00:00<00:00, 483.04it/s]
100%|██████████| 50/50 [00:00<00:00, 480.06it/s]
100%|██████████| 50/50 [00:00<00:00, 466.08it/s]
100%|██████████| 50/50 [00:00<00:00, 460.90it/s]
100%|██████████| 50/50 [00:00<00:00, 489.66it/s]
100%|██████████| 50/50 [00:00<00:00, 472.11it/s]
100%|██████████| 50/50 [00:00<00:00, 429.85it/s]
100%|██████████| 50/50 [00:00<00:00, 479.54it/s]
100%|██████████| 50/50 [00:00<00:00, 451.92it/s]
100%|██████████| 50/50 [00:00<00:00, 439.33it/s]
100%|██████████| 50/50 [00:00<00:00, 445.27it/s]
100%|██████████| 50/50 [00:00<00:00, 456.03i

In [17]:
trash_movies = mdata[0].append([mdata[1],mdata[2],mdata[3],mdata[4],mdata[5],mdata[6],mdata[7]])

In [18]:
imdb2 = [IMDB(x) for x in genres]
mdata2 = [pd.DataFrame(x.movieData()) for x in imdb2]

100%|██████████| 9/9 [00:23<00:00,  2.65s/it]
100%|██████████| 9/9 [00:21<00:00,  2.41s/it]
100%|██████████| 9/9 [00:21<00:00,  2.34s/it]
100%|██████████| 9/9 [00:25<00:00,  2.85s/it]
100%|██████████| 9/9 [00:21<00:00,  2.33s/it]
100%|██████████| 9/9 [00:21<00:00,  2.34s/it]
100%|██████████| 9/9 [00:22<00:00,  2.51s/it]
100%|██████████| 9/9 [00:21<00:00,  2.37s/it]
100%|██████████| 9/9 [00:21<00:00,  2.38s/it]
100%|██████████| 9/9 [00:22<00:00,  2.49s/it]
100%|██████████| 9/9 [00:23<00:00,  2.58s/it]
100%|██████████| 9/9 [00:19<00:00,  2.22s/it]
100%|██████████| 9/9 [00:20<00:00,  2.26s/it]
100%|██████████| 9/9 [00:20<00:00,  2.25s/it]
100%|██████████| 9/9 [00:23<00:00,  2.66s/it]
100%|██████████| 9/9 [00:22<00:00,  2.53s/it]
100%|██████████| 9/9 [00:20<00:00,  2.28s/it]
100%|██████████| 9/9 [00:21<00:00,  2.36s/it]
100%|██████████| 9/9 [00:21<00:00,  2.34s/it]
100%|██████████| 9/9 [00:21<00:00,  2.36s/it]
100%|██████████| 9/9 [00:21<00:00,  2.38s/it]
100%|██████████| 50/50 [00:00<00:0

In [20]:
pop_movies = mdata2[0].append([mdata2[1],mdata2[2],mdata2[3],mdata2[4],mdata2[5],mdata2[6],mdata2[7]])

In [21]:
movies_lst = [pop_movies, trash_movies]

In [22]:
pop_movies

Unnamed: 0,Title,Date,RunTime,Genre,Description,Director,Stars,Votes
0,The Old Guard,2020,125,"[Action, Adventure, Fantasy]",A covert team of immortal mercenaries are sudd...,Gina Prince-Bythewood,"[Charlize Theron, KiKi Layne, Matthias Schoena...",74725
1,Greyhound,2020,91,"[Action, Drama, History]","Early in World War II, an inexperienced U.S. N...",Aaron Schneider,"[Tom Hanks, Elisabeth Shue, Stephen Graham, Ma...",33859
2,Rogue,I 2020,unknown,[Action],Megan Fox (Transformers franchise) tackles a t...,M.J. Bassett,"[Megan Fox, Jessica Sutton, Lee-Anne Liebenber...",unknown
3,The New Mutants,2020,98,"[Action, Horror, Sci-Fi]","Five young mutants, just discovering their abi...",Josh Boone,"[Maisie Williams, Anya Taylor-Joy, Charlie Hea...",unknown
4,Tenet,2020,150,"[Action, Sci-Fi, Thriller]",Armed with only one word -- Tenet -- and fight...,Christopher Nolan,"[John David Washington, Robert Pattinson, Eliz...",unknown
...,...,...,...,...,...,...,...,...
45,Downton Abbey,2019,122,"[Drama, Romance]","The continuing story of the Crawley family, we...",Michael Engler,"[Stephen Campbell Moore, Michael Fox, Lesley N...",34764
46,The Princess and the Frog,2009,97,"[Animation, Adventure, Comedy]","A waitress, desperate to fulfill her dreams as...","Directors:Ron Clements, John Musker","[Anika Noni Rose, Keith David, Oprah Winfrey, ...",119848
47,Stardust,2007,127,"[Adventure, Family, Fantasy]",In a countryside town bordering on a magical l...,Matthew Vaughn,"[Charlie Cox, Claire Danes, Sienna Miller, Ian...",249229
48,La La Land,2016,128,"[Comedy, Drama, Music]","While navigating their careers in Los Angeles,...",Damien Chazelle,"[Ryan Gosling, Emma Stone, Rosemarie DeWitt, J...",485107


In [24]:
trash_movies

Unnamed: 0,Title,Date,RunTime,Genre,Description,Director,Stars,Votes
0,Son of the Mask,2005,94,"[Comedy, Family, Fantasy]","Tim Avery, an aspiring cartoonist, finds himse...",Lawrence Guterman,"[Jamie Kennedy, Traylor Howard, Alan Cumming, ...",51481
1,BloodRayne,2005,95,"[Action, Adventure, Fantasy]","In the eighteenth century, a vampire escapes f...",Uwe Boll,"[Kristanna Loken, Ben Kingsley, Michelle Rodri...",33750
2,Las Vampiras,1971,89,"[Drama, Horror]",An erotic horror tale about a vixen vampiress ...,Jesús Franco,"[Soledad Miranda, Dennis Price, Paul Muller, E...",4164
3,Troll 2,1990,95,"[Comedy, Fantasy, Horror]",A vacationing family discovers that the entire...,Claudio Fragasso,"[Michael Paul Stephenson, George Hardy, Margo ...",30231
4,In the Name of the King: A Dungeon Siege Tale,2007,127,"[Action, Adventure, Fantasy]",A man named Farmer sets out to rescue his kidn...,Uwe Boll,"[Jason Statham, Ron Perlman, Ray Liotta, Leele...",43800
...,...,...,...,...,...,...,...,...
45,Ein Sommer in Amalfi,2013 TV Movie,89,"[Comedy, Romance]","German travel guide researcher Kai, a gay play...",Jorgo Papavassiliou,"[Ann-Kathrin Kramer, Carlos Leal, Steffen Grot...",43
46,Drei Mädels vom Rhein,1955,90,"[Comedy, Romance]",Add a Plot,Georg Jacoby,"[Gardy Granass, Margit Saad, Fita Benkhoff, To...",23
47,Circus Girl,1954,103,"[Adventure, Romance]",Add a Plot,Veit Harlan,"[Kristina Söderbaum, Willy Birgel, Adrian Hove...",30
48,Unser Doktor ist der Beste,1969,85,"[Comedy, Romance]","Dr Sommer is newly appointed to the hospital, ...",Harald Vock,"[Roy Black, Helga Anders, Peter Weck, Christia...",81


# Data Cleaning

In [25]:
def drop(x):
    
    '''
    
    This function recevies a Data Frame generated by IMDB extracting class;
    shows the initial shape;
    drop all NaN values and duplicates filtered by both title and date collumns.
    
    '''
    
    print(f'Initial Shape: {x.shape}')
    print(f'Number of rolls that contain NaN values: {x.isna().sum()}')
    x = x.dropna()
    x = x.drop_duplicates(subset=['Title','Date'])
    print(f'Final Shape: {x.shape}')
    
    return x

In [26]:
# applying the drop function modifications into Data Frames 

clear = [pd.DataFrame(drop(x)) for x in movies_lst]
pop_movies = clear[0]
trash_movies = clear[1]

Initial Shape: (3600, 8)
Number of rolls that contain NaN values: Title          0
Date           0
RunTime        0
Genre          0
Description    0
Director       0
Stars          0
Votes          0
dtype: int64
Final Shape: (1117, 8)
Initial Shape: (3600, 8)
Number of rolls that contain NaN values: Title          0
Date           0
RunTime        0
Genre          0
Description    0
Director       0
Stars          0
Votes          0
dtype: int64
Final Shape: (987, 8)


#### Using Rake function from NLTK library to extract key words from the Description column

In [34]:
# initializing the new column
pop_movies['Key_words'] = ""

for index, row in pop_movies.iterrows():
    plot = row['Description']
    
    # instantiating Rake
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary whith key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column for the corresponding movie
    row['Key_words'] = list(key_words_dict_scores.keys())
    
pop_movies['Key_words']

0     [unexpected, new, member, discovered, fight, i...
1     [navy, captain, must, lead, early, stalked, na...
2     [lively, squad, ..., see, full, summary, », me...
3     [held, five, young, mutants, fight, secret, fa...
4     [international, espionage, protagonist, journe...
                            ...                        
44    [disguised, couple, weeks, viola, heads, fall,...
46    [newly, cousin, wanted, local, cop, unemployed...
47    [murder, turned, upside, accused, wife, life, ...
48    [vienna, train, europe, unfortunately, night, ...
49    [world, simple, bookshop, owner, changes, famo...
Name: Key_words, Length: 1117, dtype: object

In [35]:
# Applying the same for trash_movies data frame
trash_movies['Key_words'] = ""

for index, row in trash_movies.iterrows():
    plot = row['Description']
    r = Rake()
    r.extract_keywords_from_text(plot)
    key_words_dict_scores = r.get_word_degrees()
    row['Key_words'] = list(key_words_dict_scores.keys())
    
trash_movies['Key_words']

0     [looney, child, raising, finds, loki, infant, ...
1     [vampire, slayers, escapes, kill, freak, show,...
2     [killing, women, vixen, vampiress, seducing, e...
3     [plan, disguised, entire, town, goblins, inhab...
4     [kidnapped, wife, race, warriors, evil, gallia...
                            ...                        
45    [italian, stallions, amalfi, ..., see, full, s...
46                                          [add, plot]
47                                          [add, plot]
48    [gets, romantic, ..., see, full, summary, », n...
49                                          [add, plot]
Name: Key_words, Length: 987, dtype: object

In [36]:
# transforming all columns Title in Index and drop duplicates 
movies = pd.concat([trash_movies, pop_movies])
movies.index = movies.Title
movies.drop(columns=['Title'], inplace=True)

In [37]:
trash_movies.index = trash_movies.Title
trash_movies.drop(columns=['Title'], inplace=True)

In [38]:
pop_movies.index = pop_movies.Title
pop_movies.drop(columns=['Title'], inplace=True)

In [39]:
movies.index.drop_duplicates()

Index(['Son of the Mask', 'BloodRayne', 'Las Vampiras', 'Troll 2',
       'In the Name of the King: A Dungeon Siege Tale', 'Far Cry',
       'Piranha II: The Spawning', 'House of the Dead', 'Alone in the Dark',
       'Postal',
       ...
       'Love Wedding Repeat', 'A Hidden Life', 'The Virgin Suicides',
       'Inherent Vice', 'All the Bright Places', 'She's the Man',
       'One for the Money', 'The Next Three Days', 'Before Sunrise',
       'Notting Hill'],
      dtype='object', name='Title', length=2057)

In [40]:
pop_movies.index.drop_duplicates()

Index(['The Old Guard', 'Greyhound', 'Rogue', 'The New Mutants', 'Tenet',
       'The Gentlemen', 'The Silencing', 'Peninsula', 'Project Power',
       'Midway',
       ...
       'Love Wedding Repeat', 'A Hidden Life', 'The Virgin Suicides',
       'Inherent Vice', 'All the Bright Places', 'She's the Man',
       'One for the Money', 'The Next Three Days', 'Before Sunrise',
       'Notting Hill'],
      dtype='object', name='Title', length=1098)

In [41]:
trash_movies.index.drop_duplicates()

Index(['Son of the Mask', 'BloodRayne', 'Las Vampiras', 'Troll 2',
       'In the Name of the King: A Dungeon Siege Tale', 'Far Cry',
       'Piranha II: The Spawning', 'House of the Dead', 'Alone in the Dark',
       'Postal',
       ...
       '881', 'Im Prater blüh'n wieder die Bäume',
       'Ela Que Mora no Andar de Cima', 'Der Edelweißkönig',
       'Schwarzwaldmädel', 'Ein Sommer in Amalfi', 'Drei Mädels vom Rhein',
       'Circus Girl', 'Unser Doktor ist der Beste', 'Mariandls Heimkehr'],
      dtype='object', name='Title', length=984)

# Datasets to CSV

In [42]:
trash_movies.to_csv('trash.csv')

In [43]:
pop_movies.to_csv('pop.csv')

In [44]:
movies.to_csv('movies.csv')