## New IMDb Keyword Search
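Input a list of keywords into a scraper to get a dataset of movies on IMDb with any of those keywords. Run the cells under "Movie Scraper Class" (further down) first, so that `MovieScraper`, `pd` and `ia` are defined before the cells below are executed.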

To run a smaller retrieval (max 50 items per keyword), set `is_test` to True. 

For an even smaller test (17 items) I like to run "scary-clowns" as the keyword.

> _**NOTE:** If you get an `IMDbDataAccessError`, you've hit the rate limit and IMDb has blocked you from retrieving more data._

In [None]:
# ADD KEYWORDS
# Replace the empty placeholder with the IMDb keywords to search for.
my_keywords = [""]
# is_test=True limits the search to the first results page of each keyword.
scraper = MovieScraper(keywords=my_keywords, is_test=True)

In [None]:
# GET CSV
# Base name of the output file; export_csv appends '.csv' itself.
filename = "example-file"
scraper.export_csv(filename)

In [None]:
# READ THE CSV
# Load the exported file back in; the bare `df` on the last line displays it.
df = pd.read_csv(f"{filename}.csv")
df

### Movie Scraper Class

In [None]:
# Install Cinemagoer directly from its GitHub repository.
%pip install git+https://github.com/cinemagoer/cinemagoer
# Upgrade pip itself.
%pip install --upgrade pip

In [1]:
from IPython.display import clear_output
from imdb import Cinemagoer
import pandas as pd
import time
import requests

# Shared Cinemagoer client; MovieScraper's methods use it for every IMDb request.
ia = Cinemagoer()

In [2]:
class MovieScraper:
    """Collect IMDb titles tagged with given keywords and keep the movies.

    On construction the scraper looks up the IDs of every title tagged with
    any of the keywords, then downloads the details of each title and stores
    the ones whose kind is "movie" in ``movies_df``. All IMDb requests go
    through the module-level Cinemagoer client ``ia``.

    Attributes:
        ids: De-duplicated IMDb title IDs found for the keywords.
        movies_df: pandas DataFrame with one row per retrieved movie.
    """

    # Column order of ``movies_df``. Passing it explicitly keeps the header
    # present even when no movie was retrieved, so the exported CSV can still
    # be read back.
    _COLUMNS = (
        'imdbID',
        'title',
        'production companies',
        'director',
        'producer',
        'genres',
        'keywords',
        'rating',
        'votes',
    )

    def __init__(self, keywords, is_test):
        """Run the keyword search and build the movies DataFrame.

        Args:
            keywords: Iterable of IMDb keyword strings.
            is_test: When truthy, only the first results page of each
                keyword is fetched; otherwise every page is fetched.
        """
        if is_test:
            self.ids = self.get_test_ids(keywords)
        else:
            self.ids = self.get_full_ids(keywords)
        self.movies_df = self.create_dataframe()

    def export_csv(self, filename):
        """Write ``movies_df`` to ``<filename>.csv`` without the index column.

        Args:
            filename: Target path without the ``.csv`` extension.
        """
        self.movies_df.to_csv(filename + '.csv', index=False)

    def get_full_ids(self, keywords):
        """Return the unique title IDs found on every results page.

        Pages are requested one after another for each keyword until a page
        comes back empty.

        Args:
            keywords: Iterable of IMDb keyword strings.

        Returns:
            List of title IDs without duplicates, in first-seen order.
        """
        search_id = []
        for key in keywords:
            page = 0
            while True:
                page += 1
                results = ia.get_keyword(key, page=page)
                if not results:
                    # An empty page marks the end of the results.
                    break
                search_id.extend(title.movieID for title in results)
                print("Retrieved page", page)

            # The last requested page was the empty one, hence page - 1.
            # The ID count is the running total across all keywords so far.
            print(key, ": Collected", len(search_id), "IDs over", page - 1, "pages")
            print(search_id)
        return self._dedupe_ids(search_id)

    def get_test_ids(self, keywords):
        """Return the unique title IDs found on the first results page only.

        Args:
            keywords: Iterable of IMDb keyword strings.

        Returns:
            List of title IDs without duplicates, in first-seen order.
        """
        search_id = []
        for key in keywords:
            results = ia.get_keyword(key, page=1)
            search_id.extend(title.movieID for title in results)
            # The ID count is the running total across all keywords so far.
            print(key, ": Collected", len(search_id), "IDs")
        return self._dedupe_ids(search_id)

    @staticmethod
    def _dedupe_ids(search_id):
        """Drop repeated IDs (keywords can overlap) and report the totals.

        ``dict.fromkeys`` keeps the first occurrence of each ID in its
        original position, so the result order is reproducible.
        """
        unique_ids = list(dict.fromkeys(search_id))
        print("Total:", len(unique_ids), "IDs collected - Removed",
              len(search_id) - len(unique_ids), "duplicates")
        return unique_ids

    def create_dataframe(self):
        """Fetch the details of every ID in ``self.ids`` into a DataFrame.

        Titles that are not movies are left out and counted as removed.
        A title whose retrieval raises is reported, counted as an error and
        skipped, so one bad record does not abort the whole run.

        Returns:
            pandas DataFrame with the columns listed in ``_COLUMNS``.
        """
        imdb_data = []
        error_counter = 0
        for movie_id in self.ids:
            try:
                movie_info = self.get_movie_info(movie_id)
            except Exception as err:
                # Best effort per title: report the failure without issuing
                # another request that could itself fail.
                error_counter += 1
                print('Error occurred:', movie_id, '-', repr(err))
                continue
            if movie_info is not None:
                imdb_data.append(movie_info)

        removed = len(self.ids) - len(imdb_data) - error_counter
        print("Completed with", error_counter, "Errors and", removed, "Removed Value(s)")
        return pd.DataFrame(imdb_data, columns=list(self._COLUMNS))

    def get_movie_info(self, id):
        """Return the data row for one title, or None if it is not a movie.

        Args:
            id: IMDb title ID. The name shadows the builtin but is kept so
                existing keyword-argument callers keep working.

        Returns:
            Dict keyed by the names in ``_COLUMNS``, or ``None`` when the
            title's kind is not "movie".
        """
        res = ia.get_movie(id)
        if res['kind'] != "movie":
            return None

        # Keywords need a separate request; only spend it on actual movies.
        ia.update(res, 'keywords')
        return {
            'imdbID': id,
            'title': res.get('title'),
            'production companies': self.normalize_objects(res.get('production companies')),
            'director': self.normalize_objects(res.get('director')),
            'producer': self.normalize_objects(res.get('producer')),
            'genres': res.get('genres'),
            'keywords': res.get('keywords'),
            'rating': res.get('rating'),
            'votes': res.get('votes'),
        }

    def normalize_objects(self, objects):
        """Convert Person or Company objects into ``{'id', 'name'}`` dicts.

        Args:
            objects: List of Person or Company objects, or ``None`` when the
                title has no such entry.

        Returns:
            List of dicts with the object's ID and name; empty when
            ``objects`` is ``None`` or empty.
        """
        if not objects:
            return []

        normalized = []
        for item in objects:
            # Entries carrying ``personID`` are people; the rest are treated
            # as companies and must provide ``companyID``.
            if hasattr(item, 'personID'):
                object_id = item.personID
            else:
                object_id = item.companyID
            normalized.append({
                'id': object_id,
                'name': item.data['name'],
            })
        return normalized