In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import networkx as nx
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import LatentDirichletAllocation as LDA
import random
import os
import scrapy
from scrapy.crawler import CrawlerProcess
%matplotlib inline

comedy = pd.read_json('CMovies.json', orient='records')
horror = pd.read_json('HorMovies.json', orient='records')
action = pd.read_json('ActMovies.json', orient='records')
crime = pd.read_json('CriMovies.json', orient='records')
drama = pd.read_json('DraMovies.json', orient='records')
family = pd.read_json('FamMovies.json', orient='records')
history = pd.read_json('HisMovies.json', orient='records')
romance = pd.read_json('RomMovies.json', orient='records')
scifi = pd.read_json('SFMovies.json', orient='records')
thriller = pd.read_json('ThrMovies.json', orient='records')
western = pd.read_json('WesMovies.json', orient='records')

# Importing movie titles
comedy_title = pd.read_json('Comedy.json', orient='records')
horror_title = pd.read_json('Horror.json', orient='records')
action_title = pd.read_json('Action.json', orient='records')
crime_title = pd.read_json('Crime.json', orient='records')
drama_title = pd.read_json('Drama.json', orient='records')
family_title = pd.read_json('Family.json', orient='records')
history_title = pd.read_json('History.json', orient='records')
romance_title = pd.read_json('Romance.json', orient='records')
scifi_title = pd.read_json('SciFi.json', orient='records')
thriller_title = pd.read_json('Thriller.json', orient='records')
western_title = pd.read_json('Western.json', orient='records')

In [2]:
def clean_data(lst):
    for item in lst:
        # Clean up year information
        item.Year = item.Year.apply(lambda x: str(x).replace('–','') )
        item.Year = item.Year.apply(lambda x: int(x))
        
        #Get rid of N/A in Plot and rating
        item.Plot = item.Plot.apply(lambda x: str(x).replace('N/A', ''))
        item.Rating = item.Rating.apply(lambda x: str(x).replace('N/A', '0'))
        item.Rating = item.Rating.apply(lambda x: float(x))
        
    return lst

genre_data = [comedy,horror,action,crime,drama,family,history,romance,scifi,thriller,western]
genre_data = clean_data(genre_data)

In [3]:
# We will use plot, rating, and year for all of our classifiers
vectorizer = TfidfVectorizer(lowercase=False, 
                          stop_words=None,
                          ngram_range=(1, 1), 
                      
                          max_df=.5, 
                          min_df=1,
                          max_features=None, 
                          vocabulary=None, 
                          binary=False)

comPlot = vectorizer.fit_transform(comedy.Plot)
com_terms = vectorizer.get_feature_names()
horPlot = vectorizer.fit_transform(horror.Plot)
hor_terms = vectorizer.get_feature_names()
famPlot = vectorizer.fit_transform(family.Plot)
fam_terms = vectorizer.get_feature_names()
actPlot = vectorizer.fit_transform(action.Plot)
act_terms = vectorizer.get_feature_names()
criPlot = vectorizer.fit_transform(crime.Plot)
cri_terms = vectorizer.get_feature_names()
draPlot = vectorizer.fit_transform(drama.Plot)
dra_terms = vectorizer.get_feature_names()
hisPlot = vectorizer.fit_transform(history.Plot)
his_terms = vectorizer.get_feature_names()
romPlot = vectorizer.fit_transform(romance.Plot)
rom_terms = vectorizer.get_feature_names()
sfPlot = vectorizer.fit_transform(scifi.Plot)
sf_terms = vectorizer.get_feature_names()
thrPlot = vectorizer.fit_transform(thriller.Plot)
thr_terms = vectorizer.get_feature_names()
wesPlot = vectorizer.fit_transform(western.Plot)
wes_terms = vectorizer.get_feature_names()

### Comedy Classifier

In [4]:
# Create data for whether the movie falls under that genre (even as subgenre)
def IsGenre(data,genre):
    isGenre = []
    for item in data.Genre:
        item = item.replace(',','')
        check = item.split(' ')
        if genre in check:
            isGenre.append(1)
        else:
            isGenre.append(0)
    return isGenre

# Process the plot data and create new dataframe with the processed data
def create_data(vect, data, genre):
    tfidf = pd.DataFrame(vect.todense()).sum(axis=1)
    
    isGenre = pd.DataFrame(IsGenre(data, genre))
    
    # Combine all created data to final dataframe
    new = pd.concat([tfidf, isGenre, data.loc[:,['Rating','Year']]], axis=1)
    new.columns = ['tfidf','isGenre','Rating','Year']
    return new

# Creating all of the data sets for comedy training
def create_training(gen):
    comedy_pro = create_data(comPlot, comedy, gen)
    horror_pro = create_data(horPlot, horror, gen)
    family_pro = create_data(famPlot, family, gen)
    action_pro = create_data(actPlot, action, gen)
    crime_pro = create_data(criPlot, crime, gen)
    drama_pro = create_data(draPlot, drama, gen)
    history_pro = create_data(hisPlot, history, gen)
    romance_pro = create_data(romPlot, romance, gen)
    scifi_pro = create_data(sfPlot, scifi, gen)
    thriller_pro = create_data(thrPlot, thriller, gen)
    western_pro = create_data(wesPlot, western, gen)
    return [comedy_pro, horror_pro, family_pro, action_pro,crime_pro,drama_pro,history_pro,romance_pro,\
           scifi_pro,thriller_pro,western_pro]

#Creating the master dataset to use for training
com_data = create_training('Comedy')
x_com = pd.concat(com_data[:4])

#Training the comedy random forest classifier
rfc_com = RandomForestClassifier(max_depth=10)

def fit_data(lst, data):
    for item in lst:
        y_train = data.isGenre
        x_train = data.drop(columns=['isGenre'])
        item.fit(x_train, y_train)

fit_data([rfc_com], x_com)

#Testing on a couple genres that were not in training

def test_data(rfc_10, ind, data):
    names = ['comedy','horror','family','action','crime','drama',\
             'history','romance','scifi','thriller','western']
    gen = names[ind]
    
    x_test = data[ind].drop(columns=['isGenre'])
    y_test = data[ind].isGenre
    print('The score for {} is {}.'.format(gen,rfc_10.score(x_test,y_test)))

# Drama
test_data(rfc_com, 5, com_data)

#Crime
test_data(rfc_com, 4, com_data)

The score for drama is 0.8387096774193549.
The score for crime is 0.88.


### Horror Classifier

In [5]:
#Creating the master dataset to use for training
hor_data = create_training('Horror')
x_hor = pd.concat(hor_data[:4])

#Training the comedy random forest classifier
rfc_hor = RandomForestClassifier(max_depth=10)

#Fitting Data
fit_data([rfc_hor], x_hor)

#Testing on a couple genres that were not in training
test_data(rfc_hor, 4, hor_data)
test_data(rfc_hor, 5, hor_data)

The score for crime is 0.94.
The score for drama is 0.967741935483871.


### Family Classifier

In [6]:
#Creating the master dataset to use for training
fam_data = create_training('Family')
x_fam = pd.concat(fam_data[0:5])

#Training the comedy random forest classifier
rfc_fam = RandomForestClassifier(max_depth=10)
    
fit_data([rfc_fam], x_fam)

#Testing on a couple genres that were not in training
test_data(rfc_fam, 5, fam_data)
test_data(rfc_fam, 8, fam_data)

The score for drama is 1.0.
The score for scifi is 0.94.


### Action Classifier

In [7]:
#Creating the master dataset to use for training
act_data = create_training('Action')
x_act = pd.concat(act_data[1:])

#Training the comedy random forest classifier
rfc_act = RandomForestClassifier(max_depth=10)
    
fit_data([rfc_act], x_act)

#Testing on a couple genres that were not in training
test_data(rfc_act, 0, act_data)

The score for comedy is 0.94.


### Crime Classifier

In [8]:
#Creating the master dataset to use for training
cri_data = create_training('Crime')
x_cri = pd.concat(cri_data[2:6])

#Training the comedy random forest classifier
rfc_cri = RandomForestClassifier(max_depth=10)
    
fit_data([rfc_cri], x_cri)

#Testing on a couple genres that were not in training
test_data(rfc_cri, 0, cri_data)
test_data(rfc_cri, 8, cri_data)

The score for comedy is 0.78.
The score for scifi is 0.74.


### Drama Classifier 

In [9]:
#Creating the master dataset to use for training
dra_data = create_training('Drama')
x_dra = pd.concat(dra_data[2:])

#Training the comedy random forest classifier
rfc_dra = RandomForestClassifier(max_depth=10)
    
fit_data([rfc_dra], x_dra)

#Testing on a couple genres that were not in training
test_data(rfc_dra, 0, dra_data)
test_data(rfc_dra, 1, dra_data)

The score for comedy is 0.76.
The score for horror is 0.7254901960784313.


### History Classifier

In [10]:
#Creating the master dataset to use for training
his_data = create_training('History')
x_his = pd.concat(his_data[0:7])

#Training the comedy random forest classifier
rfc_his = RandomForestClassifier(max_depth=10)
    
fit_data([rfc_his], x_his)

#Testing on a couple genres that were not in training
test_data(rfc_his, 7, his_data)
test_data(rfc_his, 8, his_data)

The score for romance is 0.96.
The score for scifi is 1.0.


### Romance Classifier

In [11]:
#Creating the master dataset to use for training
rom_data = create_training('Romance')
x_rom = pd.concat(cri_data[2:8])

#Training the comedy random forest classifier
rfc_rom = RandomForestClassifier(max_depth=10)
    
fit_data([rfc_rom], x_rom)

#Testing on a couple genres that were not in training
test_data(rfc_rom, 0, rom_data)
test_data(rfc_rom, 8, rom_data)

The score for comedy is 0.82.
The score for scifi is 0.94.


### Sciene Fiction Classifier

In [12]:
#Creating the master dataset to use for training
sf_data = create_training('Sci-Fi')
x_sf = pd.concat(sf_data[2:9])

#Training the comedy random forest classifier
rfc_sf = RandomForestClassifier(max_depth=10)
    
fit_data([rfc_sf], x_sf)

#Testing on a couple genres that were not in training
test_data(rfc_sf, 0, sf_data)
test_data(rfc_sf, 1, sf_data)

The score for comedy is 0.9.
The score for horror is 0.8431372549019608.


### Thriller Classifier

In [13]:
#Creating the master dataset to use for training
thr_data = create_training('Thriller')
x_thr = pd.concat(thr_data[5:])

#Training the comedy random forest classifier
rfc_thr = RandomForestClassifier(max_depth=10)
    
fit_data([rfc_thr], x_thr)

#Testing on a couple genres that were not in training
test_data(rfc_thr, 0, thr_data)
test_data(rfc_thr, 2, thr_data)

The score for comedy is 1.0.
The score for family is 1.0.


### Western Classifier

In [14]:
#Creating the master dataset to use for training
wes_data = create_training('Western')
x_wes = pd.concat(wes_data[5:])

#Training the comedy random forest classifier
rfc_wes = RandomForestClassifier(max_depth=10)
    
fit_data([rfc_wes], x_wes)

#Testing on a couple genres that were not in training
test_data(rfc_wes, 0, wes_data)
test_data(rfc_wes, 1, wes_data)

The score for comedy is 0.98.
The score for horror is 0.9411764705882353.


In [16]:
def get_summary(data_plot_sent):
    parser = spacy.load('en')

    # Parsing Gatsby.
    plot_doc = parser(data_plot_sent)

# Dividing the text into sentences and storing them as a list of strings.
    sentences=[]
    for span in plot_doc.sents:
        # go from the start to the end of each span, returning each token in the sentence
        # combine each token using join()
        sent = ''.join(plot_doc[i].string for i in range(span.start, span.end)).strip()
        sentences.append(sent)
    
    vect_word = TfidfVectorizer(lowercase=False, 
                              stop_words='english',
                              ngram_range=(1, 1), 
                              analyzer=u'word', 
                              min_df=1,
                              max_features=None, 
                              vocabulary=None, 
                              binary=False)

    tfidf = vect_word.fit_transform(sentences)
    # Calculating similarity
    similarity = tfidf * tfidf.T

    # Identifying the sentence with the highest rank.
    nx_graph = nx.from_scipy_sparse_matrix(similarity)
    ranks=nx.pagerank(nx_graph, alpha=.85, tol=.00000001)

    ranked = sorted(((ranks[i],s) for i,s in enumerate(sentences)),
                    reverse=True)


    return ranked[0][1]

Most classifiers are performing very well. We will move on to building the recommender that will classify an input. Likely will include a topic comparison to verify the classifier's accruacy.

In [17]:
def create_fav_data(fav_info):
    vect = TfidfVectorizer(lowercase=False, 
                          stop_words=None,
                          ngram_range=(1, 1), 
                       
                          min_df=1,
                          max_features=None, 
                          vocabulary=None, 
                          binary=False)
    fav_plot = vect.fit_transform(fav_info.Plot)
    info = create_data(fav_plot,fav_info, 'Comedy')
    return info.drop(columns=['isGenre'])

def get_movie_info(titles):
    importlist = titles
    
    #Define crawler and processer
    class ImportSpider(scrapy.Spider):
        name = "ImportedMovie"
        start_urls=[]

        # Initiating Start URLs
        for i in range(len(importlist)):
            item = str(importlist[i])
            start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

        # Identifying the information we want from the query response and extracting it using xpath.
        def parse(self, response):
            for item in response.xpath('//movie'):
                # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
                yield {
                        'Title': item.xpath('@title').extract_first(),
                    'Year': item.xpath('@year').extract_first(),
                    'Genre': item.xpath('@genre').extract_first(),
                    'Rating': item.xpath('@imdbRating').extract_first(),
                    'Plot': item.xpath('@plot').extract_first()

                }
            
    process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'ImportedData.json',
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': False   
    })
    
    #Delete file if it already exists
    if os.path.exists('ImportedData.json'):
        os.remove('ImportedData.json')
    
    
    process.crawl(ImportSpider)
    process.start()
    
    info = pd.read_json('ImportedData.json')
    data = create_fav_data(info)
    return data

In [18]:


classifiers = [("Comedy", rfc_com), ("Horror", rfc_hor),("Family", rfc_fam),("Action",rfc_act),\
              ("Crime", rfc_cri),("Drama",rfc_dra),("History",rfc_his),("Romance",rfc_rom),\
              ("Science Fiction",rfc_sf),("Thriller",rfc_thr),("Western",rfc_wes)]

titles = [("Comedy", comedy_title), ("Horror", horror_title),("Family", family_title),("Action",action_title),\
              ("Crime", crime_title),("Drama",drama_title),("History",history_title),("Romance",romance_title),\
              ("Science Fiction",scifi_title),("Thriller",thriller_title),("Western",western_title)]

datas = [("Comedy", comedy), ("Horror", horror),("Family", family),("Action",action),\
              ("Crime", crime),("Drama",drama),("History",history),("Romance",romance),\
              ("Science Fiction",scifi),("Thriller",thriller),("Western",western)]

def get_profile(cfrs, test_input):
    profile = []
    max_input=0
    for item in cfrs:
        genre = item[0]
        cf = item[1]
        prediction = cf.predict(test_input)
        if prediction.sum() > max_input:
            max_input = prediction.sum()
    for item in cfrs:
        genre = item[0]
        cf = item[1]
        prediction = cf.predict(test_input)
        if prediction.sum() == max_input:
            profile.append(genre)

    return profile

def get_recommendations(cfrs, ttl):
    
    movie_list = []
    
    movie_list.append(input("What is the first movie you like?"))
    movie_list.append(input("What is the second movie you like?"))
    movie_list.append(input("What is the third movie you like?"))
    
    test_input = x_com.iloc[:4,:].drop(columns=['isGenre'])
    actual_input = get_movie_info(movie_list)
    
    fav_info = actual_input
    
    profile = get_profile(cfrs, fav_info)
    
    # Random pick from liked genres
    ind = random.randint(0,len(profile)-1)
    genre = profile[ind]
    
    
    # Getting title list
    for item in ttl:
        if item[0] == genre:
            ttls = item[1]
            break
    
    # Random pick in appropriate genre
    ind2 = random.randint(0,len(ttls)-1)
    pick = ttls.iloc[ind2,0]
    
    for g in datas:
        if g[0] == genre:
            plot_data = g[1]
            
    plot_sum = get_summary(plot_data.iloc[ind2,:].Plot)

    return pick, plot_sum

pick, plot_summ = get_recommendations(classifiers,titles)

print("\n You should watch {}. \n You will like it because: \n {}".format(pick, plot_summ))


What is the first movie you like?Batman
What is the second movie you like?Lego Movie
What is the third movie you like?The Avengers

 You should watch None. 
 Patrick Bateman is handsome, well educated and intelligent.


# Appendix:
### A. Gathering information.
Collecting popular titles by genre from Ranker.com via webscraper and collecting the movie data (plot, ranking, and year) from IMDB using their open api OMDB.

In [19]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class ComedySpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "Comedy"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/crowdranked-list/100-all-time-greatest-comedy-films/',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'Comedy.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(ComedySpider)
# process.start()
# print('Comedy Extracted!')

In [20]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class HorrorSpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "Horror"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/crowdranked-list/the-greatest-horror-films-ever-made/',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'Horror.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(HorrorSpider)
# process.start()
# print('Horror Extracted!')

In [21]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class ActionSpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "Action"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/crowdranked-list/best-action-movies',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'Action.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(ActionSpider)
# process.start()
# print('Action Extracted!')

In [22]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class CrimeSpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "Crime"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/list/all-crime-movies-or-list-of-crime-movies/all-genre-movies-lists',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'Crime.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(CrimeSpider)
# process.start()
# print('Crime Extracted!')

In [23]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class DramaSpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "Drama"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/list/best-intelligent-dramas/ranker-film?ref=search',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'Drama.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(DramaSpider)
# process.start()
# print('Drama Extracted!')

In [24]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class FamilySpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "Family"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/list/best-pg-family-film-movies/reference',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'Family.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(FamilySpider)
# process.start()
# print('Family Extracted!')

In [25]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class HistorySpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "History"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/list/best-pg-13-history-movies/reference',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'History.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(HistorySpider)
# process.start()
# print('History Extracted!')

In [26]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class RomanceSpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "Romance"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/list/romance-film-movies-and-films/reference',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'Romance.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(RomanceSpider)
# process.start()
# print('Romance Extracted!')

In [27]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class SciFiSpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "SciFi"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/list/classic-science-fiction-movies/ranker-film',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'SciFi.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(SciFiSpider)
# process.start()
# print('Sci-Fi Extracted!')

In [28]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class ThrillerSpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "Thriller"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/list/best-pg-thriller-movies/reference',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'Thriller.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(ThrillerSpider)
# process.start()
# print('Thriller Extracted!')

In [29]:
# #Running scrapers to collect movie titles by genre
# # Importing in each cell because of the kernel restarts.
# import scrapy
# import re
# from scrapy.crawler import CrawlerProcess

# class WesternSpider(scrapy.Spider):
#     # Naming the spider is important if you are running more than one spider of
#     # this class simultaneously.
#     name = "Western"
    
#     # URL(s) to start with.
#     start_urls = [
#         'https://www.ranker.com/crowdranked-list/the-best-western-movies-ever-made',
#     ]

#     # Use XPath to parse the response we get.
#     def parse(self, response):
        
#         # Iterate over every <article> element on the page.
#         for article in response.xpath('//h2'):
            
#             # Yield a dictionary with the values we want.
#             yield {
#                 'name': article.xpath('div[@class="listItem__data"]/a/text()').extract_first()
#             }

# # Tell the script how to run the crawler by passing in settings.
# # The new settings have to do with scraping etiquette.          
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',         # Store data in JSON format.
#     'FEED_URI': 'Western.json',       # Name our storage file.
#     'LOG_ENABLED': False,          # Turn off logging for now.
#     'ROBOTSTXT_OBEY': True,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True
    
# })

# # Start the crawler with our spider.
# process.crawl(WesternSpider)
# process.start()
# print('Western Extracted!')

In [30]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class CMSpider(scrapy.Spider):
#     name = "Movie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(comedy_title)):
#         item = str(comedy_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'CMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False,
#     # We use CLOSESPIDER_PAGECOUNT to limit our scraper to the first 100 links.    
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(CMSpider)
# process.start()
# print('Comedy Data Extracted!')

In [31]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class HorMSpider(scrapy.Spider):
#     name = "HorrorMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(horror_title)):
#         item = str(horror_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'HorMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False   
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(HorMSpider)
# process.start()
# print('Horror Data Extracted!')

In [32]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class ActMSpider(scrapy.Spider):
#     name = "ActionMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(action_title)):
#         item = str(action_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'ActMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False  
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(ActMSpider)
# process.start()
# print('Action Data Extracted!')

In [33]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class CriMSpider(scrapy.Spider):
#     name = "CrimeMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(crime_title)):
#         item = str(crime_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'CriMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False   
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(CriMSpider)
# process.start()
# print('Crime Data Extracted!')

In [34]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class DraMSpider(scrapy.Spider):
#     name = "DramaMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(drama_title)):
#         item = str(drama_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'DraMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False    
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(DraMSpider)
# process.start()
# print('Drama Data Extracted!')

In [35]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class FamMSpider(scrapy.Spider):
#     name = "FamilyMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(family_title)):
#         item = str(family_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'FamMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False   
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(FamMSpider)
# process.start()
# print('Family Data Extracted!')

In [36]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class HisMSpider(scrapy.Spider):
#     name = "HistoryMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(history_title)):
#         item = str(history_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'HisMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False 
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(HisMSpider)
# process.start()
# print('History Data Extracted!')

In [37]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class RomMSpider(scrapy.Spider):
#     name = "RomanceMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(romance_title)):
#         item = str(romance_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'RomMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False    
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(RomMSpider)
# process.start()
# print('Romance Data Extracted!')

In [38]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class SFMSpider(scrapy.Spider):
#     name = "SciFiMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(scifi_title)):
#         item = str(scifi_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'SFMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False   
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(SFMSpider)
# process.start()
# print('Sci-Fi Data Extracted!')

In [39]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class ThrMSpider(scrapy.Spider):
#     name = "ThrillerMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(thriller_title)):
#         item = str(thriller_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'ThrMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(ThrMSpider)
# process.start()
# print('Thriller Data Extracted!')

In [40]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class WesMSpider(scrapy.Spider):
#     name = "WesternMovie"
#     start_urls=[]
    
#     # Initiating Start URLs
#     for i in range(len(western_title)):
#         item = str(western_title.name.loc[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'WesMovies.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False, 
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(WesMSpider)
# process.start()
# print('Western Data Extracted!')

In [41]:
# # Scraping Comedy Data
# import scrapy
# from scrapy.crawler import CrawlerProcess


# class InputSpider(scrapy.Spider):
#     name = "MovieInput"
#     start_urls=[]
    
#     input_titles = ['Batman', 'Lego Movie','Avengers']
    
#     # Initiating Start URLs
#     for i in range(len(input_titles)):
#         item = str(input_titles[i])
#         start_urls.append('http://www.omdbapi.com/?apikey=236b0bfb&r=xml&plot=full&t={}'.format(item))

#     # Identifying the information we want from the query response and extracting it using xpath.
#     def parse(self, response):
#         for item in response.xpath('//movie'):
#             # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
#             yield {
#                     'Title': item.xpath('@title').extract_first(),
#                 'Year': item.xpath('@year').extract_first(),
#                 'Genre': item.xpath('@genre').extract_first(),
#                 'Rating': item.xpath('@imdbRating').extract_first(),
#                 'Plot': item.xpath('@plot').extract_first()
                
#                     }
                 
    
# process = CrawlerProcess({
#     'FEED_FORMAT': 'json',
#     'FEED_URI': 'MovieInput.json',
#     # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
#     'ROBOTSTXT_OBEY': False,
#     'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
#     'AUTOTHROTTLE_ENABLED': True,
#     'HTTPCACHE_ENABLED': True,
#     'LOG_ENABLED': False, 
# })
                                         

# # Starting the crawler with our spider.
# process.crawl(InputSpider)
# process.start()
# print('Movie Input Extracted!')