In [1]:
#This file uses the OMDB API to collect ratings and other information about the film-list previously compiled from Wikipedia.
#It also cleans these data and uses them to generate new features appropriate for statistical analysis.

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
#Import the film list from Wikipedia. To ensure successful use of the API, I updated the 'all_movies.csv' file for a number of 
#films where the year or title differed between the listing on Wikipedia and the movie's OMDB entry. This file is saved as 'updated_movies.csv.'

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly rather than printed (printing it only adds a stray "None" line
# and the bare print statement is Python-2-only syntax).
updated_movies = pd.read_csv('data_files/updated_movies.csv', index_col=0)
updated_movies.info()
updated_movies.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1066 entries, 0 to 1065
Data columns (total 3 columns):
title       1066 non-null object
year        1066 non-null int64
director    1066 non-null object
dtypes: int64(1), object(2)
memory usage: 33.3+ KB
None


Unnamed: 0,title,year,director
0,12,2007,Nikita Mikhalkov
1,13,2010,Géla Babluani
2,13 Assassins,2010,Takashi Miike
3,The 13th Letter,1951,Otto Preminger
4,101 Dalmatians,1996,Stephen Herek


In [3]:
#Get the data

import json
def get_omdb(df):
    """Collect one OMDB API record per row of ``df``.

    For every row, the film's 'title' and 'year' are sent to the OMDB
    title-lookup endpoint (with the Rotten Tomatoes fields requested) and the
    decoded JSON response is stored.

    Parameters
    ----------
    df : DataFrame with 'title' and 'year' columns.

    Returns
    -------
    list of dict
        Exactly one dict per row of ``df``, in row order.  A request that does
        not come back with HTTP 200 is recorded as
        ``{'Response': 'False', 'Error': 'HTTP <status code>'}`` instead of
        being skipped, so that position ``i`` in the result always refers to
        row ``i`` of ``df`` (later cells patch and drop entries by position).
    """
    response_dict = []
    for title, year in zip(df['title'], df['year'].values):
        query = {
            # Collapse runs of whitespace; requests then percent-encodes the
            # title (spaces become '+', and characters such as '&' or
            # apostrophes can no longer corrupt the query string).
            't': ' '.join(title.split()),
            'y': int(year),
            'tomatoes': 'True',
        }
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get("http://www.omdbapi.com/", params=query, timeout=30)
        if r.status_code == 200:
            response_dict.append(json.loads(r.text))
        else:
            # Keep a placeholder so the list stays aligned with df's rows.
            response_dict.append({'Response': 'False',
                                  'Error': 'HTTP {}'.format(r.status_code)})
    return response_dict

In [4]:
# One dict per film; later cells address entries by list position, which is meant to line up with the row order of updated_movies.
omdb_dict = get_omdb(updated_movies)

In [5]:
#Make into a dataframe and see how many worked vs. didn't
# Failed look-ups carry an 'Error' field instead of the film fields, so the non-null counts show how many requests succeeded.
omdb_df = pd.DataFrame(omdb_dict)
omdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1066 entries, 0 to 1065
Data columns (total 36 columns):
Actors               1058 non-null object
Awards               1058 non-null object
BoxOffice            1058 non-null object
Country              1058 non-null object
DVD                  1058 non-null object
Director             1058 non-null object
Error                8 non-null object
Genre                1058 non-null object
Language             1058 non-null object
Metascore            1058 non-null object
Plot                 1058 non-null object
Poster               1058 non-null object
Production           1058 non-null object
Rated                1058 non-null object
Released             1058 non-null object
Response             1066 non-null object
Runtime              1058 non-null object
Title                1058 non-null object
Type                 1058 non-null object
Website              1058 non-null object
Writer               1058 non-null object
Year          

In [6]:
#Check which films returned errors / weren't found by pinging the API
# Rows with a populated 'Error' field are the look-ups OMDB could not satisfy.
errors = omdb_df[omdb_df.Error.notnull()]
errors

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Error,Genre,Language,Metascore,...,tomatoFresh,tomatoImage,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoURL,tomatoUserMeter,tomatoUserRating,tomatoUserReviews
127,,,,,,,Movie not found!,,,,...,,,,,,,,,,
265,,,,,,,Must provide more than one character.,,,,...,,,,,,,,,,
400,,,,,,,Movie not found!,,,,...,,,,,,,,,,
451,,,,,,,Movie not found!,,,,...,,,,,,,,,,
674,,,,,,,Movie not found!,,,,...,,,,,,,,,,
796,,,,,,,Movie not found!,,,,...,,,,,,,,,,
798,,,,,,,Must provide more than one character.,,,,...,,,,,,,,,,
1028,,,,,,,Movie not found!,,,,...,,,,,,,,,,


In [7]:
#Which films were these?
# Look up the failed rows by position in a single call instead of eight copy-pasted,
# Python-2-only print statements; the positions are the row numbers of the error entries.
failed_positions = [127, 265, 400, 451, 674, 796, 798, 1028]
updated_movies.iloc[failed_positions]

                title  year      director
127  Ellam Avan Seyal  2008  Shaji Kailas
    title  year      director
265    M   1951  Joseph Losey
                 title  year   director
400  Runaway Daughters  1994  Joe Dante
         title  year       director
451  Suspicion  1987  Andrew Grieve
        title  year     director
674  The Firm  1988  Alan Clarke
                 title  year    director
796  Can’t Buy Me Love  1987  Steve Rash
    title  year    director
798    M   1931  Fritz Lang
         title  year       director
1028  Van Gogh  1947  Alain Resnais


In [8]:
#Some of these had OMDB entries that just didn't work, but could be copied from the API directly. I add back in 5 of these missing entries:
# Each assignment overwrites, by list position, the error entry returned for that film (positions 265, 451, 674, 796 and 798).
omdb_dict[265] = {"Title":"M","Year":"1951","Rated":"N/A","Released":"01 Mar 1951","Runtime":"88 min","Genre":"Drama, Film-Noir, Thriller","Director":"Joseph Losey","Writer":"Norman Reilly Raine (screen play by), Leo Katcher (screen play by), Waldo Salt (additional dialogue by)","Actors":"David Wayne, Howard Da Silva, Martin Gabel, Luther Adler","Plot":"In this Americanization of the 1931 German thriller, both the police and the criminal underworld stalk a mysterious killer who preys on small children.","Language":"English","Country":"USA","Awards":"N/A","Poster":"http://ia.media-imdb.com/images/M/MV5BYjc1ZGYzZjMtZTkxNi00MDkxLTk5MGMtNmI2ZWE2OTkxZGU2XkEyXkFqcGdeQXVyMjUxODE0MDY@._V1_SX300.jpg","Metascore":"N/A","imdbRating":"6.9","imdbVotes":"891","imdbID":"tt0043766","Type":"movie","tomatoMeter":"N/A","tomatoImage":"N/A","tomatoRating":"N/A","tomatoReviews":"N/A","tomatoFresh":"N/A","tomatoRotten":"N/A","tomatoConsensus":"N/A","tomatoUserMeter":"46","tomatoUserRating":"3.1","tomatoUserReviews":"56","tomatoURL":"http://www.rottentomatoes.com/m/m-1951/","DVD":"N/A","BoxOffice":"N/A","Production":"Columbia Pictures","Website":"N/A","Response":"True"}
omdb_dict[451] = {"Title":"Suspicion","Year":"1988","Rated":"N/A","Released":"20 Apr 1988","Season":"7","Episode":"11","Runtime":"96 min","Genre":"Comedy, Drama","Director":"Andrew Grieve","Writer":"Anthony Berkeley (based on the play), Anthony Berkeley (novel), Joan Harrison, Barry Levinson, Jonathan Lynn, Samson Raphaelson, Alma Reville","Actors":"Anthony Andrews, Jane Curtin, Ron Pember, Martin Clunes","Plot":"A remake of Hitchcock's 1941 suspenser about a wealthy young woman who comes to suspect that her new husband is plotting to murder her.","Language":"English","Country":"UK","Awards":"N/A","Poster":"http://ia.media-imdb.com/images/M/MV5BMTQzMDIyMDEwMF5BMl5BanBnXkFtZTgwMDExMTg5MjE@._V1_SX300.jpg","Metascore":"N/A","imdbRating":"6.0","imdbVotes":"73","imdbID":"tt0094083","seriesID":"tt0176357","Type":"episode","tomatoMeter":"N/A","tomatoImage":"N/A","tomatoRating":"N/A","tomatoReviews":"N/A","tomatoFresh":"N/A","tomatoRotten":"N/A","tomatoConsensus":"N/A","tomatoUserMeter":"N/A","tomatoUserRating":"N/A","tomatoUserReviews":"N/A","tomatoURL":"N/A","DVD":"N/A","BoxOffice":"N/A","Production":"N/A","Website":"N/A","Response":"True"}
omdb_dict[674] = {"Title":"The Firm","Year":"1989","Rated":"N/A","Released":"26 Feb 1989","Season":"5","Episode":"8","Runtime":"70 min","Genre":"N/A","Director":"Alan Clarke","Writer":"Al Ashton","Actors":"Gary Oldman, Lesley Manville, Philip Davis, Andrew Wilde","Plot":"This is the story of rival \"Firms\" of football supporters, and how one man has a wish to team them up for the European Championships of 1988. However, when this is discussed, the opposing ...","Language":"English","Country":"UK","Awards":"N/A","Poster":"http://ia.media-imdb.com/images/M/MV5BMTQyNzc1ODA5NV5BMl5BanBnXkFtZTcwNzc2MTIzMQ@@._V1_SX300.jpg","Metascore":"N/A","imdbRating":"7.3","imdbVotes":"2410","imdbID":"tt0095158","seriesID":"tt0297626","Type":"episode","tomatoMeter":"N/A","tomatoImage":"N/A","tomatoRating":"N/A","tomatoReviews":"N/A","tomatoFresh":"N/A","tomatoRotten":"N/A","tomatoConsensus":"N/A","tomatoUserMeter":"81","tomatoUserRating":"3.8","tomatoUserReviews":"3165","tomatoURL":"http://www.rottentomatoes.com/m/10005638-firm/","DVD":"N/A","BoxOffice":"N/A","Production":"N/A","Website":"N/A","Response":"True"}
omdb_dict[796] = {"Title":"Can't Buy Me Love","Year":"1987","Rated":"PG-13","Released":"14 Aug 1987","Runtime":"94 min","Genre":"Comedy, Drama, Romance","Director":"Steve Rash","Writer":"Michael Swerdlick","Actors":"Patrick Dempsey, Amanda Peterson, Courtney Gains, Tina Caspary","Plot":"A nerdy outcast secretly pays the most popular girl in school one thousand dollars to be his girlfriend.","Language":"English","Country":"USA","Awards":"1 win & 3 nominations.","Poster":"http://ia.media-imdb.com/images/M/MV5BMTk1NjIxNTYxN15BMl5BanBnXkFtZTYwMzczNzI5._V1_SX300.jpg","Metascore":"N/A","imdbRating":"6.7","imdbVotes":"23,925","imdbID":"tt0092718","Type":"movie","tomatoMeter":"48","tomatoImage":"rotten","tomatoRating":"5.0","tomatoReviews":"21","tomatoFresh":"10","tomatoRotten":"11","tomatoConsensus":"N/A","tomatoUserMeter":"74","tomatoUserRating":"3.3","tomatoUserReviews":"48007","tomatoURL":"http://www.rottentomatoes.com/m/cant_buy_me_love/","DVD":"13 Aug 2002","BoxOffice":"N/A","Production":"Buena Vista Pictures","Website":"N/A","Response":"True"}
omdb_dict[798] = {"Title":"M","Year":"1931","Rated":"NOT RATED","Released":"31 Aug 1931","Runtime":"99 min","Genre":"Crime, Drama, Mystery","Director":"Fritz Lang","Writer":"Thea von Harbou (script), Fritz Lang (script)","Actors":"Peter Lorre, Ellen Widmann, Inge Landgut, Otto Wernicke","Plot":"When the police in a German city are unable to catch a child-murderer, other criminals join in the manhunt.","Language":"German","Country":"Germany","Awards":"2 wins.","Poster":"http://ia.media-imdb.com/images/M/MV5BMTQyNjA5NzU5MV5BMl5BanBnXkFtZTgwMDk1MTA5MTE@._V1_SX300.jpg","Metascore":"N/A","imdbRating":"8.4","imdbVotes":"99,611","imdbID":"tt0022100","Type":"movie","tomatoMeter":"100","tomatoImage":"certified","tomatoRating":"9.2","tomatoReviews":"51","tomatoFresh":"51","tomatoRotten":"0","tomatoConsensus":"A landmark psychological thriller with arresting images, deep thoughts on modern society, and Peter Lorre in his finest performance.","tomatoUserMeter":"95","tomatoUserRating":"4.4","tomatoUserReviews":"35424","tomatoURL":"http://www.rottentomatoes.com/m/1012928-m/","DVD":"20 Oct 1998","BoxOffice":"N/A","Production":"Foremco","Website":"N/A","Response":"True"}

In [9]:
#Update omdb_df to incorporate these new entries
# Rebuild from the patched list; the two patched 'episode' records bring extra fields with them (e.g. Season, Episode).
omdb_df = pd.DataFrame(omdb_dict)
omdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1066 entries, 0 to 1065
Data columns (total 39 columns):
Actors               1063 non-null object
Awards               1063 non-null object
BoxOffice            1063 non-null object
Country              1063 non-null object
DVD                  1063 non-null object
Director             1063 non-null object
Episode              2 non-null object
Error                3 non-null object
Genre                1063 non-null object
Language             1063 non-null object
Metascore            1063 non-null object
Plot                 1063 non-null object
Poster               1063 non-null object
Production           1063 non-null object
Rated                1063 non-null object
Released             1063 non-null object
Response             1066 non-null object
Runtime              1063 non-null object
Season               2 non-null object
Title                1063 non-null object
Type                 1063 non-null object
Website             

In [10]:
#Drop 8 rows in total: the 3 remaining movies for which OMDB has no info (127, 400, 1028),
#plus 5 more whose original-remake pair doesn't consist of 2 actual movies (41, 495, 574, 660, 933)
# Remove the listed rows by position.
omdb_df = omdb_df.drop(omdb_df.index[[41, 127, 400, 495, 574, 660, 933, 1028]])
# reset_index() without drop=True keeps the old row labels in a new 'index' column.
omdb_df = omdb_df.reset_index()
omdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1058 entries, 0 to 1057
Data columns (total 40 columns):
index                1058 non-null int64
Actors               1058 non-null object
Awards               1058 non-null object
BoxOffice            1058 non-null object
Country              1058 non-null object
DVD                  1058 non-null object
Director             1058 non-null object
Episode              2 non-null object
Error                0 non-null object
Genre                1058 non-null object
Language             1058 non-null object
Metascore            1058 non-null object
Plot                 1058 non-null object
Poster               1058 non-null object
Production           1058 non-null object
Rated                1058 non-null object
Released             1058 non-null object
Response             1058 non-null object
Runtime              1058 non-null object
Season               2 non-null object
Title                1058 non-null object
Type                 

In [11]:
#Keep a subset of most broadly available & potentially relevant variables
# Take an explicit copy of the column subset: the cleaning cells that follow assign columns on
# omdb_df, and writing into a selection of another frame can trigger pandas' SettingWithCopyWarning.
omdb_df = omdb_df[['Title', 'Year', 'tomatoMeter', 'tomatoRating', 'tomatoReviews', 'tomatoUserMeter', 'tomatoUserRating', 'tomatoUserReviews', 'imdbRating', 'imdbVotes', 'imdbID', 'Awards', 'Genre', 'Language', 'Country', 'Director', 'Rated', 'Runtime', 'Actors', 'Writer']].copy()
omdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1058 entries, 0 to 1057
Data columns (total 20 columns):
Title                1058 non-null object
Year                 1058 non-null object
tomatoMeter          1058 non-null object
tomatoRating         1058 non-null object
tomatoReviews        1058 non-null object
tomatoUserMeter      1058 non-null object
tomatoUserRating     1058 non-null object
tomatoUserReviews    1058 non-null object
imdbRating           1058 non-null object
imdbVotes            1058 non-null object
imdbID               1058 non-null object
Awards               1058 non-null object
Genre                1058 non-null object
Language             1058 non-null object
Country              1058 non-null object
Director             1058 non-null object
Rated                1058 non-null object
Runtime              1058 non-null object
Actors               1058 non-null object
Writer               1058 non-null object
dtypes: object(20)
memory usage: 165.4+ KB


In [12]:
#Clean the data and generate features for analysis

#1. Make sure everything that should be a number is one.

#Turn Year and imdbVotes and Runtime into integers
def make_int(cell):
    """Convert an OMDB field to an integer, or NaN when it holds no value.

    Strings are reduced to their digits, so '23,925' -> 23925 and
    '90 min' -> 90.  Values that are already numeric are truncated with
    int() instead of being re-parsed from their text form, which keeps the
    conversion idempotent (re-parsing str(90.0) == '90.0' would give 900).

    Returns np.nan for '', 'N/A', 'nan', non-finite numbers, and any string
    that contains no digits at all.
    """
    # Already-numeric input, e.g. when the conversion is applied a second time.
    if isinstance(cell, (int, float, np.integer, np.floating)):
        return int(cell) if np.isfinite(cell) else np.nan

    cell = str(cell)
    if cell == "" or cell == "N/A" or cell == "nan":
        return np.nan
    digits = ''.join(c for c in cell if c.isdigit())
    # A value without digits cannot become a number; int('') would raise.
    return int(digits) if digits else np.nan

#Apply make_int:    
# Series.map returns a new Series on both Python 2 and Python 3; the builtin map() is a
# lazy iterator on Python 3 and cannot be assigned to a column directly.
map_to_int = ['Year', 'Runtime', 'imdbVotes', 'tomatoMeter', 'tomatoReviews', 'tomatoUserMeter', 'tomatoUserReviews']
for var in map_to_int:
    omdb_df[var] = omdb_df[var].map(make_int)
    
    
#Turn imdbRating into floating point values
def make_float(cell):
    """Parse an OMDB field as a floating point number.

    The value is first rendered with str(); the texts '', 'N/A' and 'nan'
    stand for a missing value and give np.nan, anything else is handed to
    float().
    """
    text = str(cell)
    if text in ("", "N/A", "nan"):
        return np.nan
    return float(text)

#Apply make_float:
# Series.map returns a new Series on both Python 2 and Python 3; the builtin map() is a
# lazy iterator on Python 3 and cannot be assigned to a column directly.
map_to_float = ['imdbRating', 'tomatoRating', 'tomatoUserRating']
for var in map_to_float:
    omdb_df[var] = omdb_df[var].map(make_float)

In [13]:
#2. Generate some awards-related variables
#Examine awards column
# Phrasings to look for when parsing this text: "Nominated for N ...", "Won N ...", "N wins & M nominations."
omdb_df['Awards']

0       Nominated for 1 Oscar. Another 5 wins & 5 nomi...
1                                                  1 win.
2                               14 wins & 27 nominations.
3                                                     N/A
4       Nominated for 1 Golden Globe. Another 3 wins &...
5                                           1 nomination.
6       Nominated for 2 Oscars. Another 3 wins & 30 no...
7                                                     N/A
8                                           1 nomination.
9                                                     N/A
10      Nominated for 4 Oscars. Another 2 wins & 2 nom...
11      Nominated for 1 Oscar. Another 2 wins & 2 nomi...
12      Won 1 Golden Globe. Another 5 wins & 7 nominat...
13          Nominated for 4 Oscars. Another 1 nomination.
14                                         2 nominations.
15                                2 wins & 4 nominations.
16                                          1 nomination.
17            

In [14]:
#Define 3 new variables: awards, nominations, and awards + noms.

import re

def awards(column):
    """Count awards, nominations, and their sum for every film.

    Each cell is an OMDB 'Awards' text such as
    'Won 1 Golden Globe. Another 5 wins & 7 nominations.' or 'N/A'.
    Four numbers are read from it, each defaulting to 0 when absent:
    the count after 'for ' (as in 'Nominated for N ...'), the count before
    ' nomination', the count after 'Won ', and the count before ' win'.

    Parameters
    ----------
    column : iterable of award descriptions (each is passed through str()).

    Returns
    -------
    (noms, awards, noms_awards) : three lists with one entry per cell,
        holding nominations, wins, and nominations + wins.
    """
    def _count(pattern, text):
        """Return the first integer captured by ``pattern`` in ``text``, else 0."""
        found = re.search(pattern, text)
        return int(found.group(1)) if found else 0

    noms = []
    wins_total = []
    noms_awards = []
    for cell in column:
        cell = str(cell)

        # The patterns require at least one digit: a keyword that is not next to a
        # number is ignored instead of capturing '' and crashing int().
        all_nominations = _count(r'for (\d+)', cell) + _count(r'(\d+) nomination', cell)
        all_awards = _count(r'Won (\d+)', cell) + _count(r'(\d+) win', cell)

        noms.append(all_nominations)
        wins_total.append(all_awards)
        noms_awards.append(all_nominations + all_awards)

    return noms, wins_total, noms_awards

In [15]:
#Add these variables to the dataframe
# Unpack into award_counts rather than `awards`: rebinding the name of the awards() function to a
# list would make this cell fail with "'list' object is not callable" if it were run a second time.
noms, award_counts, noms_awards = awards(omdb_df['Awards'])
omdb_df['Award_count'] = award_counts
omdb_df['Noms_count'] = noms
omdb_df['Noms+Awards'] = noms_awards

omdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1058 entries, 0 to 1057
Data columns (total 23 columns):
Title                1058 non-null object
Year                 1058 non-null int64
tomatoMeter          1058 non-null object
tomatoRating         1058 non-null object
tomatoReviews        1058 non-null object
tomatoUserMeter      1058 non-null object
tomatoUserRating     1058 non-null object
tomatoUserReviews    1058 non-null object
imdbRating           1058 non-null object
imdbVotes            1058 non-null object
imdbID               1058 non-null object
Awards               1058 non-null object
Genre                1058 non-null object
Language             1058 non-null object
Country              1058 non-null object
Director             1058 non-null object
Rated                1058 non-null object
Runtime              1058 non-null object
Actors               1058 non-null object
Writer               1058 non-null object
Award_count          1058 non-null int64
Noms_count   

In [16]:
#3.  Some films have more than one language and country listed.  I want only one primary language and country per film.

def get_primary(column):
    """Return, for each cell, the text that precedes the first comma.

    The first entry of a comma-separated language or country listing is
    taken as the primary one.  Cells are converted with str() first; a cell
    without a comma is returned whole.
    """
    return [str(cell).split(',')[0] for cell in column]

In [17]:
#Add the new primary language and primary country variables
# get_primary keeps only the first entry of each comma-separated listing.
omdb_df['PrimaryLanguage'] = get_primary(omdb_df['Language'])
omdb_df['PrimaryCountry'] = get_primary(omdb_df['Country'])

In [18]:
#Some films have language missing: investigate this.
# 'N/A' is the placeholder text that marks a film with no language listed.
language_missing = omdb_df.loc[omdb_df['PrimaryLanguage'].eq("N/A")]
language_missing

Unnamed: 0,Title,Year,tomatoMeter,tomatoRating,tomatoReviews,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,imdbRating,imdbVotes,...,Director,Rated,Runtime,Actors,Writer,Award_count,Noms_count,Noms+Awards,PrimaryLanguage,PrimaryCountry
139,The Firm,2009,67.0,5.4,24.0,35.0,2.9,2141.0,5.8,4498.0,...,Nick Love,,90 min,"Paul Anderson, Calum MacNab, Daniel Mays, Doug...","Al Ashton (original screenplay), Nick Love (ad...",0,0,0,,UK
150,Forbidden Fruit,1921,,,,,,,7.2,53.0,...,Cecil B. DeMille,,87 min,"Agnes Ayres, Clarence Burton, Theodore Roberts...","Cecil B. DeMille (story ""The Golden Chance""), ...",0,0,0,,USA
192,Hoodman Blind,1923,,,,,,,3.4,18.0,...,John Ford,,60 min,"David Butler, Gladys Hulette, Regina Connelly,...","Wilson Barrett (play), Henry Arthur Jones (pla...",0,0,0,,USA
472,To Have and to Hold,1922,,,,,,,,,...,George Fitzmaurice,,,"Betty Compson, Bert Lytell, Theodore Kosloff, ...","Ouida Bergère (adaptation), Mary Johnston (novel)",0,0,0,,USA
660,A Trip to the Moon,1902,100.0,9.4,8.0,90.0,4.2,6748.0,8.2,30017.0,...,Georges Méliès,TV-G,13 min,"François Lallement, Jules-Eugène Legris",,0,0,0,,France
663,Diversion,1980,,,,,,,6.5,21.0,...,James Dearden,,50 min,"Stephen Moore, Cherie Lunghi, Morag Hood, Ned ...",James Dearden,0,1,1,,UK
710,The Phantom City,1928,,,,,,,,,...,Albert S. Rogell,,,"Ken Maynard, Eugenia Gilbert, Jim Mason, Charl...","Adele Buffington (scenario), Fred Allen (titles)",0,0,0,,USA
794,The Hands of Orlac,1924,86.0,7.4,7.0,83.0,3.7,339.0,7.0,1268.0,...,Robert Wiene,,92 min,"Conrad Veidt, Alexandra Sorina, Fritz Kortner,...","Louis Nerz, Maurice Renard (novel)",0,0,0,,Germany
809,London After Midnight,1927,,,,39.0,3.5,735.0,7.0,979.0,...,Tod Browning,UNRATED,69 min,"Lon Chaney, Marceline Day, Henry B. Walthall, ...","Tod Browning (story ""The Hypnotist""), Waldemar...",0,0,0,,USA
820,The Miracle Man,1919,,,,,,,5.8,119.0,...,George Loane Tucker,NOT RATED,34 min,"Lon Chaney, Betty Compson, Joseph J. Dowling, ...","George M. Cohan (play), Robert Hobart Davis (n...",0,0,0,,USA


In [19]:
#Infer primary language from Country -- in all but two cases it's English (some are silent, but I think this still makes sense).

def fix_lang(column):
    """Substitute 'English' wherever the language is the 'N/A' placeholder.

    Every cell is converted with str(); cells equal to 'N/A' become
    'English' and all others are returned as their string form.
    """
    return ['English' if text == "N/A" else text for text in map(str, column)]

In [20]:
#Apply the function and manually fix the two entries that aren't supposed to be English

omdb_df['PrimaryLangFixed'] = fix_lang(omdb_df['PrimaryLanguage'])
# Address the column by name rather than as "the last column" (-1:): once further columns have
# been appended to omdb_df, re-running this cell would otherwise overwrite whichever column is last.
lang_col = omdb_df.columns.get_loc('PrimaryLangFixed')
omdb_df.iloc[660, lang_col] = "French"
omdb_df.iloc[794, lang_col] = "German"

In [22]:
# 5. The Genre variable is a list of all the genres listed for a given film.  We really need indicator variables for each major genre.  
#I found 19 of these: Crime, Drama, Thriller, Action, Adventure, Film-Noir, Mystery, Comedy, Family, Horror, 
# Sci-Fi, Romance, Music (matched as a substring, so it also flags Musical), Fantasy, Western, War, Sport, History, Biography

# Each name serves both as the pattern searched for in a film's 'Genre' text and as the name of the resulting indicator column.
genre_list = ['Crime', 'Drama', 'Thriller', 'Action', 'Adventure', 'Film-Noir', 'Mystery', 'Comedy', 'Family', 
              'Horror', 'Sci-Fi', 'Romance', 'Music', 'Fantasy', 'Western', 'War', 'Sport', 'History', 'Biography']

def make_genres(column, genres):
    """Build one list of 0/1 indicators per genre.

    column : iterable of genre descriptions, one per film (each is passed
        through str()).
    genres : list of genre names, each used as a regular-expression pattern.

    Returns a list parallel to ``genres``; entry i holds one value per film:
    1 when genres[i] is found anywhere in that film's text (re.search, so a
    substring match counts), otherwise 0.
    """
    texts = [str(cell) for cell in column]
    return [[1 if re.search(genre, text) else 0 for text in texts]
            for genre in genres]

In [24]:
#Apply the function to create new genre columns in omdb_df
# Build all indicator lists once and pair each with its genre; calling make_genres inside
# the loop recomputed every genre's indicators on each iteration only to keep one of them.
genre_flags = make_genres(omdb_df['Genre'], genre_list)
for genre, flags in zip(genre_list, genre_flags):
    omdb_df[genre] = flags

In [26]:
#Next, combine the pairs of movies
# Rows before position 529 are the remakes and the rest the originals; both halves are
# re-indexed from 0 so they can be matched up by index afterwards.
remakes = omdb_df.iloc[:529].reset_index(drop=True)
originals = omdb_df.iloc[529:].reset_index(drop=True)

In [27]:
#Merge 
# Index-on-index merge: row i of originals is paired with row i of remakes; columns present in both frames get the '_orig' / '_remake' suffix.
movie_pairs = originals.merge(remakes, left_index=True, right_index=True, suffixes=['_orig', '_remake'])

In [28]:
#Create some new variables based on the relationship between the remake and original

#1. Indicator for whether the two films are in the same language
# 1 when both films share the same (fixed) primary language, otherwise 0.
movie_pairs['SameLang'] = movie_pairs['PrimaryLangFixed_orig'].eq(movie_pairs['PrimaryLangFixed_remake']).astype(int)

In [29]:
#2. Indicator for whether the two films are from the same country
# 1 when both films share the same primary country, otherwise 0.
movie_pairs['SameCountry'] = movie_pairs['PrimaryCountry_orig'].eq(movie_pairs['PrimaryCountry_remake']).astype(int)

In [30]:
#3. Number of years between the remake and original
# Remake year minus original year.
movie_pairs['YearDiff'] = movie_pairs['Year_remake'].sub(movie_pairs['Year_orig'])

In [32]:
#Can now get rid of some variables that won't be useful going forward: no longer need some of the variables from which I built 
# new features; just need genre of the original, since genre of the remake won't tell us anything about the type of film 
# it makes sense to remake.  So, generate a database for analysis that keeps just these variables:

# The genre indicator columns are named after the genre strings themselves, hyphens included,
# so the remake columns are 'Film-Noir_remake' and 'Sci-Fi_remake' (matching the '_orig' names);
# the underscore spellings do not exist in movie_pairs and would raise a KeyError.
analysis_pairs = movie_pairs[['Title_orig', 'Year_orig', 'imdbRating_orig', 'imdbVotes_orig', 'Runtime_orig', 'Award_count_orig', 
                              'Noms_count_orig', 'Noms+Awards_orig', 'PrimaryCountry_orig', 'PrimaryLangFixed_orig', 'Crime_orig', 
                              'Drama_orig', 'Thriller_orig', 'Action_orig', 'Adventure_orig', 'Film-Noir_orig', 'Mystery_orig', 
                              'Comedy_orig', 'Family_orig', 'Horror_orig', 'Sci-Fi_orig', 'Romance_orig', 'Music_orig', 
                              'Fantasy_orig', 'Western_orig', 'War_orig', 'Sport_orig', 'History_orig', 'Biography_orig', 
                              'Title_remake', 'Year_remake', 'imdbRating_remake', 'imdbVotes_remake', 'Runtime_remake', 
                              'Award_count_remake', 'Noms_count_remake', 'Noms+Awards_remake', 'PrimaryCountry_remake', 
                              'PrimaryLangFixed_remake', 'Crime_remake', 'Drama_remake', 'Thriller_remake', 'Action_remake', 
                              'Adventure_remake', 'Film-Noir_remake', 'Mystery_remake', 'Comedy_remake', 'Family_remake', 
                              'Horror_remake', 'Sci-Fi_remake', 'Romance_remake', 'Music_remake', 'Fantasy_remake', 
                              'Western_remake', 'War_remake', 'Sport_remake', 'History_remake', 'Biography_remake', 'SameLang', 
                              'SameCountry', 'YearDiff']]

In [34]:
#Save this dataframe to a CSV
# Written as UTF-8; the film data includes non-ASCII characters (e.g. accented director names).
analysis_pairs.to_csv('data_files/paired_movies.csv', encoding='utf-8')