# Augment the Data

Let us augment the items(movie) data with more fields from IMDB and / or TMDB data

In [32]:
import numpy as np
import pandas as pd
import json, glob
import requests
import re

In [33]:
items_raw = pd.read_csv("data/items_raw.csv")

In [34]:
items_raw.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Refining the Data

- Drop the `imdb_url` (no longer valid) and `video_release_date` (mostly null) fields
- Get `year` from the `release_date`
- Remove `(year)` from the `title`

*1. Drop `imdb_url` and `video_release_date`*

In [35]:
items_refine = items_raw.drop(['video_release_date', 'imdb_url'], axis=1).copy()

*2. Get `year` from `release_title`* 

In [36]:
items_refine["release_date"] = pd.to_datetime(items_refine.release_date,infer_datetime_format=True)
items_refine["year"] = items_refine.release_date.apply(lambda x: str(x.year))

*3. Remove `(year)` from the `title`*

In [37]:
# Match Whitespace + ( + YEAR + )
regex_year = re.compile(r'\s\(\d{4}\)')

In [40]:
items_refine["movie"] = items_refine.title.str.replace(regex_year, "")
items_refine["movie"] = items_refine.movie.str.strip()

In [41]:
items_refine.head()

Unnamed: 0,movie_id,title,release_date,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western,year,movie
0,1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1995,Toy Story
1,2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1995,GoldenEye
2,3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1995,Four Rooms
3,4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1995,Get Shorty
4,5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1995,Copycat


## Get the Movie ID from IMDB

We will use the scraper from https://github.com/babu-thomas/movielens-posters to get the IMDB title link

In [42]:
imdb_link_url = "https://raw.githubusercontent.com/babu-thomas/movielens-posters/master/movie_url.csv"

In [43]:
def get_imdb(imdb_link_url):
    imdb = pd.read_csv(imdb_link_url, header=None, names=["url"] )
    imdb["movie_id"] = imdb.index
    imdb["imdb_id"] = imdb.url.str.split("/").str[4]
    imdb.drop(["url"], axis=1, inplace=True)
    imdb.to_csv("data/imdb.csv", index=False)

In [44]:
# Uncomment and Run this to get the imdb title numbers
get_imdb(imdb_link_url)

In [45]:
imdb = pd.read_csv("data/imdb.csv")

In [46]:
imdb.head()

Unnamed: 0,movie_id,imdb_id
0,1,tt0114709
1,2,tt0113189
2,3,tt0113101
3,4,tt0113161
4,5,tt0112722


In [47]:
imdb.shape

(1640, 2)

In [48]:
len(imdb.imdb_id.unique())

1611

## Get Movie Feature from TMDB 

In [51]:
tmdb_API_KEY = "4f23dc5304423b2d509195a634e7a702"

In [52]:
movies = items_refine[["movie_id","movie", "year"]]

In [53]:
movies

Unnamed: 0,movie_id,movie,year
0,1,Toy Story,1995
1,2,GoldenEye,1995
2,3,Four Rooms,1995
3,4,Get Shorty,1995
4,5,Copycat,1995
...,...,...,...
1677,1678,Mat' i syn,1998
1678,1679,B. Monkey,1998
1679,1680,Sliding Doors,1998
1680,1681,You So Crazy,1994


In [55]:
get_movie_data("Shanghai Triad", "1995")

{'popularity': 5.899,
 'id': 37557,
 'video': False,
 'vote_count': 31,
 'vote_average': 6.9,
 'title': 'Shanghai Triad',
 'release_date': '1995-12-22',
 'original_language': 'zh',
 'original_title': '摇啊摇，摇到外婆桥',
 'genre_ids': [18, 80],
 'backdrop_path': '/n78lIMVBMhZT7nMvEwxUcgGaPkh.jpg',
 'adult': False,
 'overview': 'Shanghai, China, 1930. When young Shuisheng arrives from the countryside, his uncle Liushu puts him at the service of Bijou, the mistress of Laoda, supreme boss of the Tang Triad, constantly threatened by his enemies, both those he knows and those lurking in the shadows.',
 'poster_path': '/qcoOCoN7viOhboGwhYXyApdDuiq.jpg'}

In [57]:
#items_refine["tmdb_id"] = items_refine.apply(lambda x: get_movie_id(items_refine.name, items_refine.year)) 

In [59]:
#items_refine.tmdb_id.isnull().sum()

In [60]:
def get_movie_features(movie_title):
    url = "https://api.themoviedb.org/3/movie/" + movie_title
    payload = {'api_key': tmdb_API_KEY, "language": "en_US"}
    response = requests.request("GET", url, params=payload)
    if response.status_code == 200:
        return response.json()
    else: 
        return False

In [61]:
imdb.imdb_id[0]

'tt0114709'

In [62]:
get_movie_features(imdb.imdb_id[0])

{'adult': False,
 'backdrop_path': '/dji4Fm0gCDVb9DQQMRvAI8YNnTz.jpg',
 'belongs_to_collection': {'id': 10194,
  'name': 'Toy Story Collection',
  'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
  'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'},
 'budget': 30000000,
 'genres': [{'id': 16, 'name': 'Animation'},
  {'id': 35, 'name': 'Comedy'},
  {'id': 10751, 'name': 'Family'}],
 'homepage': 'http://toystory.disney.com/toy-story',
 'id': 862,
 'imdb_id': 'tt0114709',
 'original_language': 'en',
 'original_title': 'Toy Story',
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'popularity': 27.644,
 'poster_path': '/rhIRbceoE9lR4veEXuwCC2wARtG.jpg',
 'production_companies': [{'id': 3,
   'logo_path': '/1TjvGVDMYsj6J

### Create Feature Directory

In [63]:
import os
current_path = os.getcwd()
current_path

'/Users/amitkaps/Documents/github/recommendation/MovieLens'

In [64]:
# define the name of the directory to be created
feature_dir = "/data/features"
feature_dir= current_path + feature_dir

In [66]:
# Check if feature dir exists
if os.path.exists(feature_dir):
    print("Directory already exists %s" % feature_dir)
else:     
    try:
        os.mkdir(data_dir)
    except OSError:
        print("Creation of the directory %s failed" % feature_dir)
    else:
        print("Successfully created directory %s" % feature_dir)

Directory already exists /Users/amitkaps/Documents/github/recommendation/MovieLens/data/features


### Get the movie features from tmdb

In [67]:
def save_movie_list_features():
    
    id_list = imdb.imdb_id.tolist()
    num = 40
    num_loops = int(len(title_list)/ num)    
    
    for i in range(num_loops):
        print("loop " + str(i))
        movie_features = []
        for j in range(num):
            index = i * num + j
            imdb_id = imdb.imdb_id[index]
            #print(index, title)
            features = get_movie_features(imdb_id)
            if features != False: 
                movie_features.append(features)
            
        outfile = "data/features/movie-features-" + str(i) + ".json"
        with open(outfile, 'w') as fout:
            json.dump(movie_features, fout)
    
    print("Finished")

In [68]:
# Run this to download and save the movie features from tmdb

# save_movie_list_features()

### Create single movie feature files

In [69]:
def create_features_dataframe():
    # Create a dataframe to get the column names
    with open('data/features/movie-features-0.json') as json_file:
        data = json.load(json_file)
        df = pd.DataFrame.from_dict(data)
    
    # Create an empty feature dataframe
    features_raw = df[0:0].copy()
    
    # Read each json and concat the features
    for f in glob.glob("data/features/*.json"):
        with open(f, "rb") as json_file:
            data = json.load(json_file)
            df = pd.DataFrame.from_dict(data)
            features_raw = pd.concat([features_raw, df])
    
    # Return the concatened feature dataframe
    return features_raw

In [70]:
# Run this to create and save the feature csv

features_raw = create_features_dataframe()

In [71]:
# Drop Duplicates
features_raw_uniq = features_raw.drop_duplicates(subset=['imdb_id'], keep="first")

In [72]:
features = pd.merge(left=imdb, right=features_raw_uniq, how="left", on="imdb_id")
features.to_csv("data/features.csv", index=None)

In [73]:
features_raw.shape, features_raw_uniq.shape, features.shape

((1516, 25), (1490, 25), (1640, 26))

### Check on Movie Completeness

In [77]:
items_refine.head()

Unnamed: 0,movie_id,title,release_date,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western,year,movie
0,1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1995,Toy Story
1,2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1995,GoldenEye
2,3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1995,Four Rooms
3,4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1995,Get Shorty
4,5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1995,Copycat


In [78]:
items_enriched = pd.merge(left= items_refine, right=features, how="left", on="movie_id")

In [80]:
items_refine.shape, items_enriched.shape

((1682, 24), (1682, 49))

In [81]:
items_enriched.head()

Unnamed: 0,movie_id,title_x,release_date_x,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,release_date_y,revenue,runtime,spoken_languages,status,tagline,title_y,video,vote_average,vote_count
0,1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.9,10867.0
1,2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,...,1995-11-16,352194034.0,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.8,2035.0
2,3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,...,1995-12-09,4300000.0,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False,6.1,1251.0
3,4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,...,1995-10-20,115101622.0,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The mob is tough, but it’s nothing like show b...",Get Shorty,False,6.5,500.0
4,5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,...,1995-10-27,0.0,124.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,One man is copying the most notorious killers ...,Copycat,False,6.5,424.0


In [83]:
items_enriched[["title_x", "title_y", "release_date_y"]]

Unnamed: 0,title_x,title_y,release_date_y
0,Toy Story (1995),Toy Story,1995-10-30
1,GoldenEye (1995),GoldenEye,1995-11-16
2,Four Rooms (1995),Four Rooms,1995-12-09
3,Get Shorty (1995),Get Shorty,1995-10-20
4,Copycat (1995),Copycat,1995-10-27
...,...,...,...
1677,Mat' i syn (1997),Mother and Son,1997-02-20
1678,B. Monkey (1998),B. Monkey,1998-11-07
1679,Sliding Doors (1998),Sliding Doors,1998-04-23
1680,You So Crazy (1994),Martin Lawrence: You So Crazy,1994-04-27


In [189]:
imdb.head()
links = pd.merge(right=imdb, left=items_enriched, on="movie_id", how="left")

In [190]:
links = links[["movie_id", "imdb_id_x", "imdb_id_y", "id"]]

In [191]:
links.head(10)

Unnamed: 0,movie_id,imdb_id_x,imdb_id_y,id
0,1,tt0114709,tt0114709,862.0
1,2,tt0113189,tt0113189,710.0
2,3,tt0113101,tt0113101,5.0
3,4,tt0113161,tt0113161,8012.0
4,5,tt0112722,tt0112722,1710.0
5,6,,,
6,7,tt0114746,tt0114746,63.0
7,8,tt0112431,tt0112431,9598.0
8,9,tt0112818,tt0112818,687.0
9,10,tt0114279,tt0114279,31174.0


In [192]:
links.isnull().sum()

movie_id       0
imdb_id_x     42
imdb_id_y     42
id           166
dtype: int64

In [193]:
check = items_enriched[items_enriched.title_y.isna()].copy()

In [194]:
test = check[["movie_id", "movie", "year"]].copy()

In [195]:
test.reset_index(inplace=True)

In [196]:
test.shape[0]

166

In [131]:
get_movie_data("Shanghai Triad", "1995")["id"]

37557

In [157]:
test.head()

Unnamed: 0,index,movie_id,movie,year
0,5,6,Shanghai Triad,1995
1,25,26,Brothers McMullen,1995
2,49,50,Star Wars,1977
3,54,55,Professional,1994
4,62,63,Santa Clause,1994


In [134]:
test.loc[0, "movie"] = "Shanghai Triad"

In [156]:
test["movie"] = test.movie.str.replace(",", "")
test["movie"] = test.movie.str.replace("The", "")

In [160]:
def get_movie_data(movie_name, year):
    url = "https://api.themoviedb.org/3/search/movie/"
    payload = {'api_key': tmdb_API_KEY, 
               "language": "en_US", 
               "query": movie_name, 
               "include_adult": False,
               "page": 1, 
               "primary_release_year": year}
    response = requests.request("GET", url, params=payload)
    if response.status_code == 200:
        json = response.json()
        #print(json)
        if json["results"] != []:
            data = json["results"][0] #Get the response from the first hit
            return data
        else:
            return {"id": 0}
    else: 
        return {"id": 0}

In [161]:
get_movie_data("Terminator 2: Judgment Day", "1992")

{'id': 0}

In [163]:
idx

[37557, 16388, 11, 101]

In [164]:
idx = []
for i in range(test.shape[0]):
    print(i)
    movie_name = test.loc[i, "movie"]
    year = test.loc[i, "year"]
    tmdb_id = get_movie_data(movie_name, year)["id"]
    idx.append(tmdb_id)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165


In [198]:
test.head()

Unnamed: 0,index,movie_id,movie,year
0,5,6,Shanghai Triad (Yao a yao yao dao waipo qiao),1995
1,25,26,"Brothers McMullen, The",1995
2,49,50,Star Wars,1977
3,54,55,"Professional, The",1994
4,62,63,"Santa Clause, The",1994


In [197]:
links[links.id.isnull()]

Unnamed: 0,movie_id,imdb_id_x,imdb_id_y,id
5,6,,,
25,26,tt6155832,tt6155832,
49,50,tt6855750,tt6855750,
54,55,tt0111942,tt0111942,
62,63,tt5267640,tt5267640,
...,...,...,...,...
1654,1655,tt4478446,tt4478446,
1655,1656,tt0536773,tt0536773,
1659,1660,,,
1666,1667,tt7282010,tt7282010,


In [199]:
links.head()

Unnamed: 0,movie_id,imdb_id_x,imdb_id_y,id
0,1,tt0114709,tt0114709,862.0
1,2,tt0113189,tt0113189,710.0
2,3,tt0113101,tt0113101,5.0
3,4,tt0113161,tt0113161,8012.0
4,5,tt0112722,tt0112722,1710.0


In [200]:
test["tmdb_id"] = np.array(idx)

In [213]:
new = pd.merge(left=links, right=test, on="movie_id", how="left")

In [214]:
new.head(6)

Unnamed: 0,movie_id,imdb_id_x,imdb_id_y,id,index,movie,year,tmdb_id
0,1,tt0114709,tt0114709,862.0,,,,
1,2,tt0113189,tt0113189,710.0,,,,
2,3,tt0113101,tt0113101,5.0,,,,
3,4,tt0113161,tt0113161,8012.0,,,,
4,5,tt0112722,tt0112722,1710.0,,,,
5,6,,,,5.0,Shanghai Triad (Yao a yao yao dao waipo qiao),1995.0,37557.0


In [215]:
new.id.fillna(0, inplace=True)
new.tmdb_id.fillna(0, inplace=True)

In [216]:
new["tmdb_id"] = new["id"] + new["tmdb_id"]
new["tmdb_id"] = new.tmdb_id.astype("int")

In [217]:
new.head(6)

Unnamed: 0,movie_id,imdb_id_x,imdb_id_y,id,index,movie,year,tmdb_id
0,1,tt0114709,tt0114709,862.0,,,,862
1,2,tt0113189,tt0113189,710.0,,,,710
2,3,tt0113101,tt0113101,5.0,,,,5
3,4,tt0113161,tt0113161,8012.0,,,,8012
4,5,tt0112722,tt0112722,1710.0,,,,1710
5,6,,,0.0,5.0,Shanghai Triad (Yao a yao yao dao waipo qiao),1995.0,37557


In [218]:
new.drop(["id", "index", "movie", "year"], axis=1, inplace=True)

In [220]:
new.drop(["imdb_id_x"], axis=1, inplace=True)

In [222]:
new.columns = ["movie_id", "imdb_id", "tmdb_id"]

In [223]:
new.head()

Unnamed: 0,movie_id,imdb_id,tmdb_id
0,1,tt0114709,862
1,2,tt0113189,710
2,3,tt0113101,5
3,4,tt0113161,8012
4,5,tt0112722,1710


In [224]:
new.to_csv("links.csv", index=None)

In [178]:
pd.lookup?

Object `pd.lookup` not found.


In [165]:
idx

[37557,
 16388,
 11,
 101,
 11395,
 9495,
 11934,
 1245,
 280,
 274,
 0,
 10779,
 630,
 0,
 1891,
 0,
 0,
 525,
 0,
 9549,
 218,
 0,
 199,
 0,
 0,
 0,
 1645,
 11975,
 45609,
 18355,
 0,
 888,
 10057,
 10897,
 9327,
 19186,
 2640,
 8851,
 30346,
 27526,
 10714,
 284,
 488,
 5925,
 914,
 625,
 6283,
 628,
 12106,
 0,
 11596,
 11236,
 0,
 2769,
 0,
 11575,
 790,
 14285,
 0,
 20457,
 63020,
 8850,
 32562,
 20759,
 9826,
 109478,
 0,
 15944,
 0,
 763,
 0,
 18451,
 0,
 6623,
 10217,
 0,
 0,
 0,
 9300,
 0,
 58770,
 31640,
 293934,
 0,
 422,
 0,
 11041,
 11112,
 0,
 19855,
 0,
 33245,
 44705,
 16372,
 11331,
 10467,
 0,
 22490,
 0,
 24257,
 0,
 0,
 47199,
 0,
 26203,
 110972,
 221917,
 0,
 0,
 85328,
 0,
 203119,
 0,
 0,
 0,
 0,
 368995,
 213917,
 241058,
 0,
 0,
 0,
 18919,
 0,
 0,
 268135,
 0,
 19931,
 0,
 0,
 95963,
 0,
 10533,
 26531,
 0,
 0,
 0,
 37141,
 0,
 0,
 0,
 0,
 0,
 36141,
 0,
 0,
 0,
 0,
 0,
 0,
 158312,
 0,
 0,
 0,
 21539,
 0,
 0,
 0,
 0,
 0,
 0,
 50463,
 0,
 42758,
 291634,
 0]

In [None]:
get_movie_data("Shanghai Triad", "1995")

In [436]:
items_enriched.iloc[0]

movie_id                                                                 1
title_x                                                   Toy Story (1995)
release_date_x                                                 01-Jan-1995
video_release_date                                                     NaN
imdb_url                 http://us.imdb.com/M/title-exact?Toy%20Story%2...
genre_unknown                                                            0
Action                                                                   0
Adventure                                                                0
Animation                                                                1
Children                                                                 1
Comedy                                                                   1
Crime                                                                    0
Documentary                                                              0
Drama                    