## Load, join and clean data

In [5]:
import pandas as pd
import numpy as np

In [3]:
# Genre lookup table: u.genre is read as pipe-separated, header-less rows
# whose two fields are labelled "name" and "id".
genres_data = pd.read_csv(
    'data/u.genre',
    sep='|',
    encoding='ISO-8859-1',
    header=None,
    names=['name', 'id'],
)

display(genres_data)

Unnamed: 0,name,id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


In [49]:
# Column names for u.item: five fixed descriptive fields followed by one
# column per genre name, in the order the genres appear in genres_data.
fixed_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'url']
genre_columns = genres_data['name'].values
movie_data_columns = np.append(fixed_columns, genre_columns)

print(movie_data_columns)

['movie_id' 'title' 'release_date' 'video_release_date' 'url' 'unknown'
 'Action' 'Adventure' 'Animation' "Children's" 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']


In [50]:
# Read the pipe-delimited item file with the names in movie_data_columns
# (indexed by movie_id), discard the two unused columns, and parse
# release_date into datetimes -- all in one chain.
movie_data = (
    pd.read_csv(
        'data/u.item',
        sep='|',
        encoding='ISO-8859-1',
        header=None,
        names=movie_data_columns,
        index_col='movie_id',
    )
    .drop(columns=['video_release_date', 'url'])
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date']))
)

display(movie_data.head(5))

Unnamed: 0_level_0,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [51]:
# Individual ratings: tab-separated, header-less rows labelled
# user_id / movie_id / rating / timestamp.
ratings_data = pd.read_csv(
    'data/u.data',
    sep='\t',
    encoding='ISO-8859-1',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

display(ratings_data.head(5))


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [52]:
# Group the individual ratings per movie once and reuse that grouping for both
# summary statistics. Column assignment aligns the results on the movie_id
# index of movie_data.
ratings_by_movie = ratings_data.groupby(['movie_id'])['rating']
movie_data['ratings_average'] = ratings_by_movie.mean()
movie_data['ratings_count'] = ratings_by_movie.count()

display(movie_data.head(5))

Unnamed: 0_level_0,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,ratings_average,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,3.878319,452
2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,3.206107,131
3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,3.033333,90
4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,3.550239,209
5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,3.302326,86


In [53]:
# Report the table shape before and after discarding every row that has a
# missing value in any column.
shape_before = movie_data.shape
print(shape_before)
movie_data = movie_data.dropna(axis=0, how='any')
shape_after = movie_data.shape
print(shape_after)

(1682, 23)
(1681, 23)


## Normalize release date and ratings

In [54]:
# Rescale release_date to a unitless score in [0, 1]:
#   0.0 -> the newest release in the table, 1.0 -> the oldest.
release_dates = movie_data['release_date']

# This cell overwrites release_date with floats, so it is only valid once per
# data load. Fail loudly on a re-run rather than letting the already-normalized
# floats be re-parsed as timestamps, which would silently corrupt the column.
if not pd.api.types.is_datetime64_any_dtype(release_dates):
    raise TypeError(
        "movie_data['release_date'] has dtype {} instead of datetime64; it has "
        "probably been normalized already. Re-run the data-loading cells "
        "before running this cell again.".format(release_dates.dtype)
    )

oldest_date = release_dates.min()
print(oldest_date)
newest_date = release_dates.max()
print(newest_date)

# Dividing a timedelta Series by a timedelta scalar yields plain floats.
movie_data['release_date'] = (newest_date - release_dates) / (newest_date - oldest_date)

display(movie_data.head(5))

1922-01-01 00:00:00
1998-10-23 00:00:00


Unnamed: 0_level_0,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,ratings_average,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0.049583,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,3.878319,452
2,GoldenEye (1995),0.049583,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,3.206107,131
3,Four Rooms (1995),0.049583,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,3.033333,90
4,Get Shorty (1995),0.049583,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,3.550239,209
5,Copycat (1995),0.049583,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,3.302326,86


In [55]:
# Map the average rating linearly onto [0, 1]. The bounds are the constants
# used by the formula (1 and 5), i.e. it assumes ratings lie on a 1-5 scale.
rating_floor, rating_ceiling = 1, 5
rating_gap_to_top = rating_ceiling - movie_data['ratings_average']
movie_data['ratings_average'] = 1 - (rating_gap_to_top / (rating_ceiling - rating_floor))

display(movie_data.head(5))

Unnamed: 0_level_0,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,ratings_average,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0.049583,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0.71958,452
2,GoldenEye (1995),0.049583,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.551527,131
3,Four Rooms (1995),0.049583,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.508333,90
4,Get Shorty (1995),0.049583,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0.63756,209
5,Copycat (1995),0.049583,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0.575581,86


## Add attributes

In [57]:
# Derived price: the product of the two complements, scaled by 10 and rounded
# to a whole number. It grows as either ratings_average or release_date gets
# smaller (both are expected to be normalized to [0, 1] at this point).
rating_complement = 1 - movie_data['ratings_average']
release_complement = 1 - movie_data['release_date']
movie_data['price'] = np.round(rating_complement * release_complement * 10)

display(movie_data.head(5))

Unnamed: 0_level_0,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,ratings_average,ratings_count,price
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0.049583,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0.71958,452,3.0
2,GoldenEye (1995),0.049583,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0.551527,131,4.0
3,Four Rooms (1995),0.049583,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0.508333,90,5.0
4,Get Shorty (1995),0.049583,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0.63756,209,3.0
5,Copycat (1995),0.049583,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0.575581,86,4.0


In [58]:
# Derived purchase likelihood: falls linearly by 0.1 for every unit of price.
price_penalty = movie_data['price'] * 0.1
movie_data['buy_probability'] = 1 - price_penalty

display(movie_data.head(5))

Unnamed: 0_level_0,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,ratings_average,ratings_count,price,buy_probability
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0.049583,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0.71958,452,3.0,0.7
2,GoldenEye (1995),0.049583,0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0.551527,131,4.0,0.6
3,Four Rooms (1995),0.049583,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0.508333,90,5.0,0.5
4,Get Shorty (1995),0.049583,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0.63756,209,3.0,0.7
5,Copycat (1995),0.049583,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0.575581,86,4.0,0.6


## Save to csv

In [59]:
# Persist the enriched table to the working directory; the movie_id index is
# written out as the first CSV column.
movie_data.to_csv(path_or_buf='movie_data.csv', index=True)