In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import seaborn as sns
import sys
import sklearn.metrics as metrics

# Print NumPy arrays in full: with the threshold at sys.maxsize the "..." summarisation
# of large arrays is effectively never triggered.
np.set_printoptions(threshold=sys.maxsize)
import warnings
# NOTE(review): this hides every warning for the rest of the session, including
# deprecation warnings from pandas / scikit-learn — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
# Read the tab-separated ratings file. It has no header row, so the column
# names are supplied while parsing.
ratings_df = pd.read_csv(
    'data/u.data',
    sep="\t",
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)
# Keep only the (user, movie, rating) triple; the timestamp column is dropped here.
ratings = ratings_df.drop(columns=['timestamp'])
ratings

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [3]:
# Order the ratings by user first and by movie second, then renumber the rows from 0.
rating_sort = (
    ratings
    .sort_values(by=['user_id', 'movie_id'])
    .reset_index(drop=True)
)
rating_sort

Unnamed: 0,user_id,movie_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3
...,...,...,...
99995,943,1067,2
99996,943,1074,4
99997,943,1188,3
99998,943,1228,3


In [4]:
# Load the movies data (pipe-separated, Latin-1 encoded, no header row) and name
# the columns: five descriptive fields followed by one indicator column per genre.
item = pd.read_csv(
    'data/u.item',
    sep="|",
    encoding='latin-1',
    header=None,
    names=[
        'movie_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
        'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
)
item.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Drop the descriptive fields and the 'unknown' flag, leaving movie_id plus the
# remaining genre indicator columns.
item_genres01 = item.drop(
    columns=['movie title', 'release date', 'video release date', 'IMDb URL', 'unknown'],
)
item_genres01

Unnamed: 0,movie_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Persist the movie/genre indicator table. The row index is written as the first,
# unnamed column (index=True is the pandas default, spelled out here).
item_genres01.to_csv('item_genres01.csv', index=True)

In [6]:
# Attach the genre indicator columns to every rating row.
# The join key is spelled out: without `on=`, merge joins on *all* column names the
# two frames share, so an additional shared column would silently change the result.
# `validate` makes pandas raise a MergeError if a movie_id appears more than once in
# item_genres01, which would otherwise duplicate rating rows.
user_genre = pd.merge(
    rating_sort,
    item_genres01,
    on='movie_id',
    how='inner',
    validate='many_to_one',
)
user_genre

Unnamed: 0,user_id,movie_id,rating,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,5,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,4,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,1,4,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,1,4,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10,1,4,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,863,1678,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,863,1679,3,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
99997,863,1680,2,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
99998,896,1681,3,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Turn each genre indicator into the rating itself: a genre column holds the row's
# rating when the movie belongs to that genre and stays 0 otherwise.
# The indicators are 0/1, so multiplying them row-wise by the rating yields exactly
# that. This avoids listing every genre by hand and avoids handing a whole Series to
# DataFrame.replace as a replacement value, which is not a documented form of that call.
non_genre_cols = ('user_id', 'movie_id', 'rating')
genre_cols = [col for col in user_genre.columns if col not in non_genre_cols]

# Work on a copy so user_genre keeps its original 0/1 indicators.
ratings1 = user_genre.copy()
# axis=0 aligns the rating Series with the rows (by index), not with the column labels.
ratings1[genre_cols] = user_genre[genre_cols].mul(user_genre['rating'], axis=0)

In [8]:
# Group each user's rows together, discard the per-movie columns that are no longer
# needed, and renumber the rows from 0.
ratings2 = (
    ratings1
    .sort_values('user_id')
    .drop(columns=['movie_id', 'rating'])
    .reset_index(drop=True)
)
ratings2

Unnamed: 0,user_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,5,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,4,4,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0
2,1,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,5,0,0
3,1,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,943,0,0,0,0,4,0,0,4,0,0,0,0,0,0,0,0,0,0
99996,943,4,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,4,0
99997,943,3,3,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0
99998,943,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0


In [9]:
# A 0 in a genre column means "this movie is not in that genre", not "rated 0".
# Turn those zeros into NaN so they do not count towards the averages.
genre_names = (
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
)
# One {0: NaN} mapping per genre column; user_id is deliberately left untouched.
ratings3 = ratings2.replace({genre: {0: np.nan} for genre in genre_names})
ratings3

Unnamed: 0,user_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,,,5.0,5.0,5.0,,,,,,,,,,,,,
1,1,4.0,4.0,,,,,,,,,,,,,4.0,,,
2,1,,,,,,,,5.0,,,,,,,,5.0,,
3,1,,,,,,,,5.0,,,,,,,,,,
4,1,,,,2.0,2.0,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,943,,,,,4.0,,,4.0,,,,,,,,,,
99996,943,4.0,,,,,,,4.0,,,,,,,,,4.0,
99997,943,3.0,3.0,,,,,,,,,,,,,3.0,,,
99998,943,,,,,,,,4.0,,,,,,,,,,


In [10]:
# Average rating per user and genre. NaN entries are skipped by mean(); a genre the
# user never rated comes out as NaN and is then stored as 0.
ratings4 = (
    ratings3
    .groupby('user_id')
    .mean()
    .fillna(0)
)
ratings4

Unnamed: 0_level_0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,3.333333,2.928571,3.333333,2.200000,3.472527,3.440000,4.8,3.925234,3.5,5.0,3.461538,2.923077,3.600000,3.931818,4.000000,3.615385,3.680000,3.666667
2,3.800000,4.333333,4.000000,3.000000,3.812500,3.777778,0.0,3.828571,3.0,4.5,3.000000,3.000000,3.500000,4.125000,3.750000,3.583333,3.666667,0.000000
3,2.785714,3.500000,0.000000,0.000000,2.583333,3.000000,5.0,2.909091,0.0,2.5,2.400000,2.000000,3.181818,3.400000,2.750000,2.523810,2.800000,0.000000
4,3.875000,3.500000,0.000000,0.000000,5.000000,4.750000,5.0,4.500000,0.0,0.0,4.000000,5.000000,4.000000,4.333333,3.833333,3.909091,4.500000,0.000000
5,3.142857,3.242424,3.785714,2.448276,3.000000,3.888889,0.0,2.666667,2.5,5.0,2.535714,3.333333,3.000000,2.315789,3.515152,2.947368,3.214286,2.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,4.055556,3.500000,4.000000,4.000000,4.533333,4.333333,0.0,4.555556,4.0,0.0,3.000000,4.000000,4.000000,4.800000,4.125000,4.083333,5.000000,0.000000
940,3.434783,3.000000,4.500000,4.200000,3.523810,4.285714,0.0,3.604167,0.0,4.0,3.000000,3.142857,4.333333,3.541667,3.052632,3.350000,3.266667,0.000000
941,3.800000,3.857143,4.666667,4.500000,4.142857,3.000000,0.0,4.200000,0.0,0.0,0.000000,4.000000,5.000000,5.000000,3.875000,4.000000,5.000000,0.000000
942,4.111111,4.727273,4.750000,4.307692,4.090909,0.000000,0.0,4.419355,4.0,5.0,3.666667,4.400000,4.000000,4.411765,4.166667,4.000000,4.700000,4.666667


In [12]:
# Save the per-user genre averages. The index (user_id) is written as the first
# column (index=True is the pandas default, spelled out here).
ratings4.to_csv('AV_user_genre.csv', index=True)