In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import fileinput

#Fit the code to the screen
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading movies data

In [2]:
#Reading the movies data
my_path = !pwd
movies_filename = "/ml-1m/movies.dat"
dfmovies = pd.read_csv(my_path[0]+movies_filename, sep='::', header=None)
dfmovies.columns = ["movieId","title", "genres"]



In [3]:
#Extract the year from the title
dfmovies["year"] = dfmovies['title'].apply(lambda x: x[x.rfind("(")+1:x.rfind(")")])
dfmovies["name"] = dfmovies['title'].apply(lambda x: x[:x.rfind("(")-1])
del dfmovies['title']
#Get the columns to the original order
cols = dfmovies.columns.tolist()
cols = cols[:1] + cols[-1:] + cols[-2:-1] + cols[1:2]
dfmovies = dfmovies[cols]

### Reading ratings data

In [4]:
#Reading the ratings data
ratings_filename = "/ml-1m/ratings.dat"
dfratings = pd.read_csv(my_path[0]+ratings_filename, index_col=None, sep='::', header=None)
dfratings.columns = ["userId", "movieId", "rating", "timestamp"]

  app.launch_new_instance()


## Creating genres vector for each movie

In [5]:
#Disable false positive Pandas warning for 'SettingWithCopyWarning'
pd.options.mode.chained_assignment = None

#Adding genres vector to each movie
all_genres = ['Mystery', 'Sci-Fi', 'Crime', 'Drama', 'Animation', 'IMAX', 'Action', 'Comedy', 'Documentary', 'War', 'Romance', 'Horror', 
              'Film-Noir', 'Musical', 'Fantasy', 'Adventure', 'Children', 'Thriller', 'Western']
for genre in all_genres:
    dfmovies[genre] = dfmovies['genres'].apply(lambda x: x.find(genre)>=0).astype(int)

## Create users genre preference vectors

In [20]:
users = (dfratings["userId"]).unique()
genres_pref = []
for userId in users:
    user_ratings = dfratings[dfratings["userId"] == userId]
    movies_total = 0
    geners_rating_score = [0] * len(all_genres)
    for index, rating in user_ratings.iterrows():
        movieId = rating["movieId"]
        score = rating["rating"]
        movie_data = dfmovies.loc[dfmovies["movieId"] == movieId]
        for genre in all_genres:
            if (movie_data[genre].item() == 1):
                geners_rating_score[all_genres.index(genre)] += score
    genres_pref.append(geners_rating_score)
    if (userId % 1000 == 0):
        print userId

1000
2000
3000
4000
5000
6000


In [30]:
#Save the genres prefrence to dataframe and to pickle
user_pref_df = pd.DataFrame(genres_pref)
user_pref_df.columns = all_genres
with open(my_path[0]+'/explore/user_genre_pref.pik', 'wb') as handle:
    pickle.dump(user_pref_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
with open(my_path[0]+'/explore/user_genre_pref.pik', 'rb') as handle:
    user_pref_df = pickle.load(handle)

## Unify all the users features

### Reading the users data

In [32]:
users_filename = "/ml-1m/users.dat"
dfusers = pd.read_csv(my_path[0]+users_filename, sep='::', header=None)
dfusers.columns = ["userId","gender", "age", "occupation", "zip-code"]

  from ipykernel import kernelapp as app


In [35]:
#Extracting gender, age and occupation features
del dfusers["zip-code"]
user_features = dfusers.join(user_pref_df)

In [21]:
#Calculating number of ratings and average rating per user
users = (dfratings["userId"]).unique()
user_rate_num = []
user_rate_avg = []
user_rate_std = []
for userId in users:
    user_ratings = dfratings[dfratings["userId"] == userId]
    user_rate_num.append(user_ratings.shape[0])
    user_rate_avg.append(np.average(user_ratings["rating"]))
    user_rate_std.append(np.std(user_ratings["rating"]))

In [25]:
user_features["rate_num"] = user_rate_num
user_features["rate_avg"] = user_rate_avg
user_features["rate_std"] = user_rate_std

In [28]:
with open(my_path[0]+'/features/user_features.pik', 'wb') as handle:
    pickle.dump(user_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
with open(my_path[0]+'/features/user_features.pik', 'rb') as handle:
    user_features = pickle.load(handle)

### Calculate movies features

In [6]:
#Calculate average rating and number of ratings
all_movies = dfmovies["movieId"]
avg_rate = []
num_rate = []
std_rate = []
for movie in all_movies:
    movie_data = dfratings[dfratings["movieId"] == movie]
    num_rate.append(movie_data.shape[0])
    avg_rate.append(np.average(movie_data["rating"]))
    std_rate.append(np.std(movie_data["rating"]))



In [7]:
#Removing movies with 0 reviews
movies_features = dfmovies.copy(deep=True)
exclude_movies = []
for i in sorted(all_movies.index, reverse=True):
    if (num_rate[i] == 0):
        exclude_movies.append(all_movies[i])
        del avg_rate[i]
        del num_rate[i]
        movies_features.drop(movies_features.index[i], inplace=True)

## Unify all movies features

In [15]:
del movies_features["genres"]
movies_features["avg_rate"] = avg_rate
movies_features["num_rate"] = num_rate
movies_features["std_rate"] = std_rate

In [19]:
with open(my_path[0]+'/features/movie_features.pik', 'wb') as handle:
    pickle.dump(movies_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
with open(my_path[0]+'/features/movie_features.pik', 'rb') as handle:
    movies_features = pickle.load(handle)