In [11]:
import numpy as np
import pandas as pd
import operator

In [12]:
def load_rating_data():
    
    movies = pd.read_table('data/movies.dat', sep='::', header=None, names=['item', 'movie', 'genres'])
    movies['year'] = movies['movie'].map(lambda x: int(x[-5:-1]))
    movies['movie'] = movies['movie'].map(lambda x: x[:-7])
    movies['genres'] = movies['genres'].map(lambda x: x.split('|'))
    
    users = pd.read_table('data/users.dat', sep='::', header=None, names=['userid', 'gender', 'age-group', 'occupation', 'zip'])
    gender_dummies = pd.get_dummies(users.gender, prefix='gender')
    users = users.merge(gender_dummies, left_index=True, right_index=True)
    users.drop('gender', axis=1, inplace=True)
    users.drop('gender_M', axis=1, inplace=True)
    users.drop('zip', axis=1, inplace=True)

    
    
    return movies, users

In [13]:
movies, users = load_rating_data()



In [4]:
"""
Naive model:
age-group and maybe occupation to find most popular movies 
to suggest to a user whose movie preferences you know nothing about

Smarter model:
If you do have ratings for 
"""

'\nNaive model:\nage-group and maybe occupation to find most popular movies \nto suggest to a user whose movie preferences you know nothing about\n\nSmarter model:\nIf you do have ratings for \n'

In [5]:
prediction_popularity = pd.read_csv('data/predict_ratings_popularity.csv')

In [6]:
test_targets = pd.read_csv('data/dont_use.csv')

In [7]:
prediction_popularity.sort('rating', ascending=False).shape

(500109, 3)

In [8]:
movies.head()

Unnamed: 0,item,movie,genres,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [9]:
genres = []
for row in movies['genres']:
    genres += row
genres = set(genres)
    


In [10]:
genres
genres_d = {}
for i, genre in enumerate(genres):
    genres_d[genre] = i
genres_d

{'Action': 12,
 'Adventure': 11,
 'Animation': 10,
 "Children's": 8,
 'Comedy': 13,
 'Crime': 6,
 'Documentary': 14,
 'Drama': 1,
 'Fantasy': 3,
 'Film-Noir': 5,
 'Horror': 4,
 'Musical': 9,
 'Mystery': 0,
 'Romance': 7,
 'Sci-Fi': 2,
 'Thriller': 16,
 'War': 15,
 'Western': 17}

In [11]:
genre_mat = np.zeros((len(movies), len(genres)))


In [12]:
i = 0
for movie_genres in movies['genres']:
    for genre in movie_genres:
        genre_mat[i][genres_d[genre]] = 1
    i += 1

In [13]:
sort = sorted(genres_d.items(), key=operator.itemgetter(1))
sort = [x[0] for x in sort]
print sort
genres = pd.DataFrame(genre_mat, columns=sort)

['Mystery', 'Drama', 'Sci-Fi', 'Fantasy', 'Horror', 'Film-Noir', 'Crime', 'Romance', "Children's", 'Musical', 'Animation', 'Adventure', 'Action', 'Comedy', 'Documentary', 'War', 'Thriller', 'Western']


In [14]:
genres.head()

Unnamed: 0,Mystery,Drama,Sci-Fi,Fantasy,Horror,Film-Noir,Crime,Romance,Children's,Musical,Animation,Adventure,Action,Comedy,Documentary,War,Thriller,Western
0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [15]:
pd.merge(movies, genres, left_index=True, right_index=True)

Unnamed: 0,item,movie,genres,year,Mystery,Drama,Sci-Fi,Fantasy,Horror,Film-Noir,...,Children's,Musical,Animation,Adventure,Action,Comedy,Documentary,War,Thriller,Western
0,1,Toy Story,"[Animation, Children's, Comedy]",1995,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,0,0
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,"[Comedy, Drama]",1995,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,[Comedy],1995,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,6,Heat,"[Action, Crime, Thriller]",1995,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
6,7,Sabrina,"[Comedy, Romance]",1995,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,8,Tom and Huck,"[Adventure, Children's]",1995,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
8,9,Sudden Death,[Action],1995,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9,10,GoldenEye,"[Action, Adventure, Thriller]",1995,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0


In [16]:
movies.shape

(3883, 4)

In [17]:
prediction_popularity.shape

(500109, 3)

In [18]:
len(movies) * len(users)

23453320

In [19]:
prediction_popularity.head()

Unnamed: 0,user,rating,id
0,1,4.430283,1_1193
1,1,3.519802,1_661
2,1,4.166124,1_914
3,1,3.933333,1_3408
4,1,3.872768,1_2355


In [20]:
ratings.shape

NameError: name 'ratings' is not defined

In [None]:
ratings

In [None]:
prediction_popularity.head()

In [21]:
merged = prediction_popularity.merge(test_targets, on='id')

In [None]:
merged.sort('rating_x')

In [None]:
sample_sub = pd.read_csv("data/sample_submission.csv")

In [None]:
sample_sub.shape

In [None]:
sample_sub.head()

In [None]:
ratings.shape

In [14]:
ratings = pd.read_csv('data/training_ratings.csv')

In [None]:
def get_user_rating(user):
    

In [15]:
num_ratings_per_user = ratings.groupby('user').count()

In [None]:
ratings.head()

In [16]:
users = pd.merge(users, num_ratings_per_user, left_on='userid', right_index=True, how='outer')

In [23]:
users = users.drop(['movie', 'rating'], axis=1).fillna(0)

In [None]:
len(pd.unique(ratings['user']))

In [None]:
len(users)

In [None]:
users

In [None]:
prediction_popularity.head()

In [None]:
for row in prediction_popularity.iterrows():
    print row[1]['rating']
    break

In [26]:
hot = prediction_popularity
cold = prediction_popularity

In [None]:
final_pred = cold
for i in xrange(hot.shape[0]):
    user_id = hot['user'][i]
    user_ratings = users['id'][user_id]
    if user_ratings >= 3:
        rating = hot['rating'][i]
        final_pred['rating'][i] = rating
    if i%100000 == 0:
        print i

0
100000
200000
300000
400000

In [6]:
hot = pd.read_csv("data/predict_rankings_factorization.csv")
cold = pd.read_csv("data/predict_ratings_popularity.csv")

In [7]:
hot.describe()

Unnamed: 0,user,rating
count,500109.0,500109.0
mean,1640.429086,3.52635
std,1133.749679,0.355208
min,1.0,0.397404
25%,802.0,3.313746
50%,1506.0,3.537019
75%,2186.0,3.749439
max,6040.0,5.790454


In [8]:
cold.head()

Unnamed: 0,user,rating,id
0,1,4.430283,1_1193
1,1,3.519802,1_661
2,1,4.166124,1_914
3,1,3.933333,1_3408
4,1,3.872768,1_2355


In [52]:
merged = pd.merge(cold, users, how='left', left_on='user', right_on='userid')
mask = np.array(merged['id_y'] >= 20)

cold_ratings = np.array(cold['rating'])
hot_ratings = np.array(hot['rating'])

In [53]:
cold_ratings

array([ 4.43028322,  3.51980198,  4.16612378, ...,  3.68975069,
        3.58823529,  3.96023564])

In [54]:
np.putmask(cold_ratings, mask, hot_ratings)

In [55]:
cold_ratings

array([ 4.43028322,  3.51980198,  4.16612378, ...,  2.28210519,
        3.27403537,  3.30676589])