In [4]:
# Imports
import numpy as np
import csv
import sklearn
import pandas as pd

# Predict via the user-specific median.
# If the user has no data, use the global median.

# Hard-code file names
train_file = 'data/train.csv'
test_file  = 'data/test.csv'
soln_file  = 'data/predictions.csv'
profiles_file = 'data/profiles.csv'
artist_file = 'data/artists.csv'
artist_tags_file = 'data/artists_tagged.csv'

In [5]:
df = pd.read_csv(train_file)
print df.shape

(4154804, 3)


In [6]:
artist_df = pd.read_csv(artist_file)
artists = {}
for idx, artist in enumerate(artist_df['artist']):
    artists[artist] = idx

In [7]:
user_df = pd.read_csv(profiles_file)
users = {}
for idx, user in enumerate(user_df['user']):
    users[user] = idx

In [8]:
n_users = df.user.unique().shape[0]
n_items = df.artist.unique().shape[0]
print str(n_users) + ' users'
print str(n_items) + ' items'

233286 users
2000 items


In [9]:
ratings = np.zeros((n_users, n_items))
for idx, row in enumerate(df.itertuples()):
    if idx % 1000000 == 0:
        print idx
    ratings[users[row[1]], artists[row[2]]] = row[3]
ratings

0
1000000
2000000
3000000
4000000


array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [15]:
# sparsity = float(len(ratings.nonzero()[0]))
# sparsity /= (ratings.shape[0] * ratings.shape[1])
# sparsity *= 100
# print 'Sparsity: {:4.2f}%'.format(sparsity)

apu = float(len(ratings.nonzero()[0]))
print apu
print ratings.shape[1]
apu /= (ratings.shape[1])
print apu

total_plays = sum(ratings.nonzero()[0])
print total_plays



4154804.0
2000
2077.402
[   148    425    527 ..., 234933 234953 235172]


In [16]:
print sum(ratings.nonzero()[0])

484639466854


In [8]:
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in xrange(ratings.shape[0]):
        if len(ratings[user, :].nonzero()) > 10:
            test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                            size=3, 
                                            replace=False)
            train[user, test_ratings] = 0.
            test[user, test_ratings] = ratings[user, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

In [9]:
train, test = train_test_split(ratings)

In [None]:
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [None]:
user_similarity = fast_similarity(train, kind='user')
item_similarity = fast_similarity(train, kind='item')
print item_similarity[:4, :4]

In [None]:
def predict_fast_simple(ratings, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [None]:
predict_fast_simple(train, user_similarity, kind='user')