In [1]:
import numpy as np
import pandas as pd
import csv

# Predict each (user, artist) play count from the user's median play count
# within the artist's cluster. If the user has no plays in that cluster, the
# user's overall median is used instead.
# If the user has no training data at all, use the global median.

# File locations (relative paths).
# train_file: rows are read as (user, artist, plays) after a header line.
# test_file: rows are read as (Id, user, artist) after a header line.
# soln_file: output CSV written with the header "Id,plays".
# artist_tags_file: CSV read with pandas; the 'artist' and 'prediction'
#   columns map each artist to its cluster id.
train_file = 'data/train.csv'
test_file  = 'data/test.csv'
soln_file  = 'data/user_median.csv'
artist_tags_file = 'data/artists_clustered.csv'

In [2]:
# Read the training CSV into a nested mapping:
#   train_data[user][artist] -> play count (int)
train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    # Discard the header line (no error if the file is empty).
    next(train_csv, None)
    for row in train_csv:
        user, artist, plays = row[0], row[1], row[2]

        # Fetch this user's artist->plays dict, creating it on first sight.
        plays_by_artist = train_data.setdefault(user, {})
        plays_by_artist[artist] = int(plays)

In [3]:
# Compute the global median and per-user median.
#   user_medians[user] -> median of that user's play counts
#   global_median      -> median over every play count in the training data
# .items()/.values() are used instead of .iteritems() so this cell runs on
# both Python 2 and Python 3.
plays_array  = []
user_medians = {}
for user, user_data in train_data.items():
    # Only the play counts matter here; the artist keys are not needed.
    user_plays = list(user_data.values())
    plays_array.extend(user_plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

In [4]:
# Build the artist -> cluster id lookup from the clustered-artists CSV.
# If an artist appears on several rows, the last row wins.
cluster_pd = pd.read_csv(artist_tags_file)
artist_to_cluster = {
    record['artist']: record['prediction']
    for _, record in cluster_pd.iterrows()
}

In [5]:
# Per-user, per-cluster median play counts.
#   user_medians_per_cluster[user][c] is the median of the user's plays over
#   artists assigned to cluster c. When the user has no plays in cluster c,
#   the user's overall median (user_medians[user]) is stored instead.

# Keep at least 50 slots (the size previously hard-coded here), and widen the
# table if the cluster file contains larger ids so indexing cannot overflow.
num_clusters = max([50] + [int(c) + 1 for c in artist_to_cluster.values()])

user_medians_per_cluster = {}
# .items()/range are used instead of .iteritems()/xrange so this cell runs on
# both Python 2 and Python 3.
for user, user_data in train_data.items():
    # One independent bucket of play counts per cluster.
    cluster_plays = [[] for _ in range(num_clusters)]

    for artist, plays in user_data.items():
        # int() matches the cast applied when the test rows are scored, so a
        # cluster id that is not a plain int still works as a list index.
        cluster_plays[int(artist_to_cluster[artist])].append(plays)

    user_medians_per_cluster[user] = [
        np.median(np.array(cluster_play)) if cluster_play else user_medians[user]
        for cluster_play in cluster_plays
    ]

In [6]:
# Write out test solutions.
# Each test row is scored with the user's median for the artist's cluster.
# Users that do not appear in the training data get the global median.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    # Discard the header line (no error if the file is empty).
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            # Named row_id rather than `id` so the builtin id() is not shadowed.
            row_id = row[0]
            user   = row[1]
            artist = row[2]

            if user in user_medians:
                cluster = int(artist_to_cluster[artist])
                soln_csv.writerow([row_id, user_medians_per_cluster[user][cluster]])
            else:
                # Report the user (the message previously printed the row Id
                # under the label "User"). A single-argument print(...) call
                # behaves the same on Python 2 and Python 3.
                print("User %s not in training data." % user)
                soln_csv.writerow([row_id, global_median])
                