In [14]:
# Imports
import numpy as np
import csv
from sklearn import *

# Predict via the user-specific median.
# If the user has no data, use the global median.

# Hard-code file names
train_file = 'train.csv'
test_file  = 'test.csv'
soln_file  = 'predictions.csv'
profiles_file = 'profiles.csv'

In [26]:
# Load the profile data.
profile_data = {}
user_ids = []

with open(profiles_file, 'r') as profile_fh:
    profile_csv = csv.reader(profile_fh, delimiter=',', quotechar='"')
    next(profile_csv, None)
    for row in profile_csv:
        # user,sex,age,country
        user    = row[0]
        sex     = row[1]
        age     = row[2]
        country = row[3]

        if age == '':
            age = -1
        if sex == '':
            sex = 'u'
    
        if not user in profile_data:
            profile_data[user] = {}
            user_ids.append(user)
        
        profile_data[user]['sex'] = sex
        profile_data[user]['age'] = int(age)
        profile_data[user]['country'] = country

print len(user_ids)

233286


In [29]:
print "Number of users in profiles: " + str(len(user_ids))
print ""
for user_id in user_ids[1:20]:
    print profile_data[user_id]

Number of users in profiles: 233286

{'country': 'Iceland', 'age': 29, 'sex': 'm'}
{'country': 'United States', 'age': 30, 'sex': 'm'}
{'country': 'Germany', 'age': 21, 'sex': 'm'}
{'country': 'Netherlands', 'age': 24, 'sex': 'm'}
{'country': 'United States', 'age': 22, 'sex': 'm'}
{'country': 'United States', 'age': -1, 'sex': 'f'}
{'country': 'Poland', 'age': -1, 'sex': 'f'}
{'country': 'United States', 'age': -1, 'sex': 'm'}
{'country': 'Ukraine', 'age': 16, 'sex': 'u'}
{'country': 'Italy', 'age': 44, 'sex': 'm'}
{'country': 'Russian Federation', 'age': 17, 'sex': 'm'}
{'country': 'Germany', 'age': 25, 'sex': 'm'}
{'country': 'Austria', 'age': 24, 'sex': 'm'}
{'country': 'Australia', 'age': 22, 'sex': 'f'}
{'country': 'Germany', 'age': 34, 'sex': 'm'}
{'country': 'United States', 'age': -1, 'sex': 'm'}
{'country': 'United States', 'age': 22, 'sex': 'm'}
{'country': 'Canada', 'age': 22, 'sex': 'm'}
{'country': 'Germany', 'age': -1, 'sex': 'm'}


In [42]:
# Create indicator variables
columns = ['i_MALE','i_FEMALE','AGE']
ages = []
for user_id in user_ids:
    country = profile_data[user_id]['country']
    if country not in columns:
        columns.append(country)
    ages.append(profile_data[user_id]['age'])

# Calc the mean age
mean_age = np.mean(ages)
    
# Construct matrix
profile_matrix = np.zeros((len(user_ids), len(columns)))
for i, user_id in enumerate(user_ids):
    profile = profile_data[user_id]

    # Create indicator variable for MALE
    if profile['sex'] == 'm':
        profile_matrix[i, 0] = 1
    # Create indicator variable for FEMALE    
    elif profile['sex'] == 'f':
        profile_matrix[i, 1] = 1
        
    # Add a 1 for the country indicator
    country = profile['country']
    country_col = columns.index(country)
    profile_matrix[i, country_col] = 1

    # TODO: Calculate median age, replace nulls with 
    # TODO: Separate ages into 5 year indicator columns
    
    # Add age
    profile_matrix[i, 2] = profile['age']

In [75]:
user_pos_by_id = {}
for i, user_id in enumerate(user_ids):
    user_pos_by_id[user_id] = i

In [44]:
# Examine the data
profile_matrix.shape
profile_matrix[1:10]

array([[  1.,   0.,  29., ...,   0.,   0.,   0.],
       [  1.,   0.,  30., ...,   0.,   0.,   0.],
       [  1.,   0.,  21., ...,   0.,   0.,   0.],
       ..., 
       [  0.,   1.,  -1., ...,   0.,   0.,   0.],
       [  1.,   0.,  -1., ...,   0.,   0.,   0.],
       [  0.,   0.,  16., ...,   0.,   0.,   0.]])

In [59]:
# Load the training data.
train_data = {}
train_user_ids = []

# TRAIN-TEST SPLIT FOR TESTING PURPOSES
# Need to split on a per-user-artist level, not just per-user level
train_train_data = {}
train_test_data = {}

with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = row[2]
    
        if not user in train_data:
            train_data[user] = {}
            train_user_ids.append(user)
        
        train_data[user][artist] = int(plays)
        
        # Build train and test split
        if np.random.uniform(0,1,1) < .75:
            if not user in train_train_data:
                train_train_data[user] = {}
            train_train_data[user][artist] = int(plays)
        else:
            if not user in train_test_data:
                train_test_data[user] = {}
            train_test_data[user][artist] = int(plays)

In [8]:
# Examine the data
print "Number of users in train: " + str(len(train_user_ids))
print ""
for user_id in train_user_ids[1:3]:
    print train_data[user_id]

Number of users in train: 233286

{'e3e0abcd-7671-4482-a9d8-462f5acc9be5': 64, '63011a8d-0117-4f7e-9991-1ef1f337ff70': 13, 'f4857fb9-e255-4dc6-bd01-e4ca7cc68544': 21, 'c485632c-b784-4ee9-8ea1-c5fb365681fc': 45, 'a96ac800-bfcb-412a-8a63-0a98df600700': 35, '8dd98bdc-80ec-4e93-8509-2f46bafc09a7': 23, '69837400-8e31-4949-aac2-00b46b4df126': 18, 'a3a92047-be1c-4f3e-8960-c4f8570984df': 81, '648615ca-ca74-460d-928a-2bae67ae6d14': 19, '0110e63e-0a9b-4818-af8e-41e180c20b9a': 22, '6ffb8ea9-2370-44d8-b678-e9237bbd347b': 56, '9fdaa16b-a6c4-4831-b87c-bc9ca8ce7eaa': 20, '5441c29d-3602-4898-b1a1-b77fa23b8e50': 70, '9bf79f68-c064-44a1-8c2c-5764f1d7c016': 27, '4a4ee089-93b1-4470-af9a-6ff575d32704': 31, '9efff43b-3b29-4082-824e-bc82f646f93d': 22}
{'8d18b680-368c-4649-a5e3-85e0c2dd6fc2': 51, 'a4a3048f-3968-4848-9f53-94e3d4f88b53': 47, '6ffb8ea9-2370-44d8-b678-e9237bbd347b': 86, '9c9f1380-2516-4fc9-a3e6-f9f61941d090': 145, 'eeb1195b-f213-4ce1-b28c-8565211f8e43': 708, '24ea074c-59cc-41c5-a5de-f68c2952965f'

In [67]:
# CALCULATE THE NUMBER OF SONGS WE ARE ESTIMATING IN OUR TRIAN_TEST SAMPLE
num_songs_estimating = 0

for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        num_songs_estimating += 1
        
print num_songs_estimating

1039494


## KMeans - Let's Cluster the Users

In [85]:
KM = sklearn.cluster.KMeans(n_clusters=50, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=37)
# Calls fit and then predict
predict = KM.fit_predict(profile_matrix)

In [86]:
print "The objective function: %f" % KM.score(profile_matrix) 

The objective function: -258989.198517


In [54]:
# Examine the predicted clusters
print predict[1:10]

[3 3 7 8 7 2 2 2 0]


## Keep track of score

10 clusters
Indicators for basic params: -948858.492895

20 clusters
Indicatros for basic params: -448359.382828

## Test different techniques

### GLOBAL & PER-USER MEDIANS (GIVEN)

In [65]:
abs_error = 0

# TEST WITH GLOBAL and per-USER MEDIAN (GIVEN)
# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
for user, user_data in train_train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        if user in user_medians:
            abs_error += abs(user_medians[user] - plays)
        else:
            print "User", user, "not in train_train data."
            abs_error += abs(global_median - plays)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
MEAN ABSOLUTE ERROR 140.056850


### KMEANS CLUSTERING

In [81]:
abs_error = 0

cluster_artist_plays = {}

for user, user_data in train_train_data.iteritems():
    cluster_id = predict[user_pos_by_id[user]]
    for artist, plays in user_data.iteritems():
        if cluster_id not in cluster_artist_plays:
            cluster_artist_plays[cluster_id] = {}
        
        if artist not in cluster_artist_plays[cluster_id]:
            cluster_artist_plays[cluster_id][artist] = []
        
        cluster_artist_plays[cluster_id][artist].append(plays)

print "Finished setting up cluster author play dictionary, running test now"

# Precalculate the cluster-artist medians
cluster_artist_medians = {}
for cluster_id, cluster_data in cluster_artist_plays.iteritems():
    artist_medians = {}
    for artist, plays in cluster_data.iteritems():
        artist_medians[artist] = np.median(plays)
    cluster_artist_medians[cluster_id] = artist_medians

for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        cluster_id = predict[user_pos_by_id[user]]
        
        if artist in cluster_artist_plays[cluster_id]:
            abs_error += abs(np.median(cluster_artist_plays[cluster_id][artist]) - plays)
        elif user in user_medians:
            abs_error += abs(user_medians[user] - plays)
        else:
            print "User", user, "not in train_train data."
            abs_error += abs(global_median - plays)
            #print "Cluster+Artist: ", cluster_id, artist, "not in train_train data."
            #abs_error += abs(0 - plays)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

Finished setting up cluster author play dictionary, running test now
MEAN ABSOLUTE ERROR 199.005376


In [87]:
abs_error = 0

cluster_artist_plays = {}

# The median number of per user per cluster
cluster_user_plays = {}

# The cluster medians
cluster_medians = {}

user_medians = {}

for user, user_data in train_train_data.iteritems():
    cluster_id = predict[user_pos_by_id[user]]
    user_plays = []
    for artist, plays in user_data.iteritems():
        if cluster_id not in cluster_artist_plays:
            cluster_artist_plays[cluster_id] = {}
        
        if artist not in cluster_artist_plays[cluster_id]:
            cluster_artist_plays[cluster_id][artist] = []
        
        cluster_artist_plays[cluster_id][artist].append(plays)

        user_plays.append(plays)

    median = np.median(np.array(user_plays))
    user_medians[user] = median
    if cluster_id not in cluster_user_plays:
        cluster_user_plays[cluster_id] = []
    cluster_user_plays[cluster_id].append(median) 
    
# Calculate per-cluster user median
for cluster_id, user_plays in cluster_user_plays.iteritems():
    cluster_medians[cluster_id] = np.median(user_plays)
    
user_ratios = {}
    
# Calculate the user-cluster play ratios
for user_id, median_plays in user_medians.iteritems():
    cluster_id = predict[user_pos_by_id[user_id]]
    user_ratios[user_id] = 1.0 * median_plays / cluster_medians[cluster_id]
        
print "Finished setting up cluster author play dictionary, running test now"

# Precalculate the cluster-artist medians
cluster_artist_medians = {}
for cluster_id, cluster_data in cluster_artist_plays.iteritems():
    artist_medians = {}
    for artist, plays in cluster_data.iteritems():
        artist_medians[artist] = np.median(plays)
    cluster_artist_medians[cluster_id] = artist_medians

for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        cluster_id = predict[user_pos_by_id[user]]
        if artist in cluster_artist_plays[cluster_id]:
            user_ratio = 1.0
            if user in user_ratios:
                user_ratio = user_ratios[user]
            prediction = user_ratio * cluster_artist_medians[cluster_id][artist]
            abs_error += abs(prediction - plays)
        elif user in user_medians:
            abs_error += abs(user_medians[user] - plays)
        else:
            print "User", user, "not in train_train data."
            abs_error += abs(global_median - plays)
            #print "Cluster+Artist: ", cluster_id, artist, "not in train_train data."
            #abs_error += abs(0 - plays)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

Finished setting up cluster author play dictionary, running test now
MEAN ABSOLUTE ERROR 153.856717


# MEAN ABSOLUTE ERROR

Global + Per-User (Given): 140.056850

KMeans (10 clusters): 198.894616

KMeans (20 clusters): 199.083787
KMeans (20 clusters) + user + global: 199.005376

KMeans (20 clusters) + per-user ratio: 149.274154

KMeans (50 clusters) + per-user ratio: 153.856717

## OUTPUT

In [30]:
# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
for user, user_data in train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]

            if user in user_medians:
                soln_csv.writerow([id, user_medians[user]])
            else:
                print "User", id, "not in training data."
                soln_csv.writerow([id, global_median])
                