In [7]:
# Imports
import numpy as np
import csv
from sklearn import *

# Predict via the user-specific median.
# If the user has no data, use the global median.

# Hard-code file names
train_file = 'train.csv'
test_file  = 'test.csv'
soln_file  = 'predictions.csv'
profiles_file = 'profiles.csv'
artist_file = 'artists.csv'

In [8]:
# Load the profile data.
profile_data = {}
user_ids = []

with open(profiles_file, 'r') as profile_fh:
    profile_csv = csv.reader(profile_fh, delimiter=',', quotechar='"')
    next(profile_csv, None)
    for row in profile_csv:
        # user,sex,age,country
        user    = row[0]
        sex     = row[1]
        age     = row[2]
        country = row[3]

        if age == '':
            age = -1
        if sex == '':
            sex = 'u'
    
        if not user in profile_data:
            profile_data[user] = {}
            user_ids.append(user)
        
        profile_data[user]['sex'] = sex
        profile_data[user]['age'] = int(age)
        profile_data[user]['country'] = country

print len(user_ids)

233286


In [9]:
print "Number of users in profiles: " + str(len(user_ids))
print ""
for user_id in user_ids[1:20]:
    print profile_data[user_id]

Number of users in profiles: 233286

{'country': 'Iceland', 'age': 29, 'sex': 'm'}
{'country': 'United States', 'age': 30, 'sex': 'm'}
{'country': 'Germany', 'age': 21, 'sex': 'm'}
{'country': 'Netherlands', 'age': 24, 'sex': 'm'}
{'country': 'United States', 'age': 22, 'sex': 'm'}
{'country': 'United States', 'age': -1, 'sex': 'f'}
{'country': 'Poland', 'age': -1, 'sex': 'f'}
{'country': 'United States', 'age': -1, 'sex': 'm'}
{'country': 'Ukraine', 'age': 16, 'sex': 'u'}
{'country': 'Italy', 'age': 44, 'sex': 'm'}
{'country': 'Russian Federation', 'age': 17, 'sex': 'm'}
{'country': 'Germany', 'age': 25, 'sex': 'm'}
{'country': 'Austria', 'age': 24, 'sex': 'm'}
{'country': 'Australia', 'age': 22, 'sex': 'f'}
{'country': 'Germany', 'age': 34, 'sex': 'm'}
{'country': 'United States', 'age': -1, 'sex': 'm'}
{'country': 'United States', 'age': 22, 'sex': 'm'}
{'country': 'Canada', 'age': 22, 'sex': 'm'}
{'country': 'Germany', 'age': -1, 'sex': 'm'}


In [10]:
# Create indicator variables
columns = ['i_MALE','i_FEMALE']

# Add countries
for user_id in user_ids:
    country = profile_data[user_id]['country']
    if country not in columns:
        columns.append(country)

# Add ages
ages = [15, 20, 25, 30, 35, 40, 45, 50, 55, 10000] # AGES
columns.extend(ages)
    
# Construct matrix
profile_matrix = np.zeros((len(user_ids), len(columns)))
age_matrix = np.zeros((len(user_ids), len(ages)))
for i, user_id in enumerate(user_ids):
    profile = profile_data[user_id]

    # Create indicator variable for MALE
    if profile['sex'] == 'm':
        profile_matrix[i, 0] = 1
    # Create indicator variable for FEMALE    
    elif profile['sex'] == 'f':
        profile_matrix[i, 1] = 1
        
    # Add a 1 for the country indicator
    country = profile['country']
    country_col = columns.index(country)
    profile_matrix[i, country_col] = 1

    # TODO: Calculate median age, replace nulls with 
    
    # Add age
    index = 0
    for j, age in enumerate(ages):
        if profile['age'] < age:
            index = j
        else:
            break
    age_matrix[i, index] = 1
    
profile_matrix = np.hstack((profile_matrix, age_matrix))

In [11]:
# Examine the data
profile_matrix.shape
profile_matrix[1:10]

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  1.,  0., ...,  0.,  0.,  1.],
       [ 1.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [12]:
user_pos_by_id = {}
for i, user_id in enumerate(user_ids):
    user_pos_by_id[user_id] = i

In [13]:
# Load the artist data.
artist_names = {}
artist_ids = []

with open(artist_file, 'r') as artist_fh:
    artist_csv = csv.reader(artist_fh, delimiter=',', quotechar='"')
    next(artist_csv, None)
    for row in artist_csv:
        # user,sex,age,country
        artist    = row[0]
        name      = row[1]
        
        artist_names[artist] = name
        artist_ids.append(artist)
        
artist_pos_by_id = {}
for i, artist_id in enumerate(artist_ids):
    artist_pos_by_id[artist_id] = i

In [14]:
# Load the training data.
train_data = {}
train_user_ids = []

# TRAIN-TEST SPLIT FOR TESTING PURPOSES
# Need to split on a per-user-artist level, not just per-user level
train_train_data = {}
train_test_data = {}

with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = row[2]
    
        if not user in train_data:
            train_data[user] = {}
            train_user_ids.append(user)
        
        train_data[user][artist] = int(plays)
        
        # Build train and test split
        if np.random.uniform(0,1,1) < .75:
            if not user in train_train_data:
                train_train_data[user] = {}
            train_train_data[user][artist] = int(plays)
        else:
            if not user in train_test_data:
                train_test_data[user] = {}
            train_test_data[user][artist] = int(plays)

In [8]:
# Examine the data
print "Number of users in train: " + str(len(train_user_ids))
print ""
for user_id in train_user_ids[1:3]:
    print train_data[user_id]

Number of users in train: 233286

{'e3e0abcd-7671-4482-a9d8-462f5acc9be5': 64, '63011a8d-0117-4f7e-9991-1ef1f337ff70': 13, 'f4857fb9-e255-4dc6-bd01-e4ca7cc68544': 21, 'c485632c-b784-4ee9-8ea1-c5fb365681fc': 45, 'a96ac800-bfcb-412a-8a63-0a98df600700': 35, '8dd98bdc-80ec-4e93-8509-2f46bafc09a7': 23, '69837400-8e31-4949-aac2-00b46b4df126': 18, 'a3a92047-be1c-4f3e-8960-c4f8570984df': 81, '648615ca-ca74-460d-928a-2bae67ae6d14': 19, '0110e63e-0a9b-4818-af8e-41e180c20b9a': 22, '6ffb8ea9-2370-44d8-b678-e9237bbd347b': 56, '9fdaa16b-a6c4-4831-b87c-bc9ca8ce7eaa': 20, '5441c29d-3602-4898-b1a1-b77fa23b8e50': 70, '9bf79f68-c064-44a1-8c2c-5764f1d7c016': 27, '4a4ee089-93b1-4470-af9a-6ff575d32704': 31, '9efff43b-3b29-4082-824e-bc82f646f93d': 22}
{'8d18b680-368c-4649-a5e3-85e0c2dd6fc2': 51, 'a4a3048f-3968-4848-9f53-94e3d4f88b53': 47, '6ffb8ea9-2370-44d8-b678-e9237bbd347b': 86, '9c9f1380-2516-4fc9-a3e6-f9f61941d090': 145, 'eeb1195b-f213-4ce1-b28c-8565211f8e43': 708, '24ea074c-59cc-41c5-a5de-f68c2952965f'

In [15]:
# CALCULATE THE NUMBER OF SONGS WE ARE ESTIMATING IN OUR TRIAN_TEST SAMPLE
num_songs_estimating = 0

for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        num_songs_estimating += 1
        
print num_songs_estimating

1037790


## KMeans - Let's Cluster the Users

In [114]:
KM = sklearn.cluster.KMeans(n_clusters=5, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=37)
# Calls fit and then predict
predict = KM.fit_predict(profile_matrix)

In [115]:
print "The objective function: %f" % KM.score(profile_matrix) 

The objective function: -211766.969515


In [109]:
# Examine the predicted clusters
print predict[1:10]

[6 8 4 0 8 9 9 7 6]


## Keep track of score

### Indicators for basic params
5 clusters: -4578883.573153
10 clusters: -948858.492895
20 clusters: -448359.382828
25 clusters:  -372704.076086

### Better indicator columns for age:
5 clusters: -211766.969515
10 clusters: -170864.058997
20 clusters: -129169.230617

## Test different techniques

### GLOBAL & PER-USER MEDIANS (GIVEN)

In [135]:
abs_error = 0

# TEST WITH GLOBAL and per-USER MEDIAN (GIVEN)
# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
artist_plays_array = {}

for user, user_data in train_train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)
        
        if artist not in artist_plays_array:
            artist_plays_array[artist] = []
        
        artist_plays_array[artist].append(plays)
        
    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))
    
for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        if user in user_medians:
            prediction = user_medians[user]
            abs_error += abs(prediction - plays) 
        else:
            print "User", user, "not in train_train data."
            abs_error += abs(global_median - plays)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
MEAN ABSOLUTE ERROR 140.056850


## Given with Changes

In [134]:
abs_error = 0

# TEST WITH GLOBAL and per-USER MEDIAN (GIVEN)
# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
artist_plays_array = {}

drop_ratio = .9

cluster_plays_array = {}

for user, user_data in train_train_data.iteritems():
    user_plays = []
    cluster_id = predict[user_pos_by_id[user]]
    if cluster_id not in cluster_plays_array:
        cluster_plays_array[cluster_id] = []
    
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)
        
        if artist not in artist_plays_array:
            artist_plays_array[artist] = []
        
        artist_plays_array[artist].append(plays)
        
    user_median = np.median(np.array(user_plays))
    user_medians[user] = user_median
    cluster_plays_array[cluster_id].append(user_median)
    
global_median = np.median(np.array(plays_array))
#global_mean = np.mean(np.array(plays_array))

cluster_ratios = {}
for cluster_id, cluster_data in cluster_plays_array.iteritems():
    cluster_ratios[cluster_id] = np.median(cluster_data) / global_median

artist_ratios = {}
for artist, artist_data in artist_plays_array.iteritems():
    #artist_ratios[artist] = np.median(artist_data) - global_median
    artist_ratios[artist] = np.median(artist_data) / global_median
    
for user, user_data in train_test_data.iteritems():
    cluster_id = predict[user_pos_by_id[user]]
    for artist, plays in user_data.iteritems():
        if user in user_medians:
            prediction = user_medians[user] * artist_ratios[artist] * drop_ratio * cluster_ratios[cluster_id]
            abs_error += abs(prediction - plays) 
        else:
            print "User", user, "not in train_train data."
            abs_error += abs(global_median - plays)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
MEAN ABSOLUTE ERROR 146.564527


## Co-Occurance

In [16]:
# Construct the co-occurance matrix
artist_plays_cooccurance = np.zeros((len(artist_pos_by_id), len(artist_pos_by_id)))

for user, user_data in train_train_data.iteritems():
    for artist_1, plays_1 in user_data.iteritems():
        for artist_2, plays_2 in user_data.iteritems():
            artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_2]] += plays_2 * 1.0 / plays_1

In [17]:
# Normalize!
for i in range(len(artist_pos_by_id)):
    norm_factor = artist_plays_cooccurance[i, i]
    for j in range(len(artist_pos_by_id)):
        artist_plays_cooccurance[i, j] /= norm_factor

In [18]:
print artist_plays_cooccurance[0:10,0:10]

[[  1.00000000e+00   3.07399955e-03   8.87521764e-03   0.00000000e+00
    1.43453312e-03   1.27317323e-02   0.00000000e+00   0.00000000e+00
    0.00000000e+00   7.16298438e-03]
 [  6.75186882e-04   1.00000000e+00   1.69967806e-03   0.00000000e+00
    5.69809866e-03   1.19556844e-02   1.34698276e-03   0.00000000e+00
    0.00000000e+00   7.89914463e-03]
 [  1.17247325e-02   7.46060576e-03   1.00000000e+00   0.00000000e+00
    8.04970373e-03   1.01068096e-02   0.00000000e+00   0.00000000e+00
    9.06927758e-04   1.44810334e-02]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    2.74996777e-03   1.80783336e-01   2.80500447e-02   2.28186664e-03
    0.00000000e+00   0.00000000e+00]
 [  2.59123300e-03   9.52609249e-03   5.98537119e-04   1.37321853e-03
    1.00000000e+00   8.43219707e-02   0.00000000e+00   0.00000000e+00
    8.43420645e-03   0.00000000e+00]
 [  9.02590526e-04   9.52158720e-04   8.95830228e-04   1.14171383e-02
    3.28545010e-03   1.00000000e+00   8.71032

In [22]:
abs_error = 0

plays_array  = []
user_medians = {}
user_total_plays = {}
artist_plays_array = {}

drop_ratio = 1.0

cluster_plays_array = {}

for user, user_data in train_train_data.iteritems():
    user_plays = []
    #cluster_id = predict[user_pos_by_id[user]]
    #if cluster_id not in cluster_plays_array:
    #    cluster_plays_array[cluster_id] = []
    
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)
        
        if artist not in artist_plays_array:
            artist_plays_array[artist] = []
        
        artist_plays_array[artist].append(plays)
        
    user_median = np.median(np.array(user_plays))
    user_medians[user] = user_median
    user_total_plays[user] = np.sum(user_plays)
#    cluster_plays_array[cluster_id].append(user_median)
    
global_median = np.median(np.array(plays_array))
#global_mean = np.mean(np.array(plays_array))

#cluster_ratios = {}
#for cluster_id, cluster_data in cluster_plays_array.iteritems():
#    cluster_ratios[cluster_id] = np.median(cluster_data) / global_median

artist_ratios = {}
for artist, artist_data in artist_plays_array.iteritems():
    #artist_ratios[artist] = np.median(artist_data) - global_median
    artist_ratios[artist] = np.median(artist_data) / global_median
    
itr = 0
    
# PREDICT!
for user, user_data in train_test_data.iteritems():
    itr += 1
    
    #cluster_id = predict[user_pos_by_id[user]]
    if user in user_total_plays:
        user_total = user_total_plays[user]
        user_train = train_train_data[user]
    
        for artist_to_predict, plays_to_predict in user_data.iteritems():
            prediction = 0
            for artist_cooccur, plays_cooccur in user_train.iteritems():
                prediction += (1.0 * plays_cooccur / user_total) * (plays_cooccur * artist_plays_cooccurance[artist_pos_by_id[artist_cooccur], artist_pos_by_id[artist_to_predict]])
        
            #prediction = user_medians[user] * artist_ratios[artist] * drop_ratio * cluster_ratios[cluster_id]
            abs_error += abs(prediction - plays_to_predict) 
            if itr < 10:
                print "User", user, "Actual: ", plays_to_predict, "Prediction: ", prediction

    else:
        print "User", user, "not in train_train data."
        for artist_to_predict, plays_to_predict in user_data.iteritems():
            abs_error += abs(global_median - plays_to_predict)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

User f283c15ed4180e686384dc1de2a5cbf5f95ae269 Actual:  3 Prediction:  1.13731897507
User f283c15ed4180e686384dc1de2a5cbf5f95ae269 Actual:  3 Prediction:  1.19945053006
User 5909125332c108365a26ccf0ee62636eee08215c Actual:  415 Prediction:  5.33860637449
User 5909125332c108365a26ccf0ee62636eee08215c Actual:  327 Prediction:  37.7821565346
User 5909125332c108365a26ccf0ee62636eee08215c Actual:  306 Prediction:  18.0026577854
User 5909125332c108365a26ccf0ee62636eee08215c Actual:  376 Prediction:  12.3594019268
User 0eae120959c04371c23af09abaf71305ab2a1b3c Actual:  306 Prediction:  2.46022292209
User 0eae120959c04371c23af09abaf71305ab2a1b3c Actual:  170 Prediction:  4.46785056636
User 0eae120959c04371c23af09abaf71305ab2a1b3c Actual:  248 Prediction:  7.35057333033
User 734f7337c7d33e99fa60a6361a5df8e3fb939ecf Actual:  259 Prediction:  6.76629462309
User 734f7337c7d33e99fa60a6361a5df8e3fb939ecf Actual:  304 Prediction:  7.54324370899
User 734f7337c7d33e99fa60a6361a5df8e3fb939ecf Actual:  187

In [57]:
# Construct the co-occurance matrix
artist_plays_cooccurance = np.zeros((len(artist_pos_by_id), len(artist_pos_by_id)))

for user, user_data in train_train_data.iteritems():
    for i, (artist_1, plays_1) in enumerate(user_data.iteritems()):
        for j, (artist_2, plays_2) in enumerate(user_data.iteritems()):
            if j > i:
                artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_2]] += plays_2 / plays_1
                artist_plays_cooccurance[artist_pos_by_id[artist_2], artist_pos_by_id[artist_1]] += plays_1 / plays_2
            elif j==i:
                artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_1]] += 1
            
# Normalize!
for i in range(len(artist_pos_by_id)):
    norm_factor = artist_plays_cooccurance[i, i]
    for j in range(len(artist_pos_by_id)):
        artist_plays_cooccurance[i, j] /= norm_factor

itr = 0
abs_error = 0
    
# PREDICT!
for user, user_data in train_test_data.iteritems():
    itr += 1
    
    #cluster_id = predict[user_pos_by_id[user]]
    if user in user_total_plays:
        user_total = user_total_plays[user]
        user_train = train_train_data[user]
        if itr < 10:
            print "------"
            print "User", user, "TRAIN: ", user_train
    
        for artist_to_predict, plays_to_predict in user_data.iteritems():
#            prediction = []
            prediction = 0
            for artist_cooccur, plays_cooccur in user_train.iteritems():
                #if plays_cooccur > 1:
                    #prediction.append(plays_cooccur * artist_plays_cooccurance[artist_pos_by_id[artist_cooccur], artist_pos_by_id[artist_to_predict]])
                prediction += 1.0*plays_cooccur/user_total * (plays_cooccur * artist_plays_cooccurance[artist_pos_by_id[artist_cooccur], artist_pos_by_id[artist_to_predict]])
        
            #if len(prediction) > 0:
            #    prediction = np.mean(prediction)#1.0 * prediction / user_total
            #else:
            #    prediction = 1.0
            w_prediction = user_medians[user] * .9 + prediction * .4
            #prediction = prediction# * user_medians[user]* artist_ratios[artist]
            #user_medians[user] * artist_ratios[artist] * drop_ratio * cluster_ratios[cluster_id]
            abs_error += abs(w_prediction - plays_to_predict) 
            if itr < 10:
                print "Actual: ", plays_to_predict, "Prediction: ", w_prediction, (prediction > plays_to_predict), "predict: ", prediction

    else:
        print "User", user, "not in train_train data."
        for artist_to_predict, plays_to_predict in user_data.iteritems():
            abs_error += abs(global_median - plays_to_predict)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

------
User f283c15ed4180e686384dc1de2a5cbf5f95ae269 TRAIN:  {'79239441-bfd5-4981-a70c-55c3f15c1287': 5, '020bfbb4-05c3-4c86-b372-17825c262094': 10, 'c3cceeed-3332-4cf0-8c4c-bbde425147b6': 8, 'c98d40fd-f6cf-4b26-883e-eaa515ee2851': 10, '7944ed53-2a58-4035-9b93-140a71e41c34': 3, '1a1cd7f3-e5df-4eca-bae2-2757c9e656b5': 3, 'c2c70ed6-5f10-445c-969f-2c16bc9a4c2e': 24, '2fa0d3ac-b64c-401a-b0a1-4915ba6cc157': 11, '092ca127-2e07-4cbd-9cba-e412b4ddddd9': 4, '83d91898-7763-47d7-b03b-b92132375c47': 66, 'd1353a0c-26fb-4318-a116-defde9c7c9ad': 3, 'e83144dd-bb95-49fe-b1dd-00bab25cca9e': 11, '847e8a0c-cc20-4213-9e16-975515c2a926': 4, 'd87e52c5-bb8d-4da8-b941-9f4928627dc8': 8}
Actual:  3 Prediction:  7.51435190079 False predict:  0.785879751963
Actual:  3 Prediction:  7.61676936799 False predict:  1.04192341998
------
User 5909125332c108365a26ccf0ee62636eee08215c TRAIN:  {'298909e4-ebcb-47b8-95e9-cc53b087fc65': 337, 'e6e879c0-3d56-4f12-b3c5-3ce459661a8e': 427, 'ba0d6274-db14-4ef5-b28d-657ebde1a396': 7

### Cooccurance by ratio to user median

In [74]:
abs_error = 0

plays_array  = []
user_medians = {}
user_means = {}
user_total_plays = {}
user_stds = {}
artist_plays_array = {}

cluster_plays_array = {}

for user, user_data in train_train_data.iteritems():
    user_plays = []
    #cluster_id = predict[user_pos_by_id[user]]
    #if cluster_id not in cluster_plays_array:
    #    cluster_plays_array[cluster_id] = []
    
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)
        
        if artist not in artist_plays_array:
            artist_plays_array[artist] = []
        
        artist_plays_array[artist].append(plays)
        
    user_median = np.median(np.array(user_plays))
    user_medians[user] = user_median
    user_means[user] = np.mean(user_plays)
    user_stds[user] = np.std(user_plays)
    user_total_plays[user] = np.sum(user_plays)

In [77]:
# Construct the co-occurance matrix - METHOD 1
artist_plays_cooccurance = np.zeros((len(artist_pos_by_id), len(artist_pos_by_id)))

for user, user_data in train_train_data.iteritems():
    user_median = user_medians[user]
    user_mean = user_means[user]
    user_std = user_stds[user]
    if user_std == 0:
        user_std = 1.0
    for i, (artist_1, plays_1) in enumerate(user_data.iteritems()):
        for j, (artist_2, plays_2) in enumerate(user_data.iteritems()):
            if j > i:
                artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_2]] += (1.0 * plays_2 - user_median) / user_std
                artist_plays_cooccurance[artist_pos_by_id[artist_2], artist_pos_by_id[artist_1]] += (1.0 * plays_1 - user_median) / user_std
            elif j==i:
                artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_1]] += 1#(1.0 * plays_1 - user_mean) / user_mean
            
# Normalize!
for i in range(len(artist_pos_by_id)):
    norm_factor = artist_plays_cooccurance[i, i]
    for j in range(len(artist_pos_by_id)):
        artist_plays_cooccurance[i, j] /= norm_factor

In [100]:
# Construct the co-occurance matrix - METHOD 2
artist_plays_cooccurance = np.zeros((len(artist_pos_by_id), len(artist_pos_by_id)))
user_cooccurance_counts = np.ones((len(artist_pos_by_id), len(artist_pos_by_id)))
user_cooccurance_counts = user_cooccurance_counts * .01

for user, user_data in train_train_data.iteritems():
    user_median = user_medians[user]
    user_mean = user_means[user]
    user_std = user_stds[user]
    if user_std == 0:
        user_std = 1.0
    for i, (artist_1, plays_1) in enumerate(user_data.iteritems()):
        for j, (artist_2, plays_2) in enumerate(user_data.iteritems()):
            if j > i:
                artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_2]] += (1.0 * plays_2 - user_mean) / user_std
                user_cooccurance_counts[artist_pos_by_id[artist_1], artist_pos_by_id[artist_2]] += 1.0
                
                artist_plays_cooccurance[artist_pos_by_id[artist_2], artist_pos_by_id[artist_1]] += (1.0 * plays_1 - user_mean) / user_std
                user_cooccurance_counts[artist_pos_by_id[artist_2], artist_pos_by_id[artist_1]] += 1.0
                
            elif j==i:
                artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_1]] += 1#(1.0 * plays_1 - user_mean) / user_mean
            
# Normalize!
for i in range(len(artist_pos_by_id)):
    for j in range(len(artist_pos_by_id)):
        artist_plays_cooccurance[i, j] /= user_cooccurance_counts[i,j]

In [106]:
itr = 0
abs_error = 0

drop_ratio = 1.0
    
# PREDICT!
for user, user_data in train_test_data.iteritems():
    itr += 1
    
    #cluster_id = predict[user_pos_by_id[user]]
    if user in user_total_plays:
        user_total = user_total_plays[user]
        user_train = train_train_data[user]
        if itr < 10:
            print "------"
            print "User", user, "TRAIN: ", user_train
    
        for artist_to_predict, plays_to_predict in user_data.iteritems():
#            prediction = []
            prediction = 0
            for artist_cooccur, plays_cooccur in user_train.iteritems():
                #if plays_cooccur > 1:
                    #prediction.append(plays_cooccur * artist_plays_cooccurance[artist_pos_by_id[artist_cooccur], artist_pos_by_id[artist_to_predict]])
                prediction += (plays_cooccur * artist_plays_cooccurance[artist_pos_by_id[artist_cooccur], artist_pos_by_id[artist_to_predict]])
        
            prediction = prediction / user_total
        
            #if len(prediction) > 0:
            #    prediction = np.mean(prediction)#1.0 * prediction / user_total
            #else:
            #    prediction = 1.0
            user_std = user_stds[user]
            if user_std == 0:
                user_std = 1.0
            w_prediction = user_means[user] * .75 + user_std * prediction * 1
            #prediction = prediction# * user_medians[user]* artist_ratios[artist]
            #user_medians[user] * artist_ratios[artist] * drop_ratio * cluster_ratios[cluster_id]
            abs_error += abs(w_prediction - plays_to_predict) 
            if itr < 10:
                print "Actual: ", plays_to_predict, "Prediction: ", w_prediction, ", Medians: ", user_medians[user], "predict: ", prediction, user_medians[user] * prediction

    else:
        print "User", user, "not in train_train data."
        for artist_to_predict, plays_to_predict in user_data.iteritems():
            abs_error += abs(global_median - plays_to_predict)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

------
User f283c15ed4180e686384dc1de2a5cbf5f95ae269 TRAIN:  {'79239441-bfd5-4981-a70c-55c3f15c1287': 5, '020bfbb4-05c3-4c86-b372-17825c262094': 10, 'c3cceeed-3332-4cf0-8c4c-bbde425147b6': 8, 'c98d40fd-f6cf-4b26-883e-eaa515ee2851': 10, '7944ed53-2a58-4035-9b93-140a71e41c34': 3, '1a1cd7f3-e5df-4eca-bae2-2757c9e656b5': 3, 'c2c70ed6-5f10-445c-969f-2c16bc9a4c2e': 24, '2fa0d3ac-b64c-401a-b0a1-4915ba6cc157': 11, '092ca127-2e07-4cbd-9cba-e412b4ddddd9': 4, '83d91898-7763-47d7-b03b-b92132375c47': 66, 'd1353a0c-26fb-4318-a116-defde9c7c9ad': 3, 'e83144dd-bb95-49fe-b1dd-00bab25cca9e': 11, '847e8a0c-cc20-4213-9e16-975515c2a926': 4, 'd87e52c5-bb8d-4da8-b941-9f4928627dc8': 8}
Actual:  3 Prediction:  8.09270817984 , Medians:  6.5 predict:  -0.0103884670325 -0.0675250357111
Actual:  3 Prediction:  8.15426466244 , Medians:  6.5 predict:  -0.00632291874464 -0.0410989718402
------
User 5909125332c108365a26ccf0ee62636eee08215c TRAIN:  {'298909e4-ebcb-47b8-95e9-cc53b087fc65': 337, 'e6e879c0-3d56-4f12-b3c5-3

### KMEANS CLUSTERING

In [81]:
abs_error = 0

cluster_artist_plays = {}

for user, user_data in train_train_data.iteritems():
    cluster_id = predict[user_pos_by_id[user]]
    for artist, plays in user_data.iteritems():
        if cluster_id not in cluster_artist_plays:
            cluster_artist_plays[cluster_id] = {}
        
        if artist not in cluster_artist_plays[cluster_id]:
            cluster_artist_plays[cluster_id][artist] = []
        
        cluster_artist_plays[cluster_id][artist].append(plays)

print "Finished setting up cluster author play dictionary, running test now"

# Precalculate the cluster-artist medians
cluster_artist_medians = {}
for cluster_id, cluster_data in cluster_artist_plays.iteritems():
    artist_medians = {}
    for artist, plays in cluster_data.iteritems():
        artist_medians[artist] = np.median(plays)
    cluster_artist_medians[cluster_id] = artist_medians

for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        cluster_id = predict[user_pos_by_id[user]]
        
        if artist in cluster_artist_plays[cluster_id]:
            abs_error += abs(np.median(cluster_artist_plays[cluster_id][artist]) - plays)
        elif user in user_medians:
            abs_error += abs(user_medians[user] - plays)
        else:
            print "User", user, "not in train_train data."
            abs_error += abs(global_median - plays)
            #print "Cluster+Artist: ", cluster_id, artist, "not in train_train data."
            #abs_error += abs(0 - plays)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

Finished setting up cluster author play dictionary, running test now
MEAN ABSOLUTE ERROR 199.005376


In [123]:
abs_error = 0

cluster_artist_plays = {}

# The median number of per user per cluster
cluster_user_plays = {}

# The cluster medians
cluster_medians = {}

user_medians = {}

drop_ratio = .8

for user, user_data in train_train_data.iteritems():
    cluster_id = predict[user_pos_by_id[user]]
    user_plays = []
    for artist, plays in user_data.iteritems():
        if cluster_id not in cluster_artist_plays:
            cluster_artist_plays[cluster_id] = {}
        
        if artist not in cluster_artist_plays[cluster_id]:
            cluster_artist_plays[cluster_id][artist] = []
        
        cluster_artist_plays[cluster_id][artist].append(plays)

        user_plays.append(plays)

    median = np.median(np.array(user_plays))
    user_medians[user] = median
    if cluster_id not in cluster_user_plays:
        cluster_user_plays[cluster_id] = []
    cluster_user_plays[cluster_id].append(median) 
    
# Calculate per-cluster user median
for cluster_id, user_plays in cluster_user_plays.iteritems():
    cluster_medians[cluster_id] = np.median(user_plays)
    
user_ratios = {}
    
# Calculate the user-cluster play ratios
for user_id, median_plays in user_medians.iteritems():
    cluster_id = predict[user_pos_by_id[user_id]]
    user_ratios[user_id] = 1.0 * median_plays / cluster_medians[cluster_id]
        
print "Finished setting up cluster author play dictionary, running test now"

# Precalculate the cluster-artist medians
cluster_artist_medians = {}
for cluster_id, cluster_data in cluster_artist_plays.iteritems():
    artist_medians = {}
    for artist, plays in cluster_data.iteritems():
        artist_medians[artist] = np.median(plays)
    cluster_artist_medians[cluster_id] = artist_medians

# PREDICT!
for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        cluster_id = predict[user_pos_by_id[user]]
        if artist in cluster_artist_plays[cluster_id]:
            user_ratio = 1.0
            if user in user_ratios:
                user_ratio = user_ratios[user]
            prediction = user_ratio * cluster_artist_medians[cluster_id][artist] * drop_ratio
            abs_error += abs(prediction - plays)
        elif user in user_medians:
            abs_error += abs(user_medians[user] - plays)
        else:
            print "User", user, "not in train_train data."
            abs_error += abs(global_median - plays)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

Finished setting up cluster author play dictionary, running test now
MEAN ABSOLUTE ERROR 142.834053


# MEAN ABSOLUTE ERROR

### Given - Per-User
Global + Per-User (Given): 140.056850 <br />

Global + Per-User \* Artist Ratio: 143.587490 <br />
Global + Per-User + Artist Diff: 144.839260 <br />
Global + Per-User + Artist Diff from Mean: 226.244328 <br />

Global + Per-User \* Artist Ratio \* .5: 167.470407 <br />
Global + Per-User \* Artist Ratio \* .8: 143.928421 <br />

Global + Per-User \* Artist Ratio \* Cluster Ratio \* .8: 148.776073 <br />
Global + Per-User \* Artist Ratio \* Cluster Ratio \* .9: 146.564527 <br />
Global + Per-User \* Artist Ratio \* Cluster Ratio \* 1.0: 146.716599 <br />
Global + Per-User \* Artist Ratio \* Cluster Ratio \* 1.1: 148.907204 <br />


### KMeans Basic
KMeans (10 clusters): 198.894616 <br />
KMeans (20 clusters): 199.005376 <br />

### Per-User Ratios
KMeans (5 clusters) + per-user ratio: 147.278504  <br />
KMeans (10 clusters) + per-user ratio: 148.038072 <br />
KMeans (20 clusters) + per-user ratio: 149.274154 <br />
KMeans (25 clusters) + per-user ratio: 149.667337 <br />
KMeans (50 clusters) + per-user ratio: 153.856717 <br />

### Better indicator Columns for Age
KMeans (5 clusters): 147.080268 <br />
KMeans (10 clusters): 148.013096 <br />
KMeans (20 clusters): 149.802176 <br />

### Drop Ratio
KMeans (5 clusters) - 0.5: 161.433577 <br />
KMeans (5 clusters) - 0.75: 143.388267 <br />
KMeans (5 clusters) - 0.78: 142.834053 <br />
KMeans (5 clusters) - 0.8: 142.630979 <br />
KMeans (5 clusters) - 0.85: 142.679850 <br />
KMeans (5 clusters) - 0.9: 143.476183 <br />
KMeans (5 clusters) - 1.0: 147.080268 <br />
KMeans (5 clusters) - 1.1: 152.970603 <br />

### Cooccurance
Basic Ratios: 230.245089 <br />
Basic Sums: 198.822865 <br />
Weighted .8 \* Medians + .2 \* Cooccurance: 140.940338 <br />
Weighted .9 \* Medians + .2 \* Cooccurance: 138.739097 <br />
Weighted .9 \* Medians + .3 \* Cooccurance: 138.727909 <br />

Medians - Weighted .9 \* Medians + .3 \* Cooccurance: 138.830714 <br />
Means - Weighted .9 \* Medians + .3 \* Cooccurance: 138.505556 <br />
Means - Weighted .9 \* Medians + .4 \* Cooccurance: 138.294710 <br />
// Means (1s removed) - Weighted .9 \* Medians + .4 \* Cooccurance: 138.295340 <br />

Ratios to User Median: 138.678506 <br />
Remove medians, medians + medians \* ratios: 139.824810 <br />
Remove medians, basic normalize, medians +  medians \* ratios: 138.717073 <br />

Remove means, means +  means \* ratios: 163.837442 <br />
Remove means, medians +  medians \* ratios: 138.998152 <br />
Remove means, medians +  means \* ratios: 138.867155 <br />

#### Standardizing
Remove means / var, medians +  var \* ratios: 203.059854 <br />
Remove means / var, means +  var \* ratios: 226.789675 <br />

Remove means / std, medians +  std \* ratios: 138.911128 <br />
Remove medians / std, medians +  std \* ratios: 138.840453 <br />

#### Coocurrance Method 2
Remove medians / std, .9 \* medians +  1.1 \* std \* ratios: 140.448489 <br />
Remove medians / std, 1.0 \* medians +  1.0 \* std \* ratios: 142.306458 <br />
Remove medians / std, 1.0 \* medians +  1.2 \* std \* ratios: 140.362853 <br />
Remove medians / std, .8 \* medians +  1.1 \* std \* ratios:  138.770764 <br />
Remove medians / std, .7 \* medians +  1.1 \* std \* ratios:  139.731552 <br />

Remove means / std, .8 \* medians +  1.1 \* std \* ratios: 141.149339 <br />
Remove means / std, .8 \* means +  1.1 \* std \* ratios: 137.624492 <br />
Remove means / std, .75 \* means +  1.1 \* std \* ratios: 136.966472 <br />
Remove means / std, .75 \* means +  1 \* std \* ratios: 136.661990 <br /> 
SUBMITTED - ACTUAL: 143.34759 <br />

Remove means / std, .7 \* means +  1.1 \* std \* ratios: 137.486371 <br />
Remove means / std, 1.0 \* means +  1.0 \* std \* ratios: 149.710529 <br />






## OUTPUT

In [107]:
plays_array  = []
user_medians = {}
user_means = {}
user_total_plays = {}
user_stds = {}
artist_plays_array = {}
cluster_plays_array = {}

for user, user_data in train_data.iteritems():
    user_plays = []
    
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)
        
        if artist not in artist_plays_array:
            artist_plays_array[artist] = []
        
        artist_plays_array[artist].append(plays)
        
    user_median = np.median(np.array(user_plays))
    user_medians[user] = user_median
    user_means[user] = np.mean(user_plays)
    user_stds[user] = np.std(user_plays)
    user_total_plays[user] = np.sum(user_plays)
    
# # Construct the co-occurance matrix
# artist_plays_cooccurance = np.zeros((len(artist_pos_by_id), len(artist_pos_by_id)))

# for user, user_data in train_data.iteritems():
#     user_median = user_medians[user]
#     user_mean = user_means[user]
#     user_std = user_stds[user]
#     if user_std == 0:
#         user_std = 1.0
#     for i, (artist_1, plays_1) in enumerate(user_data.iteritems()):
#         for j, (artist_2, plays_2) in enumerate(user_data.iteritems()):
#             if j > i:
#                 artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_2]] += (1.0 * plays_2 - user_median) / user_std
#                 artist_plays_cooccurance[artist_pos_by_id[artist_2], artist_pos_by_id[artist_1]] += (1.0 * plays_1 - user_median) / user_std
#             elif j==i:
#                 artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_1]] += 1
            
# # Normalize!
# for i in range(len(artist_pos_by_id)):
#     norm_factor = artist_plays_cooccurance[i, i]
#     for j in range(len(artist_pos_by_id)):
#         artist_plays_cooccurance[i, j] /= norm_factor

# Construct the co-occurance matrix - METHOD 2
artist_plays_cooccurance = np.zeros((len(artist_pos_by_id), len(artist_pos_by_id)))
user_cooccurance_counts = np.ones((len(artist_pos_by_id), len(artist_pos_by_id)))
user_cooccurance_counts = user_cooccurance_counts * .01

for user, user_data in train_data.iteritems():
    user_median = user_medians[user]
    user_mean = user_means[user]
    user_std = user_stds[user]
    if user_std == 0:
        user_std = 1.0
    for i, (artist_1, plays_1) in enumerate(user_data.iteritems()):
        for j, (artist_2, plays_2) in enumerate(user_data.iteritems()):
            if j > i:
                artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_2]] += (1.0 * plays_2 - user_mean) / user_std
                user_cooccurance_counts[artist_pos_by_id[artist_1], artist_pos_by_id[artist_2]] += 1.0
                
                artist_plays_cooccurance[artist_pos_by_id[artist_2], artist_pos_by_id[artist_1]] += (1.0 * plays_1 - user_mean) / user_std
                user_cooccurance_counts[artist_pos_by_id[artist_2], artist_pos_by_id[artist_1]] += 1.0
                
            elif j==i:
                artist_plays_cooccurance[artist_pos_by_id[artist_1], artist_pos_by_id[artist_1]] += 1
            
# Normalize!
for i in range(len(artist_pos_by_id)):
    for j in range(len(artist_pos_by_id)):
        artist_plays_cooccurance[i, j] /= user_cooccurance_counts[i,j]
    
# PREDICT!
# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]

            if user in user_total_plays:
                user_total = user_total_plays[user]
                user_train = train_data[user]

                prediction = 0
                for artist_cooccur, plays_cooccur in user_train.iteritems():
                    prediction += (plays_cooccur * artist_plays_cooccurance[artist_pos_by_id[artist_cooccur], artist_pos_by_id[artist]])

                prediction = prediction / user_total

                user_std = user_stds[user]
                if user_std == 0:
                    user_std = 1.0
                #w_prediction = user_medians[user] * .9 + user_std * prediction * 1.1
                w_prediction = user_means[user] * .75 + user_std * prediction * 1
                soln_csv.writerow([id, w_prediction])

            else:
                print "User", user, "not in train data."
                soln_csv.writerow([id, global_median])