In [2]:
%matplotlib inline
# Imports
import numpy as np
import csv
from sklearn import *
import pandas as pd
import matplotlib.pyplot as plt


# Predict via the user-specific median.
# If the user has no data, use the global median.

# Hard-code file names
train_file = 'train.csv'
test_file  = 'test.csv'
soln_file  = 'predictions.csv'
profiles_file = 'profiles.csv'
artists_file = 'artists.csv'

In [3]:
# Load the profile data.
profile_data = {}
user_ids = []

with open(profiles_file, 'r') as profile_fh:
    profile_csv = csv.reader(profile_fh, delimiter=',', quotechar='"')
    next(profile_csv, None)
    for row in profile_csv:
        # user,sex,age,country
        user    = row[0]
        sex     = row[1]
        age     = row[2]
        country = row[3]

        if age == '':
            age = -1
        if sex == '':
            sex = 'u'
    
        if not user in profile_data:
            profile_data[user] = {}
            user_ids.append(user)
        
        profile_data[user]['sex'] = sex
        profile_data[user]['age'] = int(age)
        profile_data[user]['country'] = country

print len(user_ids)

233286


In [5]:
print "Number of users in profiles: " + str(len(user_ids))
print ""
for user_id in user_ids[1:20]:
    print profile_data[user_id]

Number of users in profiles: 233286

{'country': 'Iceland', 'age': 29, 'sex': 'm'}
{'country': 'United States', 'age': 30, 'sex': 'm'}
{'country': 'Germany', 'age': 21, 'sex': 'm'}
{'country': 'Netherlands', 'age': 24, 'sex': 'm'}
{'country': 'United States', 'age': 22, 'sex': 'm'}
{'country': 'United States', 'age': -1, 'sex': 'f'}
{'country': 'Poland', 'age': -1, 'sex': 'f'}
{'country': 'United States', 'age': -1, 'sex': 'm'}
{'country': 'Ukraine', 'age': 16, 'sex': 'u'}
{'country': 'Italy', 'age': 44, 'sex': 'm'}
{'country': 'Russian Federation', 'age': 17, 'sex': 'm'}
{'country': 'Germany', 'age': 25, 'sex': 'm'}
{'country': 'Austria', 'age': 24, 'sex': 'm'}
{'country': 'Australia', 'age': 22, 'sex': 'f'}
{'country': 'Germany', 'age': 34, 'sex': 'm'}
{'country': 'United States', 'age': -1, 'sex': 'm'}
{'country': 'United States', 'age': 22, 'sex': 'm'}
{'country': 'Canada', 'age': 22, 'sex': 'm'}
{'country': 'Germany', 'age': -1, 'sex': 'm'}


In [3]:
# Create indicator variables
columns = ['i_MALE','i_FEMALE','AGE']
ages = []
for user_id in user_ids:
    country = profile_data[user_id]['country']
    if country not in columns:
        columns.append(country)
    ages.append(profile_data[user_id]['age'])

# Calc the mean age
mean_age = np.mean(ages)
    
# Construct matrix
profile_matrix = np.zeros((len(user_ids), len(columns)))
for i, user_id in enumerate(user_ids):
    profile = profile_data[user_id]

    # Create indicator variable for MALE
    if profile['sex'] == 'm':
        profile_matrix[i, 0] = 1
    # Create indicator variable for FEMALE    
    elif profile['sex'] == 'f':
        profile_matrix[i, 1] = 1
        
    # Add a 1 for the country indicator
    country = profile['country']
    country_col = columns.index(country)
    profile_matrix[i, country_col] = 1

    # TODO: Calculate median age, replace nulls with 
    # TODO: Separate ages into 5 year indicator columns
    
    # Add age
    profile_matrix[i, 2] = profile['age']

In [7]:
user_pos_by_id = {}
for i, user_id in enumerate(user_ids):
    user_pos_by_id[user_id] = i

In [5]:
# Examine the data
profile_matrix.shape
profile_matrix[1:10]

array([[  1.,   0.,  29., ...,   0.,   0.,   0.],
       [  1.,   0.,  30., ...,   0.,   0.,   0.],
       [  1.,   0.,  21., ...,   0.,   0.,   0.],
       ..., 
       [  0.,   1.,  -1., ...,   0.,   0.,   0.],
       [  1.,   0.,  -1., ...,   0.,   0.,   0.],
       [  0.,   0.,  16., ...,   0.,   0.,   0.]])

In [4]:
# Load the training data.
train_data = {}
train_user_ids = []

# TRAIN-TEST SPLIT FOR TESTING PURPOSES
# Need to split on a per-user-artist level, not just per-user level
train_train_data = {}
train_test_data = {}

with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = row[2]
    
        if not user in train_data:
            train_data[user] = {}
            train_user_ids.append(user)
        
        train_data[user][artist] = int(plays)
        
        # Build train and test split
        if np.random.uniform(0,1,1) < .75:
            if not user in train_train_data:
                train_train_data[user] = {}
            train_train_data[user][artist] = int(plays)
        else:
            if not user in train_test_data:
                train_test_data[user] = {}
            train_test_data[user][artist] = int(plays)

In [10]:
# Examine the data
print "Number of users in train: " + str(len(train_user_ids))
print ""
for user_id in train_user_ids[1:3]:
    print train_data[user_id]

Number of users in train: 233286

{'e3e0abcd-7671-4482-a9d8-462f5acc9be5': 64, '63011a8d-0117-4f7e-9991-1ef1f337ff70': 13, 'f4857fb9-e255-4dc6-bd01-e4ca7cc68544': 21, 'c485632c-b784-4ee9-8ea1-c5fb365681fc': 45, 'a96ac800-bfcb-412a-8a63-0a98df600700': 35, '8dd98bdc-80ec-4e93-8509-2f46bafc09a7': 23, '69837400-8e31-4949-aac2-00b46b4df126': 18, 'a3a92047-be1c-4f3e-8960-c4f8570984df': 81, '648615ca-ca74-460d-928a-2bae67ae6d14': 19, '0110e63e-0a9b-4818-af8e-41e180c20b9a': 22, '6ffb8ea9-2370-44d8-b678-e9237bbd347b': 56, '9fdaa16b-a6c4-4831-b87c-bc9ca8ce7eaa': 20, '5441c29d-3602-4898-b1a1-b77fa23b8e50': 70, '9bf79f68-c064-44a1-8c2c-5764f1d7c016': 27, '4a4ee089-93b1-4470-af9a-6ff575d32704': 31, '9efff43b-3b29-4082-824e-bc82f646f93d': 22}
{'8d18b680-368c-4649-a5e3-85e0c2dd6fc2': 51, 'a4a3048f-3968-4848-9f53-94e3d4f88b53': 47, '6ffb8ea9-2370-44d8-b678-e9237bbd347b': 86, '9c9f1380-2516-4fc9-a3e6-f9f61941d090': 145, 'eeb1195b-f213-4ce1-b28c-8565211f8e43': 708, '24ea074c-59cc-41c5-a5de-f68c2952965f'

In [13]:
# CALCULATE THE NUMBER OF SONGS WE ARE ESTIMATING IN OUR TRIAN_TEST SAMPLE
num_songs_estimating = 0

for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        num_songs_estimating += 1
        
print num_songs_estimating

1039878


## KMeans - Let's Cluster the Users

In [19]:
from sklearn.cluster import KMeans
KM = KMeans(n_clusters=20, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=37)
# Calls fit and then predict
predict = KM.fit_predict(profile_matrix)

In [20]:
print "The objective function: %f" % KM.score(profile_matrix) 

The objective function: -448359.382828


In [21]:
# Examine the predicted clusters
print predict[1:10]

[ 5  5 12 15  3  0  0  0  6]


## Keep track of score

10 clusters
Indicators for basic params: -948858.492895

20 clusters
Indicatros for basic params: -448359.382828

In [12]:
## Train-Test Split
#tt_X, tt_Y = cross_validation.train_test_split(train_user_ids, random_state=37)
#print len(tt_X)
#print len(tt_Y)

174964
58322


In [57]:
#tt_X_data = {}
#for uid in tt_X:
#    tt_X_data[uid] = train_data[uid]
    
#tt_Y_data = {}
#for uid in tt_Y:
#    tt_Y_data[uid] = train_data[uid]
    
#print len(tt_X_data)
#print len(tt_Y_data)

174964
58322


## Test different techniques

### GLOBAL & PER-USER MEDIANS (GIVEN)

In [65]:
abs_error = 0

# TEST WITH GLOBAL and per-USER MEDIAN (GIVEN)
# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
for user, user_data in train_train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        if user in user_medians:
            abs_error += abs(user_medians[user] - plays)
        else:
            print "User", user, "not in train_train data."
            abs_error += abs(global_median - plays)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
User 1c38bd41dbb223394a0639fc401f47435a5d6aec not in train_train data.
MEAN ABSOLUTE ERROR 140.056850


### KMEANS CLUSTERING

In [None]:
abs_error = 0

cluster_artist_plays = {}

for user, user_data in train_train_data.iteritems():
    cluster_id = predict[user_pos_by_id[user]]
    for artist, plays in user_data.iteritems():
        if cluster_id not in cluster_artist_plays:
            cluster_artist_plays[cluster_id] = {}
        
        if artist not in cluster_artist_plays[cluster_id]:
            cluster_artist_plays[cluster_id][artist] = []
        
        cluster_artist_plays[cluster_id][artist].append(plays)

print "Finished setting up cluster author play dictionary, running test now"

# Precalculate the cluster-artist medians
cluster_artist_medians = {}
for cluster_id, cluster_data in cluster_artist_plays.iteritems():
    artist_medians = {}
    for artist, plays in cluster_data.iteritems():
        artist_medians[artist] = np.median(plays)
    cluster_artist_medians[cluster_id] = artist_medians

for user, user_data in train_test_data.iteritems():
    for artist, plays in user_data.iteritems():
        cluster_id = predict[user_pos_by_id[user]]
        
        if artist in cluster_artist_plays[cluster_id]:
            abs_error += abs(np.median(cluster_artist_plays[cluster_id][artist]) - plays)
        else:
            print "Cluster+Artist: ", cluster_id, artist, "not in train_train data."
            abs_error += abs(0 - plays)
            
print "MEAN ABSOLUTE ERROR %f" % (abs_error / num_songs_estimating)

Finished setting up cluster author play dictionary, running test now
Cluster+Artist:  13 1fa14a96-c25c-4bb7-b94d-ff453519eab3 not in train_train data.
Cluster+Artist:  13 3a54bffa-2314-44a2-927b-60144119c780 not in train_train data.
Cluster+Artist:  2 86b24e8f-a4d9-4c84-83ee-fde0d14ad9fa not in train_train data.
Cluster+Artist:  2 bbc5b66b-d037-4f26-aecf-0b129e7f876a not in train_train data.
Cluster+Artist:  4 e3434cc7-d348-491a-9dc8-325af3d9086d not in train_train data.
Cluster+Artist:  2 bdacc37b-8633-4bf8-9dd5-4662ee651aec not in train_train data.
Cluster+Artist:  2 82a5b152-ee60-4447-939a-dd5a91cd7c38 not in train_train data.
Cluster+Artist:  2 91d51bc3-bfcc-49f2-b6c4-d6d205d6291b not in train_train data.
Cluster+Artist:  13 d8d1b067-78bb-4db7-8f91-db2ff9a83ee5 not in train_train data.
Cluster+Artist:  13 61b99cc4-06cd-4ad5-9bac-950b2daef9cc not in train_train data.
Cluster+Artist:  9 7e30debb-3308-49ab-a5ac-6d49319a5705 not in train_train data.
Cluster+Artist:  14 b3c94036-6166-41

# MEAN ABSOLUTE ERROR

Global + Per-User (Given): 140.056850

KMeans (10 clusters): 198.894616
KMeans (20 clusters): 

## OUTPUT

In [30]:
# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
for user, user_data in train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]

            if user in user_medians:
                soln_csv.writerow([id, user_medians[user]])
            else:
                print "User", id, "not in training data."
                soln_csv.writerow([id, global_median])
                

In [6]:
artists = {}
with open(artists_file, 'r') as af:
    artists_csv = csv.reader(af, delimiter=',', quotechar='"')
    next(af)
    for row in artists_csv:
        artists[row[0]] = row[1]

In [None]:
# RUN EVERYTHING FROM HERE. MAY BE ERRORS. TEXT ME WHEN THIS OCCURS. 

In [5]:
testdf = pd.read_csv(test_file).drop('Id',1)
traindf = pd.read_csv(train_file)

In [6]:
fulldf = traindf.append(testdf)

In [7]:
profilesdf = pd.read_csv(profiles_file)
profilesdf.head()

Unnamed: 0,user,sex,age,country
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25,Sweden
1,5909125332c108365a26ccf0ee62636eee08215c,m,29,Iceland
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30,United States
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21,Germany
4,02871cd952d607ba69b64e2e107773012c708113,m,24,Netherlands


In [8]:
# add profile about the user
mldf = traindf.merge(profilesdf)

In [9]:
# add mean plays per user
mean_plays = mldf.groupby('user').plays.mean().reset_index()
mean_plays.columns = ['user','mean_plays']
mldf = mldf.merge(mean_plays)

In [10]:
# add mean plays per artist
mean_plays_a = mldf.groupby('artist').plays.mean().reset_index()
mean_plays_a.columns = ['artist','mean_plays_artist']
mldf = mldf.merge(mean_plays_a)

In [12]:
# Create a seperate DF just for clustering. It will include the number of plays per artist, as well as demographics. 
udf = pd.DataFrame(train_data).transpose().reset_index()
udf = udf.fillna(0)
profilesdf.columns = ['index','sex','age','country']
udf = udf.merge(profilesdf)
# indicatorify sex
udf['male'] = 1*(udf.sex=='m')
udf['female'] = 1*(udf.sex=='f')
del udf['sex']
# add indicator column for countries with listens over 5000.
country_counts = udf.groupby('country').index.count()
countries_to_include = country_counts[country_counts>5000].index
for c in countries_to_include:
    udf['country_%s' % c] = 1*(udf.country == c)
del udf['country']
# set any ages outside of [5,115] range to the average of the rest
age_mean=np.mean(udf[udf.age<115][udf.age>5].age)
udf[udf.age>=115] = udf[udf.age>=115].assign(age = age_mean) 
udf[udf.age<=5] = udf[udf.age>=5].assign(age = age_mean) 
udf = udf.fillna(age_mean)

In [15]:
users = udf['index']
del udf['index']
# find user clusters  (minibatch is so much faster than regular batch kmeans)
from sklearn.cluster import MiniBatchKMeans
KM = MiniBatchKMeans(random_state=37)
u_clusters = KM.fit_predict(udf)

In [20]:
# add user cluster as a column
users_cluster = pd.DataFrame(u_clusters,users).reset_index()
users_cluster.columns = ['user','user_cluster']
mldf = mldf.merge(users_cluster)

In [21]:
# add user cluster as inidicator variables
for cluster in mldf.user_cluster.unique():
    mldf['user_cluster_%i' % cluster] = 1*(mldf.user_cluster == cluster)
del mldf['user_cluster']

In [22]:
mldf['male'] = 1*(mldf.sex=='m')
mldf['female'] = 1*(mldf.sex=='f')
del mldf['sex']

In [23]:
# add indicator column for countries with listens over 5000.
country_counts = mldf.groupby('country').user.count()
countries_to_include = country_counts[country_counts>5000].index
for c in countries_to_include:
    mldf['country_%s' % c] = 1*(mldf.country == c)
del mldf['country']

In [24]:
# set any ages outside of [5,115] range to the average of the rest
age_mean=np.mean(mldf[mldf.age<115][mldf.age>5].age)
mldf[mldf.age>=115] = mldf[mldf.age>=115].assign(age = age_mean) 
mldf[mldf.age<=5] = mldf[mldf.age>=5].assign(age = age_mean) 

In [25]:
# add genres as well
artists_genres = pd.read_csv('artists_tagged.csv')
mldf = mldf.merge(artists_genres)

In [None]:
from sklearn.cross_validation import train_test_split
regressiondf = mldf
# get rid of features not to be used for regression
del regressiondf['user']
del regressiondf['artist']
del regressiondf['country']
regressiondf = regressiondf.fillna(0)
traindf_1, validdf_1 = train_test_split(regressiondf,test_size = 0.3)

In [413]:
# get y test and train
y_train = traindf_1.plays
y_test = validdf_1.plays
del traindf_1['plays']
del validdf_1['plays']

In [418]:
# now plug in a model
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 10)
rf.fit(traindf_1,y_train)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [420]:
y_pred = rf.predict(validdf_1)
from sklearn.metrics import mean_absolute_error
print mean_absolute_error(y_test,y_pred)

NameError: name 'y_valid' is not defined