In [2]:
import pandas as pd
import numpy as np
import matplotlib.

In [5]:
artists = pd.read_csv('../data/artists.csv')

In [6]:
artists.columns

Index([u'artist', u'name'], dtype='object')

In [7]:
artists.describe()

Unnamed: 0,artist,name
count,2000,1990
unique,2000,1977
top,879a449d-506a-422c-9fb8-8af01d3d04ae,Ben Harper
freq,1,2


In [9]:
artists.head()

Unnamed: 0,artist,name
0,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,Liars
1,69c4cc43-8163-41c5-ac81-30946d27bb69,CunninLynguists
2,7a2e6b55-f149-4e74-be6a-30a1b1a387bb,The Desert Sessions
3,7002bf88-1269-4965-a772-4ba1e7a91eaa,Glenn Gould
4,dbf7c761-e332-467b-b4d9-aafe06bbcf8f,G. Love & Special Sauce


In [10]:
profiles = pd.read_csv('../data/profiles.csv')

In [11]:
profiles.describe()

Unnamed: 0,age
count,188444.0
mean,24.5174
std,21.853296
min,-1337.0
25%,20.0
50%,23.0
75%,27.0
max,1002.0


In [8]:
profiles.head()

Unnamed: 0,user,sex,age,country
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25,Sweden
1,5909125332c108365a26ccf0ee62636eee08215c,m,29,Iceland
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30,United States
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21,Germany
4,02871cd952d607ba69b64e2e107773012c708113,m,24,Netherlands


In [13]:
train = pd.read_csv('../data/train.csv')

In [14]:
train.head()

Unnamed: 0,user,artist,plays
0,eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03,5a8e07d5-d932-4484-a7f7-e700793a9c94,554
1,44ce793a6cd9d20f13f4a576a818ef983314bb5d,a3a92047-be1c-4f3e-8960-c4f8570984df,81
2,da9cf3f557161d54b76f24db64be9cc76db008e3,eeb1195b-f213-4ce1-b28c-8565211f8e43,708
3,8fa49ab25d425edcf05d44bfc1d5aea895287d81,a1419808-65d3-4d40-998c-1a0bac65eabc,265
4,b85fcaef67d2669cd99b334b5e8c8705263db2cf,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,220


In [17]:
train_merge = train.merge(profiles,left_on='user', right_on='user', how='inner')

In [27]:
cols = train_merge.columns
for col in cols:
    print col,":",np.sum(np.array(train_merge.isnull()[col]))

user : 0
artist : 0
plays : 0
sex : 345378
age : 792303
country : 0


In [28]:
cols = profiles.columns
for col in cols:
    print col,":",np.sum(np.array(profiles.isnull()[col]))

user : 0
sex : 19535
age : 44842
country : 0


#### Generating the Metadata for the data generation

#### Very important to run the cells below to create metadata

In [18]:
train = pd.read_csv('../data/train.csv')
profiles = pd.read_csv('../data/profiles.csv')
train_merge = train.merge(profiles,left_on='user', right_on='user', how='inner')

In [82]:
unique_countries = sorted(pd.unique(train_merge['country']))
unique_sex = pd.unique(train_merge['sex'])
unique_sex = [x for x in unique_sex if str(x) != 'nan']
mean_age = np.mean(train_merge['age'])
max_age = 121.0

In [83]:
#Generating distribution for sex
sex_dist = []
for sex in unique_sex:
    sex_dist.append(float(len(train_merge[train_merge['sex'] == sex])))
sex_dist = sex_dist/np.sum(sex_dist)

In [84]:
metadata = {}
metadata['countries'] = unique_countries
metadata['sex'] = unique_sex
metadata['mean_age'] = mean_age
metadata['max_age'] = max_age
metadata['sex_dist'] = sex_dist

#### Function to vectorize the output

In [91]:
def create_vector(row,metadata=metadata):
    countries = metadata['countries']
    num_countries = len(countries)
    country_num = countries.index(row['country'])
    #initialization of empty country array
    country_array = np.zeros(num_countries)
    country_array[country_num] = 1.
    #adding to return list
    return_list = list(country_array)
    #adding a one hot vector for sex
    sexes = metadata['sex']
    num_sexes = len(metadata['sex'])
    if row['sex'] not in sexes:
        sex = np.random.choice(sexes, size=1, p=metadata['sex_dist'])[0]
    else:
        sex = row['sex']
    sex_num = sexes.index(sex)
    sex_array = np.zeros(num_sexes)
    sex_array[sex_num] = 1.
    #adding to return list
    return_list = return_list + list(sex_array)
    #checking for age
    lower_bound = 0
    upper_bound = 150
    age = row['age']
    if age < lower_bound or age > upper_bound:
        age = metadata['mean_age']/metadata['max_age']
    elif str(age) == 'nan':
        age = metadata['mean_age']/metadata['max_age']
    else:
        age = age/metadata['max_age']
    return_list = return_list + [age]
    return return_list

#### Example application of the code above

In [97]:
train_merge_test = train_merge.head()

In [98]:
train_merge_test['vec'] = train_merge_test.apply(create_vector, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [99]:
train_merge_test['vec']

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: vec, dtype: object