In [1]:
def apk(actual, predicted, k=3):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists. 0.0 is returned
            when ``actual`` is empty or when ``k`` is not positive.
    """
    actual = list(actual)

    # Nothing to score against, or no prediction slots allowed: return before
    # the final division, which would otherwise divide by zero (k == 0) or by
    # a negative number (k < 0).
    if not actual or k <= 0:
        return 0.0

    # Only the first k predictions are scored.
    predicted = list(predicted)[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A prediction repeated later in the list is only credited once.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Computes the mean average precision at k.

    Each entry of `actual` is paired with the entry of `predicted` at the same
    position, every pair is scored with `apk`, and the scores are averaged.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_pair_scores = []
    for truth, guesses in zip(actual, predicted):
        per_pair_scores.append(apk(truth, guesses, k))
    return np.mean(per_pair_scores)

In [2]:
import pandas as pd
import sys,os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# NOTE(review): hardcoded, machine-specific working directory; every relative
# path below is resolved against it.
os.chdir('C:/Users/arpit.goel/Documents/Projects/Kaggle/24_RecommendationDesign')
df_train=pd.read_csv('01.RawData/train.csv')
df_test=pd.read_csv('01.RawData/test.csv')
df_chal=pd.read_csv('01.RawData/challenge_data.csv')
# Loaded but not referenced again in the cells shown here.
df_sample=pd.read_csv('01.RawData/sample_submission_J0OjXLi_DDt3uQN.csv')
# Parse publish dates (day-month-year) and sort challenges by them, so the
# integer `challenge_id` assigned next follows publish-date order.
df_chal['publish_date']=pd.to_datetime(df_chal['publish_date'],format='%d-%m-%Y')
df_chal.sort_values(by='publish_date',inplace=True)
df_chal['challenge_id']=np.arange(len(df_chal))

# Split by user, not by row: with a fixed seed, each distinct training user is
# labelled 'valid' when its uniform draw exceeds 0.9, otherwise 'train'.
users=df_train[['user_id']].drop_duplicates()
np.random.seed(1234)
users['dtype']=np.where(np.random.rand(len(users))>0.9,'valid','train')
# Turn the frame into a user_id -> label Series used as a lookup below.
users=users.set_index('user_id')['dtype']
df_train['ds']=df_train['user_id'].map(users)
df_test['ds']='test'
# Stack train and test rows and attach the challenge attributes (including the
# integer `challenge_id`) by joining `challenge` to `challenge_ID`.
master=pd.merge(pd.concat([df_train,df_test]),df_chal,left_on=['challenge'],right_on=['challenge_ID'])
# Order each user's rows by their position in that user's challenge sequence.
master.sort_values(by=['user_id','challenge_sequence'],inplace=True)


In [3]:
def get_datasegment(master,valid=True,seq_cutoff=10):
    """
    Split `master` into (train, test, target) frames using its `ds` column.

    Parameters
    ----------
    master : pandas.DataFrame
        Must contain a `ds` column holding 'train' / 'valid' / 'test' and a
        `challenge_sequence` column.
    valid : bool, optional
        When true, build a validation split; otherwise build the final
        train/test split.
    seq_cutoff : int, optional
        In validation mode, rows of 'valid' users with
        `challenge_sequence <= seq_cutoff` are the observed part and rows
        above it are the held-out target. Defaults to 10.

    Returns
    -------
    tuple
        Validation mode: (rows with ds == 'train',
        'valid' rows at or below the cutoff, 'valid' rows above the cutoff).
        Otherwise: (rows with ds in {'train', 'valid'}, rows with
        ds == 'test', []) -- the third element is an empty list because no
        target rows are selected in that mode.
    """
    if valid:
        train_rows=master[master['ds'].isin(['train'])]
        valid_rows=master[master['ds'].isin(['valid'])]
        observed=valid_rows[valid_rows['challenge_sequence']<=seq_cutoff]
        held_out=valid_rows[valid_rows['challenge_sequence']>seq_cutoff]
        return train_rows,observed,held_out

    fit_rows=master[master['ds'].isin(['train','valid'])]
    test_rows=master[master['ds'].isin(['test'])]
    return fit_rows,test_rows,[]

# Validation-mode split: `train` = rows of 'train' users, `test` = rows of
# 'valid' users with challenge_sequence <= 10, `target` = their later rows.
train,test,target=get_datasegment(master,valid=True)

In [70]:
# Re-run the validation-mode split (same call as in the cell that defines
# get_datasegment).
train,test,target=get_datasegment(master,valid=True)
# All rows that are not held out as `target`.
train_master=pd.concat([train,test])
# Number of rows per challenge (despite the name, this holds counts, not a
# sequence).
challenge_seq=train_master['challenge'].value_counts()


In [71]:
# Summary of the per-challenge row counts, with percentiles 0%, 10%, ..., 90%.
challenge_seq.describe(percentiles=np.arange(0,100,10)/100.0)

count    5320.000000
mean      165.948684
std       560.523101
min         1.000000
0%          1.000000
10%         1.000000
20%         2.000000
30%         4.000000
40%         7.000000
50%        13.000000
60%        23.000000
70%        47.000000
80%       123.000000
90%       369.000000
max      9900.000000
Name: challenge, dtype: float64

In [15]:
a1=pd.concat([train,test])
# Bucket users by user_id modulo 10 so the per-user self-join below runs on
# ten smaller frames instead of one large one.
a1['sample']=a1['user_id']%10

# decay_factor -> table of challenge pairs with their summed co-occurrence weight.
freq_items_lookup={}
for decay_factor in [0.75]:
    freq_items=[]

    for name,group in a1.groupby('sample'):
        # Self-join only the three columns the weighting needs; joining the
        # full-width frame would duplicate every other column as *_x / *_y.
        pair_source=group[['user_id','challenge_id','challenge_sequence']]
        # Every ordered pair (x, y) of challenges taken by the same user,
        # including x paired with itself (gap 0, weight 1).
        a2=pd.merge(pair_source,pair_source,on=['user_id'])
        a2['wgt']=a2['challenge_sequence_x']-a2['challenge_sequence_y']
        # Weight = base ** |gap|; base is decay_factor when x comes after y in
        # the user's sequence and 0.75 otherwise.
        # NOTE(review): the 0.75 fallback only equals decay_factor for the
        # single value currently looped over -- confirm this is intended.
        a2['wgt']=np.where(a2['wgt']>0,decay_factor,0.75)**np.abs(a2['wgt'])
        summ=a2.groupby(['challenge_id_x','challenge_id_y'])['wgt'].sum().reset_index()
        freq_items.append(summ)

    # Combine the per-bucket sums into one total per challenge pair.
    freq_items=pd.concat(freq_items).groupby(['challenge_id_x','challenge_id_y'])['wgt'].sum()
    freq_items=freq_items.sort_values(ascending=False).to_frame('count_comb').reset_index()
    # Keep only pairs whose summed weight exceeds 20.
    freq_items=freq_items[freq_items['count_comb']>20]
    freq_items_lookup[decay_factor]=freq_items

In [56]:
a1=pd.concat([train,test])
# Bucket users by user_id modulo 10 so the per-user self-join below runs on
# ten smaller frames instead of one large one.
a1['sample']=a1['user_id']%10

freq_items=[]

for name,group in a1.groupby('sample'):
    # Self-join only the three columns the weighting needs; joining the
    # full-width frame would duplicate every other column as *_x / *_y.
    pair_source=group[['user_id','challenge_id','challenge_sequence']]
    # Every ordered pair (x, y) of challenges taken by the same user ...
    a2=pd.merge(pair_source,pair_source,on=['user_id'])
    # ... excluding pairs at the same sequence position (a row with itself).
    a2=a2[a2['challenge_sequence_x']!=a2['challenge_sequence_y']]
    a2['wgt']=a2['challenge_sequence_x']-a2['challenge_sequence_y']
    # Weight = base ** |gap|: base 0.7 when x comes after y in the user's
    # sequence, 0.8 when x comes before y.
    a2['wgt']=np.where(a2['wgt']>0,0.7,0.8)**np.abs(a2['wgt'])
    summ=a2.groupby(['challenge_id_x','challenge_id_y'])['wgt'].sum().reset_index()
    freq_items.append(summ)

# Combine the per-bucket sums into one total per challenge pair.
freq_items=pd.concat(freq_items).groupby(['challenge_id_x','challenge_id_y'])['wgt'].sum()
freq_items=freq_items.sort_values(ascending=False).to_frame('count_comb').reset_index()
# Keep only pairs whose summed weight exceeds 10.
freq_items=freq_items[freq_items['count_comb']>10]



In [62]:
# Bare expression that is not the last line of the cell: evaluated, not shown.
freq_items.shape
# Stricter cut of the pair table (summed weight > 20). `t1` is read by the
# triple-weight cell (In [63]); note the prediction cell (In [16]) also
# assigns a variable named `t1`.
t1=freq_items[freq_items['count_comb']>20]

In [69]:
from keras import backend as K
import os

def set_keras_backend(backend):
    """Make `backend` the active Keras backend, reloading `K` if it differs.

    Does nothing when `K.backend()` already equals `backend`. Otherwise sets
    the KERAS_BACKEND environment variable, reloads the backend module `K`,
    and asserts that the switch took effect.
    """
    if K.backend() == backend:
        return

    os.environ['KERAS_BACKEND'] = backend
    reload(K)
    assert K.backend() == backend

# Select the Theano backend before the Keras layer/model imports that follow.
set_keras_backend("theano")

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

ImportError: cannot import name np_utils

In [63]:
a1=pd.concat([train,test])
a1['sample']=a1['user_id']%100
freq_items_3=[]
# Only the bucket user_id % 100 == 1 is processed; to cover every bucket,
# loop `for name,group in a1.groupby('sample')` over the steps below and
# append one `summ` per bucket.
group=a1[a1['sample']==1][['user_id','challenge_id','challenge_sequence']]

# Every ordered pair (x, y) of distinct challenges taken by the same user.
a2=pd.merge(group,group,on=['user_id'])
a2=a2[a2['challenge_id_x']!=a2['challenge_id_y']]
# Keep only pairs present in `t1` (built in the In [62] cell from freq_items).
a2=pd.merge(a2,t1[['challenge_id_x','challenge_id_y']],on=['challenge_id_x','challenge_id_y'])
a2['wgt']=a2['challenge_sequence_x']-a2['challenge_sequence_y']
# Weight = base ** |gap|: base 0.7 when x comes after y in the user's
# sequence, 0.8 when x comes before y.
a2['wgt']=np.where(a2['wgt']>0,0.7,0.8)**np.abs(a2['wgt'])

# Build a3 from THIS cell's pairs. Previously a3 was self-merged without ever
# being assigned here, so the cell either used a stale `a3` left over from
# another cell or failed with NameError. Only the four needed columns are
# carried into the self-join to limit memory.
a3=a2[['user_id','wgt','challenge_id_x','challenge_id_y']]
# Pair up two different partners (y_x, y_y) of the same anchor challenge x
# for the same user; the triple's weight is the product of both pair weights.
a3=pd.merge(a3,a3,on=['user_id','challenge_id_x'])
a3=a3[a3['challenge_id_y_x']!=a3['challenge_id_y_y']]
a3['wgt']=a3['wgt_x']*a3['wgt_y']
summ=a3.groupby(['challenge_id_x','challenge_id_y_x','challenge_id_y_y'])['wgt'].sum().reset_index()
freq_items_3.append(summ)

freq_items_3=pd.concat(freq_items_3).groupby(['challenge_id_x','challenge_id_y_x','challenge_id_y_y'])['wgt'].sum()
freq_items_3=freq_items_3.sort_values(ascending=False).to_frame('count_comb').reset_index()
# Optional pruning, currently disabled:
# freq_items_3=freq_items_3[freq_items_3['count_comb']>10]


MemoryError: 

In [None]:


    
# Scratch version of the triple-weight computation for a single bucket.
# NOTE(review): `group` is not defined in this cell; it must already exist
# from another executed cell. `if 1==1:` only keeps the indentation of the
# commented-out loop below.
if 1==1:
# for name,group in a1.groupby('sample'):
    # Every ordered pair (x, y) of rows of the same user at different
    # sequence positions.
    a2=pd.merge(group,group,on=['user_id'])
    a2=a2[a2['challenge_sequence_x']!=a2['challenge_sequence_y']]
    a2['wgt']=a2['challenge_sequence_x']-a2['challenge_sequence_y']
    # Weight = base ** |gap|: base 0.7 when x comes after y, 0.8 otherwise.
    a2['wgt']=np.where(a2['wgt']>0,0.7,0.8)**np.abs(a2['wgt'])
    summ=a2.groupby(['challenge_id_x','challenge_id_y'])['wgt'].sum().reset_index()
    # Join the per-user pairs to the pair keys in `summ`.
    # NOTE(review): `summ` is built from `a2` itself without filtering, so
    # this join looks like it keeps every row -- confirm whether a threshold
    # on `summ` was intended.
    a3=pd.merge(a2[['user_id','wgt','challenge_id_x','challenge_id_y']],summ[['challenge_id_x','challenge_id_y']],on=['challenge_id_x','challenge_id_y'])
    # Pair up two different partners (y_x, y_y) of the same anchor challenge x
    # for the same user; the triple's weight is the product of both weights.
    a3=pd.merge(a3,a3,on=['user_id','challenge_id_x'])
    a3=a3[a3['challenge_id_y_x']!=a3['challenge_id_y_y']]
    a3['wgt']=a3['wgt_x']*a3['wgt_y']
    summ2=a3.groupby(['challenge_id_x','challenge_id_y_x','challenge_id_y_y'])['wgt'].sum().reset_index()
#     freq_items.append(summ)
summ2.head()

In [52]:
# Summary of the triple weights, with percentiles 0%, 10%, ..., 90%.
summ2['wgt'].describe(percentiles=np.arange(0,100,10)/100.0)

count    5.752578e+06
mean     2.552593e-01
std      7.392829e-01
min      2.736875e-04
0%       2.736875e-04
10%      1.384129e-02
20%      2.880607e-02
30%      5.497558e-02
40%      8.235430e-02
50%      1.123942e-01
60%      1.616295e-01
70%      2.195200e-01
80%      3.276800e-01
90%      5.120000e-01
max      6.316373e+01
Name: wgt, dtype: float64

In [55]:
# Summary of the summed pair weights, with percentiles 0%, 10%, ..., 90%.
freq_items['count_comb'].describe(percentiles=np.arange(0,100,10)/100.0)

count    828520.000000
mean          4.526124
std          24.475960
min           0.013841
0%            0.013841
10%           0.117649
20%           0.240100
30%           0.343000
40%           0.490000
50%           0.640000
60%           0.800000
70%           1.233685
80%           2.215685
90%           5.932357
max        1615.844018
Name: count_comb, dtype: float64

In [16]:
# user_id -> list of up to 3 recommended challenge_ids (best first).
# Requires `actual` from the cell that builds it out of `target` (In [6]).
predicted_challenges={}

for decay_factor in [0.75]:
    freq_items=freq_items_lookup[decay_factor]
    # Work on a copy so the recency weight is not written into the global
    # `test` frame (a slice of `master`), which other cells concatenate again.
    history=test[['user_id','challenge_id','challenge_sequence']].copy()
    # Recency weight: 1 at challenge_sequence 10, shrinking by a factor of
    # 0.075 for each step further back.
    # NOTE(review): 0.075 differs from decay_factor (0.75) by a factor of ten
    # -- confirm it is not a typo.
    history['wgt']=history['challenge_sequence'].map(lambda x: 0.075**(10-x))
    # Group by the scalar key: grouping by the list ['user_id'] yields 1-tuple
    # group names on recent pandas, which would no longer match `actual`'s index.
    for name,group in history.groupby('user_id'):
        # Candidate challenges y that co-occur with any challenge x the user
        # has taken. Named `candidates` rather than `t1` so this loop does not
        # clobber the global `t1` that the In [62] cell defines.
        candidates=pd.merge(group,freq_items,left_on=['challenge_id'],right_on=['challenge_id_x'])
        # Never recommend a challenge the user has already taken.
        candidates=candidates[~candidates['challenge_id_y'].isin(group['challenge_id'])]
        # Scale each pair weight by how recent the user's source challenge is.
        candidates=candidates.assign(count_comb=candidates['count_comb']*candidates['wgt'])
        # Top 3 candidates by total score (use head(20) for a longer list).
        predicted_challenges[name]=candidates.groupby('challenge_id_y')['count_comb'].sum().sort_values(ascending=False).head(3).index.tolist()

    prediction=pd.Series(predicted_challenges)
    # Formatted so the output is identical under Python 2 and Python 3.
    print('%s %s' % (decay_factor,mapk(actual[prediction.index],prediction)))

0.75 0.160274969545


In [6]:
# Held-out challenge_ids per user, as one list per user_id. The scoring cell
# (In [16]) reads `actual`, so this cell has to be executed before it.
actual=(
    target
    .groupby(['user_id'])['challenge_id']
    .apply(list)
)



