In [1]:
def apk(actual, predicted, k=3):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    
    actual = list(actual)
    predicted = list(predicted)
    
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
            
    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [2]:
import pandas as pd
import sys,os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

os.chdir('C:/Users/arpit.goel/Documents/Projects/Kaggle/24_RecommendationDesign')
df_train=pd.read_csv('01.RawData/train.csv')
df_test=pd.read_csv('01.RawData/test.csv')
df_chal=pd.read_csv('01.RawData/challenge_data.csv')
df_sample=pd.read_csv('01.RawData/sample_submission_J0OjXLi_DDt3uQN.csv')
df_chal['publish_date']=pd.to_datetime(df_chal['publish_date'],format='%d-%m-%Y')
df_chal.sort_values(by='publish_date',inplace=True)
df_chal['challenge_id']=np.arange(len(df_chal))

In [29]:
users=df_train[['user_id']].drop_duplicates()
np.random.seed(1234)
users['dtype']=np.where(np.random.rand(len(users))>0.90,'valid','train')
users=users.set_index('user_id')['dtype']
df_train['ds']=df_train['user_id'].map(users)
df_test['ds']='test'
master=pd.merge(pd.concat([df_train,df_test]),df_chal,left_on=['challenge'],right_on=['challenge_ID'])
master.sort_values(by=['user_id','challenge_sequence'],inplace=True)


In [31]:
def get_datasegment(master,valid=True):
    if valid==True:
        a1=master[master['ds'].isin(['train'])]
        a2=master[master['ds'].isin(['valid'])]
        a3=a2[a2['challenge_sequence']<=10]
        a4=a2[a2['challenge_sequence']>10]
        return a1,a3,a4
    else:
        return master[master['ds'].isin(['train','valid'])],master[master['ds'].isin(['test'])],[]

train,test,target=get_datasegment(master,valid=True)

In [150]:
challenge_counts=train['challenge_id'].value_counts().head(13)
test_challenges=test.groupby(['user_id'])['challenge_id'].apply(lambda x: set(x)).to_frame('challenges')
predicted_challenges={}
for name,row in test_challenges.iterrows():
    predictions=challenge_counts.index.difference(row['challenges'])
    predicted_challenges[name]=challenge_counts[predictions].sort_values(ascending=False).head(3).values

In [152]:
prediction=pd.Series(predicted_challenges)
actual=target.groupby(['user_id'])['challenge_id'].apply(lambda x: list(x))
mapk(actual,prediction)

0.0

In [54]:
np.arange(60,101,10)/100.0

array([ 0.6,  0.7,  0.8,  0.9,  1. ])

In [62]:
a1=pd.concat([train,test])
a1['sample']=a1['user_id']%10

freq_items_lookup={}
for decay_factor in [0.75]:
    freq_items=[]
    group=a1[a1['sample']==1][['user_id','challenge_id','challenge_sequence']]
    
    for name,group in a1.groupby('sample'):
        a2=pd.merge(group,group,on=['user_id'])
        a2['wgt']=a2['challenge_sequence_x']-a2['challenge_sequence_y']
        a2['wgt']=np.where(a2['wgt']>0,0.75,decay_factor)**np.abs(a2['wgt'])
        summ=a2.groupby(['challenge_id_x','challenge_id_y'])['wgt'].sum().reset_index()
        freq_items.append(summ)

    freq_items=pd.concat(freq_items).groupby(['challenge_id_x','challenge_id_y'])['wgt'].sum()
    freq_items=freq_items.sort_values(ascending=False).to_frame('count_comb').reset_index()
    freq_items=freq_items[freq_items['count_comb']>20]
    freq_items_lookup[decay_factor]=freq_items

In [63]:
predicted_challenges={}

for decay_factor in [0.75]:
    freq_items=freq_items_lookup[decay_factor]
    test['wgt']=test['challenge_sequence'].map(lambda x: 0.075**(10-x))
    for name,group in test[['user_id','challenge_id','challenge_sequence','wgt']].groupby(['user_id']):
        t1=pd.merge(group,freq_items,left_on=['challenge_id'],right_on=['challenge_id_x'])
        t1=t1[~t1['challenge_id_y'].isin(group['challenge_id'])]
        t1['count_comb']=t1['count_comb']*t1['wgt']
        predicted_challenges[name]=t1.groupby('challenge_id_y')['count_comb'].sum().sort_values(ascending=False).head(3).index.tolist()
        #predicted_challenges[name]=t1.groupby('challenge_id_y')['count_comb'].sum().sort_values(ascending=False).head(20).index.tolist()

    prediction=pd.Series(predicted_challenges)
    print decay_factor,mapk(actual[prediction.index],prediction)

0.75 0.160274969545


In [12]:
actual=target.groupby(['user_id'])['challenge_id'].apply(lambda x: list(x))


In [10]:
valid_predictions

Unnamed: 0,challenge_id,user_id
0,1501,4582
1,2298,4582
2,769,4582


In [286]:
valid_predictions=[]
for key,value in predicted_challenges.items():
    t1=pd.Series(value).to_frame('challenge_id')
    t1['user_id']=key
    valid_predictions.append(t1)
valid_predictions=pd.concat(valid_predictions)

In [324]:
valid_predictions['rank']=valid_predictions.index
t1=pd.merge(valid_predictions,target[['user_id','challenge_id','challenge_sequence']],on=['user_id','challenge_id'],how='left')
t1['tgt']=t1['challenge_sequence'].notnull()
t1['publish_date']=t1['challenge_id'].map(df_chal.set_index('challenge_id')['publish_date'])
t1['max_publish_date']=t1['user_id'].map(test.groupby('user_id')['publish_date'].max())
t1['last_publish_date']=t1['user_id'].map(test[test['challenge_sequence']==10].set_index('user_id')['publish_date'])
t1['delta_date1']=(t1['publish_date']-t1['max_publish_date']).dt.days
t1['delta_date2']=(t1['publish_date']-t1['last_publish_date']).dt.days
t1.head()

Unnamed: 0,challenge_id,user_id,rank,challenge_sequence,tgt,publish_date,max_publish_date,last_publish_date,delta_date1,delta_date2
0,465,98305,0,,False,2004-09-15,2007-08-25,2006-08-02,-1074,-686
1,1296,98305,1,,False,2006-06-12,2007-08-25,2006-08-02,-439,-51
2,2128,98305,2,,False,2007-07-02,2007-08-25,2006-08-02,-54,334
3,1458,98305,3,,False,2006-07-27,2007-08-25,2006-08-02,-394,-6
4,1501,98305,4,,False,2006-09-26,2007-08-25,2006-08-02,-333,55


In [329]:
for th in range(-300,0,10):
    t2=t1[t1['delta_date1']>=th]
    t2['rank_new']=t2.groupby('user_id')['rank'].rank()
    print th,t2[t2['rank_new']<=3]['tgt'].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


-300 6592
-290 6578
-280 6535
-270 6515
-260 6505
-250 6508
-240 6508
-230 6497
-220 6502
-210 6489
-200 6463
-190 6377
-180 6348
-170 6342
-160 6334
-150 6320
-140 6306
-130 6307
-120 6301
-110 6297
-100 6282
-90 6232
-80 6226
-70 6206
-60 6204
-50 6206
-40 6210
-30 6092
-20 5932
-10 5528


In [331]:
for th in range(-600,0,50):
    t2=t1[t1['delta_date2']>=th]
    t2['rank_new']=t2.groupby('user_id')['rank'].rank()
    print th,t2[t2['rank_new']<=3]['tgt'].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


-600 7107
-550 7114
-500 7111
-450 7098
-400 7070
-350 7072
-300 7083
-250 7037
-200 7043
-150 7001
-100 6987
-50 6998


In [336]:
# t1.groupby('user_id')['tgt'].sum()
# 4582      4585      
master[master['user_id']==4582]


Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge,ds,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id,challenge_id
12460,4582_1,4582,1,CI23855,valid,CI23855,1,SI2468,20993.0,2006-07-24,AI563823,M,,29.0,1442
89260,4582_2,4582,2,CI24915,valid,CI24915,1,SI2545,7389.0,2007-07-02,AI563790,F,AOI100154,29.0,2126
27643,4582_3,4582,3,CI24917,valid,CI24917,1,SI2545,43409.0,2007-07-02,AI564588,F,AOI100581,66.0,2128
52093,4582_4,4582,4,CI23933,valid,CI23933,1,SI2468,15086.0,2006-07-27,AI563783,M,AOI100149,31.0,1458
40016,4582_5,4582,5,CI23663,valid,CI23663,1,SI2472,8897.0,2004-09-15,AI563724,M,AOI100098,45.0,465
101977,4582_6,4582,6,CI24958,valid,CI24958,1,SI2545,6842.0,2007-07-09,AI564617,F,AOI100085,41.0,2162
73635,4582_7,4582,7,CI23975,valid,CI23975,1,SI2462,9204.0,2005-01-24,AI563823,M,,61.0,633
2,4582_8,4582,8,CI23714,valid,CI23714,1,SI2477,14723.0,2006-06-12,AI563766,M,AOI100129,29.0,1296
172811,4582_9,4582,9,CI24953,valid,CI24953,1,SI2545,3341.0,2007-07-05,AI563708,M,AOI100085,46.0,2157
178196,4582_10,4582,10,CI24944,valid,CI24944,1,SI2545,3274.0,2007-07-04,AI563790,F,AOI100154,114.0,2152


In [339]:
pd.merge(valid_predictions[valid_predictions['user_id']==4582],df_chal,on='challenge_id')

Unnamed: 0,challenge_id,user_id,rank,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,1501,4582,0,CI24530,1,SI2501,6814.0,2006-09-26,AI563815,M,AOI100173,29.0
1,2298,4582,1,CI25135,1,SI2556,5446.0,2007-08-25,AI563754,M,AOI100129,61.0
2,769,4582,2,CI23691,1,SI2469,4823.0,2005-05-25,AI563749,M,AOI100125,33.0
3,469,4582,3,CI23848,1,SI2472,3395.0,2004-09-16,AI563867,M,AOI100204,61.0
4,772,4582,4,CI23769,1,SI2469,3796.0,2005-05-25,AI563754,M,AOI100129,46.0
5,2284,4582,5,CI25126,1,SI2556,4269.0,2007-08-21,AI563872,M,AOI100085,46.0
6,1495,4582,6,CI24527,1,SI2501,6726.0,2006-09-26,AI564348,F,AOI100032,46.0
7,1464,4582,7,CI23648,1,SI2468,2186.0,2006-07-31,AI563691,M,AOI100086,30.0
8,290,4582,8,CI24187,1,SI2463,2836.0,2003-08-07,AI564088,M,AOI100316,46.0
9,2164,4582,9,CI24968,1,SI2545,1401.0,2007-07-11,AI563778,F,AOI100145,30.0


In [267]:
output=[]
for key,value in predicted_challenges.items():
    output.append([str(int(key))+'_11',value[0] if len(value)>=1 else 1])
    output.append([str(int(key))+'_12',value[1] if len(value)>=2 else 2])
    output.append([str(int(key))+'_13',value[2] if len(value)>=3 else 3])
predicted_challenges_df=pd.DataFrame(output,columns=['user_sequence','challenge_id'])
predicted_challenges_df=pd.merge(predicted_challenges_df,df_chal[['challenge_id','challenge_ID']],on='challenge_id')
predicted_challenges_df['challenge']=predicted_challenges_df['challenge_ID']
predicted_challenges_df[['user_sequence','challenge']].to_csv('03.Submissions/01.FreqItemsets2.csv',index=False)

In [247]:
prediction=pd.Series(predicted_challenges)
actual=target.groupby(['user_id'])['challenge_id'].apply(lambda x: list(x))
mapk(actual,prediction)

0.10434293772391187

In [252]:
freq_items['count_comb'].describe()

count    154061.000000
mean         89.924997
std         239.549817
min          11.000000
25%          16.000000
50%          28.000000
75%          71.000000
max       15184.000000
Name: count_comb, dtype: float64

In [232]:
freq_items['user_id'].describe(percentiles=np.arange(0,100,5)/100.0)

count    395085.000000
mean         12.266373
std          56.891493
min           1.000000
0%            1.000000
5%            1.000000
10%           1.000000
15%           1.000000
20%           1.000000
25%           1.000000
30%           1.000000
35%           1.000000
40%           1.000000
45%           1.000000
50%           2.000000
55.0%         2.000000
60%           2.000000
65%           3.000000
70%           4.000000
75%           5.000000
80%           7.000000
85%          10.000000
90%          18.000000
95%          44.000000
max        2607.000000
Name: user_id, dtype: float64

In [223]:
freq_items

Unnamed: 0,challenge_id_x,challenge_id_y,user_id
25574,128,134,2607
80100,465,1442,2210
195888,1296,1442,2175
15816,69,71,2037
26323,134,135,1985
212994,1442,2128,1982
25575,128,135,1975
26011,132,133,1972
15543,68,71,1910
80024,465,1296,1906
