In [None]:
import sys
import csv
import time
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm import tqdm_notebook
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from scipy import spatial
import datetime
import ipdb
#Here we implement the algorithm described in the NIPS article
#"Collaborative Ranking With 17 Parameters"
#Input: target_list
#Output: 15 extracted statistical features


#pt - path to the directory of the provided dataset
#neigh - number of nearest neighbors 
#st - path to the directory where you want to save the file
def _win_los_tie(target_score, scores):
    """Return the (win, los, tie) fractions of target_score within scores.

    `scores` holds every score of one neighbour, including the target score
    itself. A "win" is a score strictly below the target, a "tie" an equal
    score (the target's own entry excluded) and everything else is a "los".
    """
    n_tie = int(np.count_nonzero(scores == target_score))
    n_win = int(np.count_nonzero(scores < target_score))
    n_los = len(scores) - n_tie - n_win
    n_tie -= 1  # the target score was compared with itself
    total = n_win + n_tie + n_los
    if total == 0:
        # The neighbour interacted with the target item only, so there is
        # nothing to compare against (this used to divide by zero).
        return 0, 0, 0
    return float(n_win) / total, float(n_los) / total, float(n_tie) / total


def _summarize(values, nonzero_norm):
    """Return [mean, variance, max, min, non-zero count / nonzero_norm], rounded to 4 digits."""
    return [
        round(np.mean(values), 4),
        round(np.var(values), 4),
        round(np.max(values), 4),
        round(np.min(values), 4),
        round(float(np.count_nonzero(values)) / nonzero_norm, 4),
    ]


def NIPS_alg(pt, neigh, st, nonzero_norm=50):
    """Extract win/los/tie neighbourhood statistics for every user-item row.

    Reads ``<pt>/1_target_list.csv`` (columns used: user_id, item_id,
    displayed, interaction_type), finds the nearest users by cosine distance
    on the binarised user-item matrix, and for each (user, item, displayed)
    row compares the score each neighbour gave to that item with the
    neighbour's other scores.

    Parameters
    ----------
    pt : path of the directory that contains ``1_target_list.csv``.
    neigh : ``n_neighbors`` passed to the kNN model. The first returned
        neighbour is treated as the user itself and discarded, so
        ``neigh - 1`` neighbours are actually compared.
    st : path of the directory where ``final_sorted_50.csv`` is written.
    nonzero_norm : divisor applied to the number of neighbours with a
        non-zero fraction. The default of 50 is the value that used to be
        hard-coded.
        NOTE(review): it is independent of ``neigh`` - confirm whether
        ``neigh - 1`` was intended.

    Returns
    -------
    pandas.DataFrame indexed by user_id with the columns item_id, displayed,
    click, book, reply, WIN, LOS and TIE. Each of WIN/LOS/TIE holds a list
    ``[mean, variance, max, min, non-zero share]`` over the neighbours. The
    same frame is written to ``<st>/final_sorted_50.csv``.
    """
    target_list = pd.read_csv(str(pt) + "/1_target_list.csv", sep=',')
    print("number of uniq users_int", target_list['user_id'].nunique())
    print("number of uniq offers_int", target_list['item_id'].nunique())

    grouped = target_list.groupby(['user_id', 'item_id', 'displayed'])['interaction_type']

    # One row per (user, item, displayed) with 0/1 flags for the interaction
    # codes 1 (click), 2 (book) and 3 (reply).
    sort = grouped.apply(lambda x: set(x)).reset_index().set_index('user_id')
    for column, code in (('click', 1), ('book', 2), ('reply', 3)):
        sort[column] = sort['interaction_type'].apply(
            lambda kinds, code=code: 1 if code in kinds else 0)
    sort = sort[['item_id', 'displayed', 'click', 'book', 'reply']]

    # Score of a row = sum of its distinct interaction codes (e.g. 1+2+3 = 6).
    matr = grouped.apply(lambda x: sum(set(x))).reset_index()

    # The kNN model only sees whether any interaction happened at all.
    data = np.where(matr['interaction_type'] > 1, 1, matr['interaction_type'])
    person_u = sorted(matr['user_id'].unique())
    thing_u = sorted(matr['item_id'].unique())
    # pd.Categorical replaces astype('category', categories=...), which
    # current pandas no longer accepts.
    row = pd.Categorical(matr['user_id'], categories=person_u).codes
    col = pd.Categorical(matr['item_id'], categories=thing_u).codes
    sparse_matrix = csr_matrix((data, (row, col)), shape=(len(person_u), len(thing_u)))

    model_kNN = NearestNeighbors(algorithm='auto', metric='cosine', n_neighbors=neigh, p=2).fit(sparse_matrix)
    _, indices = model_kNN.kneighbors(sparse_matrix)
    # Column 0 is taken to be the user itself; the remaining columns are
    # translated from matrix row numbers back to user ids.
    neighbor_ids = np.asarray(person_u)[indices[:, 1:]]
    neighbors_of = dict(zip(person_u, neighbor_ids))

    # Per-user (item ids, scores) lookup, built once instead of filtering the
    # whole frame for every (row, neighbour) pair.
    history = {
        user: (rows['item_id'].values, rows['interaction_type'].values)
        for user, rows in matr.groupby('user_id')
    }

    users = matr['user_id'].values
    items = matr['item_id'].values
    win, los, tie = [], [], []
    for pos in tqdm_notebook(range(len(matr))):
        target_it = items[pos]
        fractions = []
        for neigh_us in neighbors_of[users[pos]]:
            neigh_items, neigh_scores = history[neigh_us]
            seen = neigh_items == target_it
            if not seen.any():
                # The neighbour never interacted with the target item.
                fractions.append((0, 0, 0))
            else:
                # If the neighbour has several rows for the item (different
                # `displayed` values) the first one is used.
                target_score = int(neigh_scores[seen][0])
                fractions.append(_win_los_tie(target_score, neigh_scores))

        # Rows of per_kind: win, los and tie fractions across the neighbours.
        per_kind = np.reshape(fractions, (-1, 3)).T
        win.append(_summarize(per_kind[0], nonzero_norm))
        los.append(_summarize(per_kind[1], nonzero_norm))
        tie.append(_summarize(per_kind[2], nonzero_norm))

    # `sort` and `matr` come from the same groupby, so their rows line up;
    # reusing sort's index lets concat align them positionally.
    FINAL = pd.DataFrame({'WIN': win, 'LOS': los, 'TIE': tie},
                         index=sort.index, columns=['WIN', 'LOS', 'TIE'])
    sort_last = pd.concat([sort, FINAL], axis=1)
    sort_last.to_csv(str(st) + "/final_sorted_50.csv")
    return sort_last

In [None]:
# Run the feature extraction, reading from and writing to the same directory.
NIPS_alg(pt="/Users/amirasarbaev/Desktop", neigh=15, st="/Users/amirasarbaev/Desktop")

('number of uniq users_int', 5949)
('number of uniq offers_int', 27136)


A Jupyter Widget