In [15]:
import pandas as pd
from scipy import sparse
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
def get_user_projects(user_index):
    """Return the column labels the given user has a positive value for.

    Looks up the row labelled ``user_index`` in the module-level
    ``data_items_train`` frame and keeps only the entries greater than zero.

    NOTE(review): ``data_items_train`` is not defined anywhere in the visible
    notebook; it has to exist in the kernel before this is called.

    :param user_index: row label passed to ``data_items_train.loc``
    :return: array of labels whose value in that row is > 0
    """
    user_row = data_items_train.loc[user_index]
    positive_entries = user_row[user_row > 0]
    return positive_entries.index.values

In [2]:
import numpy as np
def cast_to_list_of_int(lst):
    """Parse a stringified list (e.g. ``"[1, 2, 3]"``) into a list of ints.

    Missing cells yield ``None``.  The missing-value check is done by value
    rather than by identity with ``np.nan``, so any float NaN (and ``None``)
    is recognised, not only the ``np.nan`` singleton.  Values that are already
    a list/tuple are converted element-wise, which makes the calling cell safe
    to re-run on an already-parsed column.

    :param lst: a string holding a Python list literal, a list/tuple, NaN or None
    :return: list of ints, or None for a missing value
    :raises ValueError: if a string is not a valid Python literal or an
        element cannot be converted to int
    """
    # NaN is the only float that is not equal to itself.
    if lst is None or (isinstance(lst, float) and lst != lst):
        return None
    if isinstance(lst, str):
        lst = ast.literal_eval(lst)
    return [int(p) for p in lst]

In [4]:
import re
import ast
import pandas as pd
# Load the recommender output; every per-algorithm column is stored in the CSV
# as a stringified list and is parsed back into a list of ints (None when the
# cell is empty).
output = pd.read_csv('output_D2_scaled.csv')

for list_column in ('recommended', 'popularity', 'general', 'user_user',
                    'item_item', 'associations', 'content'):
    output[list_column] = output[list_column].apply(cast_to_list_of_int)

# 'test' is handled differently: square brackets are stripped and the rest is
# split on whitespace, so its entries remain strings.
output['test'] = output['test'].apply(lambda raw: re.sub(r'[\[\]]', '', raw).split())

In [5]:
def precision_recall_at_k(k):
    """Mean precision@k and recall@k of ``recommended`` against ``test``.

    Iterates over the module-level ``output`` frame.  For each row the first
    ``k`` recommended ids are compared with the held-out ``test`` ids.  Ids are
    compared as strings because ``recommended`` holds ints while ``test``
    holds the string tokens produced when the CSV was parsed.

    Rows where a metric is undefined are left out of that metric's mean
    instead of raising: precision is skipped when there are no recommendations
    (missing/empty list or ``k <= 0``), recall is skipped when ``test`` is
    empty.

    :param k: number of leading recommendations to evaluate
    :return: ``(mean_precision, mean_recall)``; a mean is NaN if no row
        contributed to it
    """
    precisions = []
    recalls = []
    for _, row in output.iterrows():
        recommended = row['recommended']
        held_out = row['test']
        # Missing cells arrive as None/NaN rather than lists.
        top_k = list(recommended)[:k] if isinstance(recommended, (list, tuple)) else []
        relevant = list(held_out) if isinstance(held_out, (list, tuple)) else []

        hits = len({str(p) for p in top_k} & {str(t) for t in relevant})
        if top_k:
            precisions.append(hits / len(top_k))
        if relevant:
            recalls.append(hits / len(relevant))
    return np.mean(precisions), np.mean(recalls)
precision_recall_at_k(3)

(0.13078810180926095, 0.390360099881719)

In [7]:
def get_duplicate_values_in_dict(dictionary):
    """Return the keys of ``dictionary`` whose value is shared with another key.

    Keys are grouped by value (in insertion order inside each group), the
    groups are sorted by comparing their key lists, and the groups that hold
    more than one key are flattened into a single list.

    :param dictionary: mapping with hashable values
    :return: flat list of keys that share their value with at least one other key
    """
    keys_by_value = {}
    for key, value in dictionary.items():
        keys_by_value.setdefault(value, []).append(key)

    # Groups are ordered by their key lists, not by the shared value.
    ordered_groups = sorted(keys_by_value.values())
    return [key for group in ordered_groups if len(group) > 1 for key in group]


def get_best_order(idx, base_order=['item_item', 'user_user', 'content']):
    """Order algorithms by where their first project appears in 'recommended'.

    For the row at position ``idx`` of the module-level ``output`` frame, each
    algorithm in ``base_order`` that has a non-empty project list is ranked by
    the index of that list's first project inside ``recommended``.  Algorithms
    with no projects are left out; ties keep their ``base_order`` order.

    :param idx: positional row index into ``output``
    :param base_order: algorithm column names to rank
    :return: list of algorithm names, earliest leading project first
    :raises ValueError: if an algorithm's first project is not in ``recommended``
    """
    row = output.iloc[idx]
    recommended = row['recommended']

    leading_position = {}
    for alg in base_order:
        projects = row[alg]
        if not projects:
            continue
        leading_position[alg] = recommended.index(projects[0])

    # sorted() is stable, so equal positions preserve insertion order.
    return sorted(leading_position, key=leading_position.get)


def get_kendal_lvinstain_from_ordered_list(idx, list_ordered):
    """Concatenate the per-algorithm project lists in the given order.

    Despite the name, no Kendall / Levenshtein score is computed here: that
    code is commented out below and the concatenated list itself is returned.
    A warning line is printed when the concatenation does not have the same
    length as the row's ``recommended`` list.

    :param idx: positional row index into the module-level ``output`` frame
    :param list_ordered: algorithm column names, in the order to concatenate
    :return: flat list of projects taken from each non-empty algorithm column
    """
    row = output.iloc[idx]
    explained_recs = [
        project
        for alg in list_ordered
        if row[alg]
        for project in row[alg]
    ]
    if len(row['recommended']) != len(explained_recs):
        print("you have a bug!!!!")
#     kendal = (kendalltau(recs, explained_recs)[0] + 1) / 2
#     lvinstain = normalized_damerau_levenshtein_distance(recs, explained_recs)
    return explained_recs

def get_display_order_list(idx):
    """Project list for the fixed "display" ordering of explanation groups.

    The order is: 'associations' first, then get_best_order's default
    algorithms ranked for this row, then 'popularity' / 'general' ranked for
    this row.

    :param idx: positional row index into ``output``
    :return: the concatenated project list for that ordering
    """
    personalised = get_best_order(idx)
    fallback = get_best_order(idx, ['popularity', 'general'])
    output_order = ['associations', *personalised, *fallback]
    return get_kendal_lvinstain_from_ordered_list(idx, output_order)


def get_optimal_order_list(idx):
    """Project list obtained by ranking all six algorithms with get_best_order.

    :param idx: positional row index into ``output``
    :return: the concatenated project list, with algorithm groups ordered by
        where their first project appears in ``recommended``
    """
    all_algs = ['associations', 'content', 'item_item', 'user_user', 'popularity', 'general']
    ranked_algs = get_best_order(idx, all_algs)
    return get_kendal_lvinstain_from_ordered_list(idx, ranked_algs)


def get_larger_size_order_list(idx):
    """Project list with algorithm groups ordered by size, largest first.

    Algorithms with no projects for this row are left out.  Algorithms whose
    lists have the same size are ordered among themselves by get_best_order,
    i.e. by where their first project appears in ``recommended``.

    The tie-break is applied per size group.  Ranking all tied algorithms
    together and writing them back slot by slot lets an algorithm from a
    smaller tie group land in a slot belonging to a larger one whenever there
    is more than one group of ties.

    :param idx: positional row index into the module-level ``output`` frame
    :return: the concatenated project list for that ordering
    """
    explain_algs = ['associations', 'content', 'item_item', 'user_user', 'popularity', 'general']
    row = output.iloc[idx]

    # size -> algorithms with that many projects, in explain_algs order
    algs_by_size = {}
    for alg in explain_algs:
        projects = row[alg]
        if projects:
            algs_by_size.setdefault(len(projects), []).append(alg)

    ordered_algs = []
    for size in sorted(algs_by_size, reverse=True):
        tied = algs_by_size[size]
        # Only real ties need the position lookup.
        ordered_algs.extend(get_best_order(idx, tied) if len(tied) > 1 else tied)
    return get_kendal_lvinstain_from_ordered_list(idx, ordered_algs)


def kendal_lvinstain(order_function):
    """Average the two scores returned by ``order_function`` over all rows.

    ``order_function`` is called with each index label of the module-level
    ``output`` frame and must return exactly two values.  Negative values are
    excluded from the corresponding mean.

    NOTE(review): the ordering functions defined in this notebook
    (get_optimal_order_list, get_larger_size_order_list,
    get_display_order_list) currently return a list of projects rather than a
    score pair, so they cannot be passed here as they stand.

    :param order_function: callable ``index -> (kendal, lvinstain)``
    :return: ``(mean_kendal, mean_lvinstain)``
    """
    kendal_scores = []
    lvinstain_scores = []
    for index, _ in output.iterrows():
        kendal, lvinstain = order_function(index)
        if kendal >= 0:
            kendal_scores.append(kendal)
        if lvinstain >= 0:
            lvinstain_scores.append(lvinstain)
    return np.mean(kendal_scores), np.mean(lvinstain_scores)


In [14]:
get_optimal_order_list(1)

[167, 172, 192, 98, 2774, 173, 75, 91, 97, 2992]

In [12]:
get_larger_size_order_list(1)

[167, 172, 192, 98, 75, 91, 97, 2992, 2774, 173]

In [8]:
get_display_order_list(1)

[173, 167, 172, 192, 98, 75, 91, 2774, 97, 2992]

In [10]:
output.iloc[1]['recommended']

[167, 172, 2774, 173, 75, 192, 97, 98, 91, 2992]

In [11]:
output.iloc[1]

general                                                   [2774]
item_item                                               [75, 91]
popularity                                            [97, 2992]
recommended     [167, 172, 2774, 173, 75, 192, 97, 98, 91, 2992]
test                                                        [91]
user                        89eea5f4-07ff-5a11-b546-5e66a1bfd3d3
user_user                                                   None
associations                                               [173]
content                                      [167, 172, 192, 98]
Name: 1, dtype: object

In [18]:
def add_method_user_index(lst, method, user, index):
    """Prefix ``lst`` in place with ``index``, ``user`` and ``method``.

    :param lst: list to modify (mutated, not copied)
    :param method: label placed third
    :param user: user id placed second
    :param index: value placed first
    :return: the same ``lst`` object, now ``[index, user, method, ...]``
    """
    # Inserting at the front in reverse yields index, user, method.
    for leading_value in (method, user, index):
        lst.insert(0, leading_value)
    return lst


def get_project_explainer(user, project):
    """Label a project with the first letter of the column that contains it.

    Scans the list-valued columns of the first ``output`` row whose 'user'
    equals ``user`` (skipping 'recommended') and returns e.g. ``'439(g)'`` for
    the first column whose list contains ``project``.

    :param user: value to match against the 'user' column
    :param project: project id to look for
    :return: ``"<project>(<first letter of column>)"``, or None when no column
        other than 'recommended' contains the project
    :raises IndexError: if no row matches ``user``
    """
    user_row = output[output['user'] == user].iloc[0].to_dict()
    for alg, lst in user_row.items():
        if alg == 'recommended' or not isinstance(lst, list):
            continue
        if project in lst:
            return str(int(project)) + f"({alg[0]})"
get_project_explainer('96260f34-2fb3-567d-b3cc-eabb230b449d', 439)

'439(g)'

In [16]:
def get_data():
    """Build a long table with five rows per user, one per ordering method.

    For every row of the module-level ``output`` frame, in order, the methods
    'test', 'recommended', 'leader' (get_optimal_order_list), 'size'
    (get_larger_size_order_list) and 'exp_priority' (get_display_order_list)
    each contribute one row laid out as::

        column 0: position of the user in ``output``
        column 1: user id
        column 2: method name
        column 3+: the projects of that method, in order

    Rows are collected in a plain list and the frame is built once:
    ``DataFrame.append`` no longer exists in pandas 2.x and growing a frame
    row by row is quadratic.  Shorter rows are padded with missing values.
    The position from ``enumerate`` is what is handed to the ordering
    functions, which index ``output`` positionally.

    :return: DataFrame with integer column labels as described above
    """
    rows = []
    for i, (_, row) in enumerate(output.iterrows()):
        user = row['user']
        per_method = [
            ('test', row['test']),
            ('recommended', row['recommended']),
            ('leader', get_optimal_order_list(i)),
            ('size', get_larger_size_order_list(i)),
            ('exp_priority', get_display_order_list(i)),
        ]
        for method, projects in per_method:
            # Copy first: add_method_user_index mutates its argument and the
            # lists stored in `output` must stay untouched.
            rows.append(add_method_user_index(list(projects), method, user, i))
    return pd.DataFrame(rows)

In [None]:
# Replace every project id in the ordering table with its explanation label
# (project id plus the first letter of the algorithm column containing it).
test = get_data()
project_columns = test.columns[3:]  # columns 0-2 hold index, user and method
for row_label, row_value in test.iterrows():
    user = row_value[1]
    for col in project_columns:
        project = row_value[col]
        if pd.isnull(project):
            continue  # padding for rows shorter than the widest one
        test.loc[row_label, col] = get_project_explainer(user, project)
test

In [28]:
test.to_csv('explanations_order_D2_standardized.csv',index=False)

In [124]:
# print(f"kendalltau : {kendalltau([1,2],[2,1])}")
# Scratch experiment: how much does sklearn's ndcg_score drop when two
# neighbouring entries of a ten-item list are swapped?
from sklearn.metrics import ndcg_score
# argsort of the ascending list vs. argsort of the list with its first two entries swapped
print(ndcg_score([np.argsort([1,2,3,4,5,6,7,8,9,10])], [np.argsort([2,1,3,4,5,6,7,8,9,10])],ignore_ties=True))
# same, but with the last two entries swapped
print(ndcg_score([np.argsort([1,2,3,4,5,6,7,8,9,10])], [np.argsort([1,2,3,4,5,6,7,8,10,9])],ignore_ties=True))
# raw values instead of argsort positions, first two entries swapped
print(ndcg_score([np.asarray([1,2,3,4,5,6,7,8,9,10])], [np.asarray([2,1,3,4,5,6,7,8,9,10])],ignore_ties=True))
# raw values with a dominant first entry (100), last two entries swapped
print(ndcg_score([np.asarray([100,2,3,4,5,6,7,8,9,10])], [np.asarray([100,2,3,4,5,6,7,8,10,9])],ignore_ties=True))

# ndcg_score(np.argsort([1,2,3,4,5]), np.argsort([1,2,3,80,75]))

0.999529348181514
0.9854825637977803
0.9996007099471376
0.998939923043521


In [21]:
# Scratch check of how appending a record with unknown keys behaves.
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat; keys that are not existing columns become new columns.
df = pd.DataFrame(columns=['Hybrid', 'size', 'leader','exp_priority'])
pd.concat([df, pd.DataFrame([{'foo':1, 'bar':2}])], ignore_index=True)

Unnamed: 0,0,Hybrid,exp_priority,leader,size
0,1.0,,,,
1,2.0,,,,
2,3.0,,,,
3,4.0,,,,


In [12]:
a = pd.DataFrame()