In [1]:
import settings.config as cfg

dataset_folder = cfg.dataset_folder
preprocessed_dataset_folder = cfg.preprocessed_dataset_folder
#min_ratings_per_item = cfg.min_ratings_per_item

In [2]:
import pandas as pd
import pickle

ratings_df = pd.read_csv(dataset_folder+"/ratings.csv")
group_composition = pickle.load(open(dataset_folder+"/group_composition.pkl", "rb"))
display(group_composition)

{1: {'group_size': 4, 'group_members': [26323, 42775, 41651, 32327]},
 10: {'group_size': 4, 'group_members': [99855, 80934, 9488, 86840]},
 11: {'group_size': 4, 'group_members': [99355, 76869, 43978, 30600]},
 12: {'group_size': 4, 'group_members': [89066, 17384, 56579, 62404]},
 13: {'group_size': 4, 'group_members': [66485, 31561, 31976, 53265]},
 14: {'group_size': 2, 'group_members': [96341, 64074]},
 15: {'group_size': 2, 'group_members': [752, 73779]},
 16: {'group_size': 3, 'group_members': [43778, 9371, 71238]},
 17: {'group_size': 4, 'group_members': [58708, 74772, 6990, 29112]},
 18: {'group_size': 4, 'group_members': [87821, 45831, 48507, 66062]},
 2: {'group_size': 2, 'group_members': [12627, 88865]},
 20: {'group_size': 3, 'group_members': [24623, 57710, 48826]},
 22: {'group_size': 3, 'group_members': [49924, 66179, 45608]},
 23: {'group_size': 4, 'group_members': [64219, 87750, 30719, 73790]},
 24: {'group_size': 3, 'group_members': [65793, 45113, 21293]},
 25: {'group

In [3]:
# Check number of ratings per user
rating_per_user_df = ratings_df.groupby(['user']).count().sort_values(by="rating", ascending=False)
display(rating_per_user_df)

# Check number of ratings per item
rating_per_item_df = ratings_df.groupby(['item']).count().sort_values(by="rating", ascending=False)
display(rating_per_item_df)

Unnamed: 0_level_0,item,rating,rank
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
752,10,10,10
66485,10,10,10
70508,10,10,10
70463,10,10,10
70399,10,10,10
...,...,...,...
1829,6,6,6
72828,6,6,6
41962,6,6,6
13715,6,6,6


Unnamed: 0_level_0,user,rating,rank
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D1,282,282,282
D2,282,282,282
D3,282,282,282
D7,282,282,282
D8,282,282,282
D9,282,282,282
D10,270,270,270
D4,270,270,270
D5,270,270,270
D6,270,270,270


In [4]:
# compute group members' preference similarity at the group level
import numpy as np
import ast

def footrule_similarity(user_ratings, gr_composition):
    diversity = pd.DataFrame(columns = ['group_id', 'user_A', 'user_B', 'distance'])
    
    for group_id in gr_composition:
        group = gr_composition[group_id]
        group_size = group['group_size']
        members = group['group_members']
        j = 0
        while(j < group_size-1):
            k = j+1
            while(k <= group_size-1):
                ratings_A = np.array(user_ratings.loc[user_ratings['user'] == members[j], 'rating'])
                ratings_B = np.array(user_ratings.loc[user_ratings['user'] == members[k], 'rating'])
                div = np.sum(np.abs(ratings_A-ratings_B))
                new_row = pd.DataFrame([[group_id, members[j], members[k], div]], 
                                       columns=['group_id', 'user_A', 'user_B', 'distance'])
                diversity = pd.concat([diversity, new_row], axis=0, ignore_index=True)
                k+=1
            j+=1

    sim = diversity
    sim['similarity_score'] = (sim['distance'].max()-sim['distance'])/(sim['distance'].max()-sim['distance'].min())
    sim = sim[['group_id', 'user_A', 'user_B', 'similarity_score']]
    similarity = sim.groupby('group_id')['similarity_score'].mean().reset_index()
    similarity['similarity'] = "similar"
    similarity.loc[similarity.similarity_score < similarity['similarity_score'].median(),'similarity'] = 'divergent'
    display(similarity['similarity_score'].median())
    
    
    return similarity

def add_similarity_to_group_composition(gr_composition,similarity):
    composition = gr_composition
    for group_id in composition:
        group = composition[group_id]
        composition[group_id]['group_similarity'] = similarity.loc[similarity['group_id'] == group_id]['similarity'].values[0]

    return composition


In [5]:
similarity = footrule_similarity(ratings_df, group_composition)
new_gr_composition = add_similarity_to_group_composition(group_composition, similarity)
display(new_gr_composition)


0.5467836257309941

{1: {'group_size': 4,
  'group_members': [26323, 42775, 41651, 32327],
  'group_similarity': 'divergent'},
 10: {'group_size': 4,
  'group_members': [99855, 80934, 9488, 86840],
  'group_similarity': 'divergent'},
 11: {'group_size': 4,
  'group_members': [99355, 76869, 43978, 30600],
  'group_similarity': 'divergent'},
 12: {'group_size': 4,
  'group_members': [89066, 17384, 56579, 62404],
  'group_similarity': 'divergent'},
 13: {'group_size': 4,
  'group_members': [66485, 31561, 31976, 53265],
  'group_similarity': 'similar'},
 14: {'group_size': 2,
  'group_members': [96341, 64074],
  'group_similarity': 'similar'},
 15: {'group_size': 2,
  'group_members': [752, 73779],
  'group_similarity': 'divergent'},
 16: {'group_size': 3,
  'group_members': [43778, 9371, 71238],
  'group_similarity': 'divergent'},
 17: {'group_size': 4,
  'group_members': [58708, 74772, 6990, 29112],
  'group_similarity': 'divergent'},
 18: {'group_size': 4,
  'group_members': [87821, 45831, 48507, 66062],
 

In [6]:
pickle.dump(new_gr_composition, open(preprocessed_dataset_folder+"/group_composition.pkl", "wb"))