In [109]:
import openai
from openai.embeddings_utils import cosine_similarity
import pandas as pd
import numpy as np
import os
from __future__ import division
from scipy.stats import ttest_ind
import itertools

In [110]:
def get_simtoavg_list(df):
    embeddings = df['embedding'].apply(lambda x: np.array(x))
    embeddings = embeddings.to_numpy()
    average_embedding = np.mean(embeddings, axis=0)
        
    sim_to_avgs = []
    for e in embeddings:
        sim_to_avgs.append(cosine_similarity(e, average_embedding))
    
    return sim_to_avgs


def get_simtoavg_comp(df, revID1, revID2):
    
    df1 = df[(df['reviewerID']==revID1)]
    df2 = df[(df['reviewerID']==revID2)]

    simtoavgs1 = get_simtoavg_list(df1)
    simtoavgs2 = get_simtoavg_list(df2)

    tstat = ttest_ind(simtoavgs1, simtoavgs2)
    print('Similarity comparison between the two chosen ReviewerIDs:')
    print(tstat)


In [111]:
# pairwise similarity comparison

def get_pairwise_sim_list(df):
    
    embeddings = df['embedding'].apply(lambda x: np.array(x))
    embeddings = embeddings.to_numpy()
    
    pairs = list(itertools.combinations(embeddings, 2))
    
    pair_sims = []
    for p in pairs:
        pair_sims.append(cosine_similarity(p[0], p[1]))
        
    return pair_sims    

    
def get_pairwise_similarity_comp(df, revID1, revID2):
    
    df1 = df[(df['reviewerID']==revID1)]
    df2 = df[(df['reviewerID']==revID2)]

    pairsims1 = get_pairwise_sim_list(df1)
    pairsims2 = get_pairwise_sim_list(df2)

    tstat = ttest_ind(pairsims1, pairsims2)
    print('Similarity comparison between the two chosen ReviewerIDs:')
    print(tstat)


In [121]:
def get_ttest(df, comparison_type, revID1, revID2):
    
    if comparison_type == 'pairwise':
        get_pairwise_similarity_comp(df, revID1, revID2)
        
    if comparison_type == 'sim_to_avg':
        get_simtoavg_comp(df, revID1, revID2)

In [122]:
df = pd.read_pickle('practical_top50reviewIDs_embeddings.pkl')
comparison_type = 'sim_to_avg'
revID1 = 'B00178630A'
revID2 = 'B000W20LKK'

In [123]:
get_ttest(df, comparison_type, ratings_groups, revID1, revID2)

Ratings group 1 comparison between two products:
TtestResult(statistic=-1.1034053005359343, pvalue=0.27535243173073143, df=48.0)
Ratings group 2 comparison between two products:
TtestResult(statistic=2.920427420377149, pvalue=0.005311476823492506, df=48.0)


In [None]:
def pairwisesim_rating_bar(df):
    
    revID_review_counts = df['reviewerID'].value_counts()
    sorted_reviewers = revID_review_counts.sort_values(ascending=False).index
    top_10_reviewers = sorted_reviewers[:10]
    
    filtered_df = df[df['reviewerID'].isin(top_10_reviewers)]

    
    reviewID_pairsims = []
    
    for revID in top_10_reviewers:
        revID_df = filtered_df[(filtered_df['reviewerID']==revID)]
        pairsims = get_pairwise_sim_list(revID_df)
        reviewID_pairsims.append(np.mean(pairsims))
    
    plt_df = pd.DataFrame({'RevID':top_10_reviewers, 'val':reviewID_pairsims})
    
    title = 'Top 10 Reviewers, Pairwise Similarity'
    ax = plt_df.plot.bar(x='RevID', y='val', title=title, rot=0, legend=False)
    ax.bar_label(ax.containers[0])
    plt.xlabel('Reviewer ID')
    plt.ylabel('Mean Cosine Similarity')  

    plt.savefig('pairwise_sim_top10RevIDs.png')
    
    plt.show()

    

df = pd.read_pickle('practical_top50reviewIDs_embeddings.pkl')
pairwisesim_rating_bar(df)