In [1]:
## Read raw ratings and biased/unbiased data
import pandas as pd

In [2]:
# Base directory of the result files. Paths are built by plain string
# concatenation (datadir + model + '/' + file), so the trailing slash is required.
datadir = '../../data/results/'

## Collect Raw Results to do Rating Computation

In [3]:
# Bias categories used in result file names: 'b' (biased) and 'u' (unbiased).
# TODO: load these from the bias spec file instead of hard-coding them.
biastypes = ['b', 'u']

# Value pairs that appear in the file names of each category, one pair per file.
u_vals = [['.5', '.5']]
b_vals = [
    ['.1', '.9'],
    ['.9', '.1'],
]

# Word-pair identifiers.
# TODO: load these from a config file.
wp = [
    'p1',
    'p2',
    'p3',
    'p4',
    'p5',
]

# Sentiment models whose results are processed.
models = [
    'textblob',
    'vader',
    'cnn',
    'lstm',
    'gru',
]

In [4]:
# Build the list of result file names expected for one word pair
def getFileNamesForWordPairs(w):
    """Return the result file names for word pair `w`.

    Walks the global `biastypes` in order; for 'u' one name is produced per
    entry of `u_vals`, for 'b' one per entry of `b_vals`. Any other bias
    type contributes nothing. Names have the form
    result_<w>_<type>_<first>_<second>.csv.
    """
    names = []

    for kind in biastypes:
        if kind == 'u':
            value_pairs = u_vals
        elif kind == 'b':
            value_pairs = b_vals
        else:
            # Unknown bias types are skipped
            continue

        for pair in value_pairs:
            names.append('result_' + '_'.join((w, kind, pair[0], pair[1])) + '.csv')

    return names


In [5]:
# Map every word pair to the result file names expected for it
wp_files_dict = {pair: getFileNamesForWordPairs(pair) for pair in wp}
print (wp_files_dict)

{'p1': ['result_p1_b_.1_.9.csv', 'result_p1_b_.9_.1.csv', 'result_p1_u_.5_.5.csv'], 'p2': ['result_p2_b_.1_.9.csv', 'result_p2_b_.9_.1.csv', 'result_p2_u_.5_.5.csv'], 'p3': ['result_p3_b_.1_.9.csv', 'result_p3_b_.9_.1.csv', 'result_p3_u_.5_.5.csv'], 'p4': ['result_p4_b_.1_.9.csv', 'result_p4_b_.9_.1.csv', 'result_p4_u_.5_.5.csv'], 'p5': ['result_p5_b_.1_.9.csv', 'result_p5_b_.9_.1.csv', 'result_p5_u_.5_.5.csv']}


In [6]:
# Development-time helper, kept for reference only
def process_sentiment_freqcount_file_orig(f):
    """Print quick diagnostics for one result file.

    Reads datadir + modeldir + f, drops the 'Unnamed: 0' and 'Subjectivity'
    columns, then prints the Gender counts, the Sentiment counts and the
    per-gender means.

    NOTE(review): `modeldir` is read as a global but is not defined in the
    visible cells; it has to exist before this function is called.
    """
    path = datadir + modeldir + f
    frame = pd.read_csv(path)
    frame = frame.drop(['Unnamed: 0', 'Subjectivity'], axis=1)

    for column in ('Gender', 'Sentiment'):
        print (frame[column].value_counts())

    per_gender = frame.groupby('Gender')
    print (per_gender.mean())

In [7]:
# Get sentiment stats
def process_sentiment_freqcount_file(model, f):
    """Summarise the gender and sentiment statistics of one result CSV.

    Parameters
    ----------
    model : str
        Folder of the model's results, relative to the global `datadir`.
    f : str
        Result file name. A name containing '_b_' is classified as biased
        ('b'); anything else as unbiased ('u').

    Returns
    -------
    list
        [result_type, malecount, femalecount, maleavg, femaleavg,
        sentiment_freq, senti_scores], where
        - malecount / femalecount are the number of rows whose Gender is
          'male' / 'female' (0 when the label is absent),
        - maleavg / femaleavg are the mean of the Sentiment column for those
          rows (0 when the label is absent),
        - sentiment_freq is a list of [value, count] pairs for Sentiment,
        - senti_scores is the raw Sentiment column as a list.

    Rows with any other Gender label are ignored by the male/female
    statistics instead of overwriting them.
    """
    file = datadir + model + '/' + f
    # 'Unnamed: 0' is presumably the index column written when the results
    # were saved -- TODO confirm against the producer of these files.
    data = pd.read_csv(file).drop(['Unnamed: 0'], axis=1)

    # -- For type
    result_type = 'b' if '_b_' in f else 'u'

    # -- For male/ female freq
    gender_counts = data['Gender'].value_counts()
    malecount = int(gender_counts.get('male', 0))
    femalecount = int(gender_counts.get('female', 0))

    # -- For gender-based sentiment stats
    # Average only the Sentiment column: averaging the whole group frame
    # fails on the string Gender column and on any extra numeric column
    # (e.g. Subjectivity).
    gender_means = data.groupby('Gender')['Sentiment'].mean()
    maleavg = float(gender_means['male']) if 'male' in gender_means.index else 0
    femaleavg = float(gender_means['female']) if 'female' in gender_means.index else 0

    # -- For sentiment freq
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    sentiment_freq = [
        [val, count] for val, count in data['Sentiment'].value_counts().items()
    ]

    # -- For returning raw sentiment scores as a list
    senti_scores = data['Sentiment'].tolist()

    return [result_type, malecount, femalecount, maleavg, femaleavg, sentiment_freq, senti_scores]

## Recording Stats in a DataFrame

In [8]:
# Collect the raw statistics of every (model, word pair, file) combination
# for one data type

def calculate_ratings_stats(data_type):
    """Build, store and return the raw-statistics table for `data_type`.

    For every model in `models`, every word pair in `wp` and every file in
    `wp_files_dict`, the file under <model>/<data_type> is summarised with
    process_sentiment_freqcount_file and appended as one row. The table is
    written to "<data_type>-stats-analysis.csv" via store_result_dataframe
    and then returned.
    """
    column_names = [
        'Model', 'WordPairs', 'Filename', 'Type',
        'MaleCount', 'FemaleCount',
        'MaleAvSentim', 'FemaleAvSentim',
        'SentiFreq', 'RawSentiScores',
    ]
    stats = pd.DataFrame(columns = column_names)

    for model_name in models:
        for phrase in wp:
            for result_file in wp_files_dict[phrase]:
                # Row = identifying prefix + the summary list of the file
                file_summary = process_sentiment_freqcount_file(model_name + "/" + data_type, result_file)
                stats.loc[len(stats)] = [model_name, phrase, result_file] + file_summary

    # Persist the table before handing it back to the caller
    store_result_dataframe (stats, data_type + "-" + "stats-analysis.csv")

    return stats

## Create Rating File to Store Results

In [9]:
ratingdir = '../../data/results/rating'

'''Check if directory exists, if not, create it'''
import os

# Remember whether the rating folder was already present
check_folder = os.path.isdir(ratingdir)

# Report an existing folder; otherwise create it (including missing parents)
if check_folder:
    print(ratingdir, "folder already exists.")
else:
    os.makedirs(ratingdir)
    print("created folder : ", ratingdir)

../../data/results/rating folder already exists.


In [10]:
# Persist a dataframe as CSV inside the rating directory
def store_result_dataframe (df, filename):
    """Write `df` to <ratingdir>/<filename> using the to_csv defaults."""
    df.to_csv('/'.join([ratingdir, filename]))

## Calculate Rating

In [11]:
# Compare via KL-divergence
from scipy.stats import entropy

In [12]:
# Compare list a and b using entropy. The difference has to be more than
# threshold for the answer to be different
def are_distribs_different(a, b, threshold=0.3):
    """Tell whether distribution `a` diverges from `b` by more than `threshold`.

    Parameters
    ----------
    a, b : sequence of numbers
        The two distributions; scipy's entropy normalises each to sum to 1.
    threshold : float, optional
        Minimum relative entropy (KL divergence of `a` from `b`) for the
        two to count as different. Defaults to 0.3.

    Returns
    -------
    bool
        True when entropy(a, b) is strictly greater than `threshold`.

    Notes
    -----
    The measure is not symmetric: entropy(a, b) and entropy(b, a) can fall
    on different sides of the threshold. A NaN result (e.g. from inputs
    that cannot be normalised) compares as not different.
    """
    divergence = entropy(a, b)

    # bool() keeps the plain Python True/False the callers have always received
    return bool(divergence > threshold)

In [13]:
# Rate one (model, word pair) slice by comparing its unbiased and biased rows
def calculate_rating(subset):
    """Return 'BS', 'UCS' or 'DSBS' for the rows in `subset`.

    Rows whose Type is 'b' are treated as biased results, all others as
    unbiased. Each row is reduced to [MaleAvSentim, FemaleAvSentim].

    Step 1: every unbiased row is compared against every biased row with
            are_distribs_different(unbiased, biased); the first difference
            yields 'BS'.
    Step 2: otherwise every biased row is compared against every unbiased
            row with the arguments swapped; the first difference yields
            'UCS'.
    If neither step finds a difference the result is 'DSBS'.
    """
    all_rows = [row for _, row in subset.iterrows()]
    biased_rows = [row for row in all_rows if row['Type'] == 'b']
    unbiased_rows = [row for row in all_rows if row['Type'] != 'b']

    def sentiment_pair(row):
        # The two averages that get compared between rows
        return [row['MaleAvSentim'], row['FemaleAvSentim']]

    # Step 1: unbiased versus biased
    step1_differs = any(
        are_distribs_different(sentiment_pair(unbiased), sentiment_pair(biased))
        for unbiased in unbiased_rows
        for biased in biased_rows
    )
    if step1_differs:
        return 'BS'

    # Step 2: biased versus unbiased (argument order reversed)
    step2_differs = any(
        are_distribs_different(sentiment_pair(biased), sentiment_pair(unbiased))
        for biased in biased_rows
        for unbiased in unbiased_rows
    )
    if step2_differs:
        return 'UCS'

    return 'DSBS'
    

In [14]:

# Do the calculation of ratings from stats calculated

def calculate_final_rating (data_type, stats):
    """Rate every model per word pair and overall, store and return the table.

    Parameters
    ----------
    data_type : str
        Prefix of the output file, written as "<data_type>-rating.csv".
    stats : pandas.DataFrame
        Raw statistics with at least the 'Model' and 'WordPairs' columns
        plus whatever calculate_rating reads from each slice.

    Returns
    -------
    pandas.DataFrame
        Columns 'Model', 'WordPairs', 'Rating': one row per (model, word
        pair) followed by an 'Overall' row per model holding the worst
        rating seen for that model. (Earlier versions returned nothing;
        the table is still written via store_result_dataframe.)
    """
    # Ratings from best to worst; the overall rating starts at the best one
    # and only ever moves towards the worst.
    severity = {'UCS': 0, 'DSBS': 1, 'BS': 2}

    rows = []
    for m in models: # For each sentiment model
        worst_rating = 'UCS'
        for w in wp: # For each word phrase
            # Slice of the stats belonging to this model and word pair
            mwp_subset = stats[(stats['Model'] == m) & (stats['WordPairs'] == w)]

            rating_value = calculate_rating(mwp_subset)

            # Unknown rating values never displace the current worst rating
            if severity.get(rating_value, -1) > severity[worst_rating]:
                worst_rating = rating_value

            rows.append([m, w, rating_value])

        # Overall rating
        rows.append([m, 'Overall', worst_rating])

    # Build the frame once instead of growing it row by row
    rating = pd.DataFrame(rows, columns = ['Model', 'WordPairs', 'Rating'])

    # Store rating in file
    store_result_dataframe(rating, data_type + "-rating.csv")

    return rating

## Run the Actual Calculation

In [15]:
# Main routine for each data_type

# Each data type is passed on as the sub-folder name appended to every model
# ("<model>/<data_type>") and as the prefix of the output file names.
data_types = ['nonames', 'withnames']
for d in data_types:
    # Collects the raw statistics, stores them as "<d>-stats-analysis.csv"
    # and returns the resulting dataframe
    stats = calculate_ratings_stats(d)
    # Derives the per-word-pair and overall ratings from those statistics and
    # stores them as "<d>-rating.csv"
    calculate_final_rating(d, stats)
    