In [1]:
## Read raw ratings and biased/ unbiased data
import pandas as pd

In [2]:
datadir = '../../data/results/'
modeldir = 'textblob/'
#rawscorefile = 'result_p1_b_.1_.9.csv'
rawscorefile = 'result_p1_u_.5_.5.csv'
file = datadir + modeldir + rawscorefile

In [3]:
# Load data
data = pd.read_csv(file)

nRow, nCol = data.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the raw data file.')

INFO: There are 80 rows and 4 columns in the raw data file.


In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Gender,Sentiment
0,0,my aunt feels angry.,female,-0.5
1,1,my aunt made me feel angry.,female,-0.5
2,2,I made my aunt feel angry.,female,-0.5
3,3,The situation makes my nephew feel angry.,male,-0.5
4,4,my aunt feels amazing.,female,0.6


In [5]:
data['Gender'].value_counts()

female    44
male      36
Name: Gender, dtype: int64

In [6]:
data['Sentiment'].value_counts()

 0.6    40
-0.5    40
Name: Sentiment, dtype: int64

## Trying out Chi-square

In [7]:
## From: https://machinelearningmastery.com/chi-squared-test-for-machine-learning/
# chi-squared test with similar proportions
from scipy.stats import chi2_contingency
from scipy.stats import chi2
# contingency table
table = [[10, 20, 30],
        [6,  9,  17]]
print(table)
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print(expected)
# interpret test-statistic
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

[[10, 20, 30], [6, 9, 17]]
dof=2
[[10.43478261 18.91304348 30.65217391]
 [ 5.56521739 10.08695652 16.34782609]]
probability=0.950, critical=5.991, stat=0.272
Independent (fail to reject H0)
significance=0.050, p=0.873
Independent (fail to reject H0)


In [8]:
# contingency table
table = [[0.160000, 0.042667],
#        [0.080556,  0.025000]]
         [0.9,  0.9000]]
print(table)

[[0.16, 0.042667], [0.9, 0.9]]


In [9]:
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d, p=%0.3f' % (dof, p))
print(expected)
# interpret p-value
prob = 0.95
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

dof=1, p=0.036
[[0.10727046 0.09539654]
 [0.95272954 0.84727046]]
significance=0.050, p=0.036
Dependent (reject H0)


In [10]:
from scipy.stats import chisquare

In [11]:
#stat, p = chisquare([0.1, 0.9], [.9, 0.1])
#stat, p = chisquare([0.160000, 0.042667], [0.02676, 0.2333])
stat, p = chisquare([0.160000, 0.042667], [0.08055, 0.025])
print('stat=%.3f, p=%.3f' % (stat, p))

stat=0.091, p=0.763


In [12]:
# interpret test-statistic
prob = 0.95
alpha = 1.0 - prob
print('significance alpha=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

significance alpha=0.050, p=0.763
Independent (fail to reject H0)


## Try out KL-divergence

In [13]:
import numpy as np

def KL(P,Q):
    # Epsilon is used here to avoid conditional code for
    # checking that neither P nor Q is equal to 0. 
    epsilon = 0.00001

    # You may want to instead make copies to avoid changing the np arrays.
    P = P+epsilon
    Q = Q+epsilon

    divergence = np.sum(P*np.log(P/Q))
    return divergence



In [14]:

# Should be normalized though
values1 = np.asarray([0.16000000000000006, 0.042666666666666714]) # B1
values2 = np.asarray([0.026760563380281734, 0.2333333333333334]) # B2

print (KL(values1, values2))

0.21358560373129193


In [15]:

# Should be normalized though
values1 = np.asarray([0.16000000000000006, 0.042666666666666714]) # B1
# values2 = np.asarray([0.026760563380281734, 0.2333333333333334])
values2 = np.asarray([0.0805555555555556, 0.025000000000000043]) # U

print (KL(values1, values2))

0.13259868962630994


In [16]:
values1 = np.asarray([0.1, 0.9, 0]) # B1
values2 = np.asarray([0.9, 0.1, 0]) # B2

print (KL(values1, values2))

1.757708554708215


In [17]:
values1 = np.asarray([0.1, 0.9, 0]) # B1
values2 = np.asarray([0.5, 0.5, 0]) # U

print (KL(values1, values2))

0.3680539910115618


In [18]:
# Using Scipy - https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html
from scipy.stats import entropy

In [19]:
entropy([0.5, 0.5], qk=[0.9, 0.1])

0.5108256237659907

In [20]:
entropy([0.9, 0.1], qk=[0.5, 0.5])

0.3680642071684971

In [21]:
entropy([0.0805555555555556, 0.025000000000000043], [0.16000000000000006, 0.042666666666666714]) # U with B1

0.0020237453240181218

In [22]:
entropy([0.0805555555555556, 0.025000000000000043], [0.026760563380281734, 0.2333333333333334]) # U with B2

1.2138101918514779