# SA validation - imdb (2021-12-10)

_by A. Maurits van der Veen_  
_modification history:_  
_2018-11-01 - Initial clean-up; prep for bootstrapping_  
_2020-06-10 - Update to include negation processing_  
_2021-12-01 - Compare newer versions of lexica to older ones_  

Tests of MLS sentiment analysis approach against the imdb 'gold standard' corpus.
Some pathnames may need to be adjusted to make this work, but all the files are available in the MLS Github repository.

### 1. Setup


In [16]:
# Root folder of the replication materials; adjust to the local setup
projectfolder = '/Users/xxx/Replication/'
# Common path stem of the imdb corpus files; a suffix and extension are appended where it is used
corpusfilestem = projectfolder + 'IMDB corpus (table 3-5 & S7)/imdb'  # extension: '_valencedata.csv'

In [2]:
import sys

# Extend the module search path with the project's code folder.
# This must run before the sentiment_eval import below
# (presumably that module lives in this folder -- confirm against the repository layout).
sys.path.append(projectfolder + 'Code')  
import sentiment_eval

from datetime import datetime
import pandas as pd
import numpy as np
import random

# Print summary version info (for fuller info, simply print sys.version)
print('You are using python version {}.'.format(sys.version.split()[0]))

You are using python version 3.10.12.


In [3]:
def load_scaler_fromcsv(filename, featurenames=(), includevar=False, displayinfo=True):
    """Load a calibration scaler from a CSV text file.

    The file is expected to contain:
    - a header row, containing scaler name, number of observations seen,
      number of features (N), standard deviation adjustment, and descriptor string
    - N rows of feature information, containing name, mean, standard deviation,
      and variance (the variance column is read only if includevar is True)

    Parameters:
    filename     -- path of the CSV file to read
    featurenames -- names of the features to load; if empty, every feature in the file is loaded
    includevar   -- if True, take each feature's variance from the fourth column;
                    if False, var_ is derived by squaring the standard deviations
    displayinfo  -- if True, print information about the contents of the scaler before returning

    Returns a 6-tuple:
    - sklearn StandardScaler object (n_samples_seen_, mean_, scale_ and var_ set from the file)
    - list of the names of the features used, in file order
    - number of features used
    - number of features available (as stated in the header row)
    - standard deviation adjustment
    - descriptor

    Blank rows in the file are skipped.

    Note that the standard deviation adjustment is valid only if all features are used
    (i.e. number of features used == number of features available)
    """
    import csv
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    # Read data from file (newline='' lets the csv module handle line endings itself)
    with open(filename, 'r', encoding='utf-8', errors='ignore', newline='') as scalerfile:
        scalercsv = csv.reader(scalerfile)

        # Read & parse header (field 0, the scaler name, is not needed here)
        headerrow = next(scalercsv)
        nrsamples = int(headerrow[1])
        nrfeatures = int(headerrow[2])
        stdev_adj = float(headerrow[3])
        descriptor = headerrow[4]

        # Read & parse individual features
        featuresused, means, stdevs, variances = [], [], [], []
        for row in scalercsv:
            if not row:
                # Blank line (e.g. trailing newline at end of file): nothing to parse
                continue
            if len(featurenames) > 0 and row[0] not in featurenames:
                # A selection was requested and this feature is not part of it
                continue
            featuresused.append(row[0])
            means.append(float(row[1]))
            stdevs.append(float(row[2]))
            if includevar:
                variances.append(float(row[3]))
    nrfeaturesused = len(featuresused)

    # Initialize scaler
    newscaler = StandardScaler()
    newscaler.n_samples_seen_ = nrsamples
    newscaler.mean_ = np.array(means)
    newscaler.scale_ = np.array(stdevs)
    if includevar:
        newscaler.var_ = np.array(variances)
    else:
        # No variances were read; derive them from the standard deviations so that
        # var_ has the same length as mean_ and scale_ rather than being empty
        newscaler.var_ = np.square(newscaler.scale_)

    if displayinfo:
        print("Descriptor:", descriptor)
        print("Lexica used ({}): {}".format(nrfeaturesused, featuresused))
        print("Means:", newscaler.mean_)
        print("Std. devs.:", newscaler.scale_)
        print("Std. dev. of average across lexica (calculated using {} lexica): {}".format(nrfeatures, stdev_adj))

    return newscaler, featuresused, nrfeaturesused, nrfeatures, stdev_adj, descriptor


In [19]:
# Identify negaters
#
# Our modification/intensification approach derives from SO-CAL, whose standard list is:
#   ["not", "no", "n't", "neither", "nor", "nothing", "never", "none",
#    "lack", "lacked", "lacking", "lacks", "missing", "without", "absence", "devoid"]
#
# Differences in our standard list:
# - "n't" is left out, since text preprocessing already handles it
# - 3 'no...' words that are also negating in effect are added
# - 'absence_of', 'devoid_of' and 'lack_of' have been added to our modifier dictionary instead
negaters = (
    # core negaters
    'not', 'no', 'neither', 'nor', 'nothing', 'never', 'none',
    # additional 'no...' words
    'nowhere', 'noone', 'nobody',
    # words signalling that something is missing
    'lack', 'lacked', 'lacking', 'lacks', 'missing', 'without',
)


In [15]:
# Calibration file pathnames (include extension)

# Folder with the MultiLexScaled materials, and its subfolder holding the scaler (calibration) files
SAfolder = projectfolder + 'Github/MultiLexScaled/'
calibrationfolder = SAfolder + 'Scalers/'

# One calibration file per reference corpus: US, UK, and the two combined
calibrationfile_US = calibrationfolder + 'Calibration_US_2021-12-10.csv'
calibrationfile_UK = calibrationfolder + 'Calibration_UK_2021-12-10.csv'
calibrationfile_USUK = calibrationfolder + 'Calibration_USUK_2021-12-10.csv'


In [5]:
# Specify lexica: map each lexicon's name to the path of its csv file

lexica = {
    'HuLiu': SAfolder + 'HuLiu/opinion-lexicon-English/HuLiu_lexiconX.csv',
    'LabMT_filtered': SAfolder + 'labMT/labMT_lexicon_filtered.csv',
    'LexicoderSD': SAfolder + 'Lexicoder/LSDaug2015/LSD_lexiconX.csv',
    'MPQA': SAfolder + 'MPQA 2.0/opinionfinderv2.0/lexicons/MPQA_lexicon.csv',
    'NRC': SAfolder + 'NRC/NRC-Emotion-Lexicon-v0.92/NRC_lexicon.csv',
    'SOCAL': SAfolder + 'SO-CAL/English (from GitHub)/SO-CAL_lexiconX.csv',
    'SWN_filtered': SAfolder + 'SWN/SWN_lexicon_filtered0.1.csv',
    'WordStat': SAfolder + 'WordStat/WSD 2.0/WordStat_lexicon2X.csv',
}

# Lexicon names in sorted order (iterating a dict yields its keys)
lexnames = sorted(lexica)


In [14]:
# Specify means & standard deviation data used in calibration, by extracting from the calibration files.
# For each of the US, UK and USUK calibration files: load the scaler (which prints its own summary),
# keep its means and standard deviations under short names, and echo them with a labelled header.

(neutralscaler_US, featurenames, nrfeatures, nravailable,
 stdev_adj_US, descriptor) = load_scaler_fromcsv(calibrationfile_US, includevar=True, displayinfo=True)
USm, USs = neutralscaler_US.mean_, neutralscaler_US.scale_
print('\nUS means:', USm)
print('US stdevs:', USs)
print('US final stdev:', stdev_adj_US, '\n')

(neutralscaler_UK, featurenames, nrfeatures, nravailable,
 stdev_adj_UK, descriptor) = load_scaler_fromcsv(calibrationfile_UK, includevar=True, displayinfo=True)
UKm, UKs = neutralscaler_UK.mean_, neutralscaler_UK.scale_
print('\nUK means:', UKm)
print('UK stdevs:', UKs)
print('UK final stdev:', stdev_adj_UK, '\n')

(neutralscaler_USUK, featurenames, nrfeatures, nravailable,
 stdev_adj_USUK, descriptor) = load_scaler_fromcsv(calibrationfile_USUK, includevar=True, displayinfo=True)
USUKm, USUKs = neutralscaler_USUK.mean_, neutralscaler_USUK.scale_
print('\nUSUK means:', USUKm)
print('USUK stdevs:', USUKs)
print('USUK final stdev:', stdev_adj_USUK, '\n')


Descriptor: New scaler for US based on 48283 texts. Generated: 2021-12-10 11:12:39.832747
Lexica used (8): ['HuLiu', 'LabMT_filtered', 'LexicoderSD', 'MPQA', 'NRC', 'SOCAL', 'SWN_filtered', 'WordStat']
Means: [ 3.33755708e-03  1.89320002e-01  1.21356846e-02  7.69971106e-03
  2.26526518e-02  3.71012146e-02  1.27342660e-03 -1.64595405e-04]
Std. devs.: [0.01931773 0.08943294 0.02625014 0.01699521 0.02416683 0.05037926
 0.00624028 0.03290689]
Std. dev. of average across lexica (calculated using 8 lexica): 0.8486667938382454

US means: [ 3.33755708e-03  1.89320002e-01  1.21356846e-02  7.69971106e-03
  2.26526518e-02  3.71012146e-02  1.27342660e-03 -1.64595405e-04]
US stdevs: [0.01931773 0.08943294 0.02625014 0.01699521 0.02416683 0.05037926
 0.00624028 0.03290689]
US final stdev: 0.8486667938382454 

Descriptor: New scaler for UK based on 59404 texts. Generated: 2021-12-10 21:21:26.618950
Lexica used (8): ['HuLiu', 'LabMT_filtered', 'LexicoderSD', 'MPQA', 'NRC', 'SOCAL', 'SWN_filtered', 'Wo

### 2. Performance analysis: imdb

We have calculated valence using MLS for 4 different conditions: no modifiers, negators only, intensifiers only, and 'standard' (both negators and intensifiers). This allows us to assess how the conditions perform relative to one another.


In [17]:
# Path of the imdb valence data file: the corpus file stem plus the '_valencedata.csv' suffix
valencefile_combined = corpusfilestem + '_valencedata.csv'


#### 2.1 Assess performance of individual lexica & different scalers

Measure classification performance using cuts at:
- 0
- optimal for this lexicon (& given neg/mod setting)
- sets of specified cut-points 
  (US 1996-2015, UK translated 1996-2015, USUK)
  
Compare results for the translated & untranslated imdb data.


In [20]:
# Different SA parameters
# One tuple per condition: (name suffix, flag, negaters to use).
# NOTE(review): the flag presumably switches intensifier/modifier handling on or off
# (it is False for '_negonly' and '_none') -- confirm against the code that produced the valence file.
# An empty tuple in the last position means no negaters are applied.
jobspeclist = [('', True, negaters),
               ('_negonly', False, negaters),
               ('_modsonly', True, ()),
               ('_none', False, ())]


#### This output is for tables 3 & 4 in the paper, plus S7 in the supplementary info.

Individual lexica in the first table; fourth, fifth, and first data columns for our calibration, corpus calibration, and optimal calibration, respectively.

For overall performance, use the USUK averaging. Note that using US averaging gets marginally better results -- perhaps because most of the movie reviews were written by Americans?

For performance in sets of articles (table 3), use the USUK averaging too.

For the appendix, look at the job specs `negonly` and `none`.

In [21]:
nrsetups = len(jobspeclist)
nrlex = len(lexnames)

# Calibration sets to evaluate: (label, means, standard deviations)
supplied_data = [
    ('US', USm, USs),
    ('UK', UKm, UKs),
    ('USUK', USUKm, USUKs),
]

# Average across all lexica
lexcols2include = list(range(nrlex))

for counter, jobspec in enumerate(jobspeclist):
    print('\n********* Working on jobspec:', jobspec)

    # Each job spec has its own run of nrlex valence columns, starting at column 6
    firstcol = 6 + counter * nrlex
    # Bootstrapping is requested for the first job spec only
    dobootstrap = counter == 0

    sentiment_eval.assess_perf(
        valencefile_combined,
        supplied_data,
        nrlex=nrlex,
        firstvalencecol=firstcol,
        valcol=1,
        lengthcol=5,
        ratingscol=3,
        calc_avgs=(lexcols2include,),
        addcorpusmeans=True,
        calc_optimal=True,
        lexperf=True,
        bootstrap=dobootstrap,
    )


********* Working on jobspec: ('', True, ('not', 'no', 'neither', 'nor', 'nothing', 'never', 'none', 'nowhere', 'noone', 'nobody', 'lack', 'lacked', 'lacking', 'lacks', 'missing', 'without'))
Working with 25000 negative and 25000 positive texts.

Results for selected data
By measure, showing performance (cut-point), in order: optimal, US, UK, USUK, corpus
percent correct                    HuLiu: 74.33(0.000); 73.99(0.003); 74.01(0.003); 74.00(0.003); 73.94(0.002)
percent correct           LabMT_filtered: 69.59(0.221); 68.10(0.189); 67.84(0.187); 67.97(0.188); 69.55(0.219)
percent correct              LexicoderSD: 72.76(0.010); 72.57(0.012); 72.67(0.011); 72.61(0.012); 72.59(0.008)
percent correct                     MPQA: 71.23(0.011); 71.08(0.008); 71.09(0.008); 71.10(0.008); 71.14(0.009)
percent correct                      NRC: 69.72(0.013); 68.39(0.023); 69.15(0.020); 68.82(0.021); 69.62(0.012)
percent correct                    SOCAL: 78.25(0.033); 78.16(0.037); 78.13(0.038); 78


percent correct - cut (0): 75.04(0.000); cut (mean): 75.11(0.071); cut (opt): 75.17(0.046)

Correctly classified items:
Label 1 & +ve: nr 18666, mean 1.53, max 10.42, stdev 1.19, rating 8.9, length 216.42
Label 0 & -ve: nr 18856, mean -1.31, min -14.39, stdev 1.00, rating 2.1, length 237.78
Incorrectly classified items:
Label 0 but +ve: nr 6144, mean 0.73, max 5.62, stdev 0.67, rating 2.6, length 210.98
Label 1 but -ve: nr 6334, mean -0.76, min -5.07, stdev 0.67, rating 8.5, length 286.04

*** Averaging using set of means: UK ***

Averaging across lexica:
['HuLiu', 'LabMT_filtered', 'LexicoderSD', 'MPQA', 'NRC', 'SOCAL', 'SWN_filtered', 'WordStat']

percent correct - cut (0): 74.99(0.000); cut (mean): 75.09(0.087); cut (opt): 75.12(0.095)

Correctly classified items:
Label 1 & +ve: nr 18889, mean 1.28, max 8.63, stdev 0.99, rating 8.8, length 217.24
Label 0 & -ve: nr 18606, mean -1.07, min -11.84, stdev 0.83, rating 2.1, length 237.53
Incorrectly classified items:
Label 0 but +ve: nr 

### 3. Assess performance in distinguishing within positive & negative rankings

The ranking by the review writer is a value from 1 through 10. The imdb corpus includes 1-4 and 7-10. Calculate the mean and standard deviation of the valence for each ranking value. The means should be distinct if our system works well.

#### This output is for table 5 in the paper

In [23]:
# Load the per-review rating and valence data
valencefile = corpusfilestem + '_rankandvalence.csv'
# Names of the columns holding the reviewer's rating and the calculated valence
rankingvar = 'rating'
valencevar = 'valence'

# Read into pandas dataframe (all columns in the file are loaded; no column selection is applied)
imdb_df = pd.read_csv(valencefile)


In [24]:
# Preview the first rows of the loaded data
imdb_df.head()

Unnamed: 0,id,polarity,movie_id,rating,testset,valence
0,1,0,0,2,1,-0.638066
1,2,0,10000,4,1,-0.028516
2,3,0,10001,1,1,-1.777879
3,4,0,10002,3,1,-1.445579
4,5,0,10003,3,1,0.748028


In [25]:
# Averages by ranking
# For each rating value, report the number of reviews plus the mean and standard deviation
# of their valence, and keep the valence values per rating in groupvalence.
groupvalence = {}

for groupval, groupdata in imdb_df.groupby(rankingvar):
    # Use the configured valence column name rather than a hard-coded one,
    # in line with the use of rankingvar for the grouping column
    valences = groupdata[valencevar]
    print('Rating: {}; n = {}; mean valence: {:4.2f}; std.dev: {:4.2f}'.format(
        groupval, len(groupdata), valences.mean(), valences.std()))
    groupvalence[groupval] = valences

Rating: 1; n = 10122; mean valence: -1.11; std.dev: 1.11
Rating: 2; n = 4586; mean valence: -0.88; std.dev: 1.08
Rating: 3; n = 4961; mean valence: -0.71; std.dev: 1.04
Rating: 4; n = 5331; mean valence: -0.55; std.dev: 1.04
Rating: 7; n = 4803; mean valence: 0.27; std.dev: 1.15
Rating: 8; n = 5859; mean valence: 0.47; std.dev: 1.22
Rating: 9; n = 4607; mean valence: 0.58; std.dev: 1.23
Rating: 10; n = 9731; mean valence: 0.73; std.dev: 1.30


In [26]:
import scipy as sp
import scipy.stats  # load the submodule explicitly: `import scipy` alone does not guarantee sp.stats exists

# Compare the valence distributions of adjacent rating values with a two-sample t-test;
# equal_var=False means equal variances are not assumed for the two groups.
adjacentpairs = [(1,2), (2,3), (3,4), (7,8), (8,9), (9,10)]
for lower, higher in adjacentpairs:
    tstat, pvalue = sp.stats.ttest_ind(groupvalence[lower], groupvalence[higher], equal_var=False)
    print('From {} to {}: t-stat = {:5.2f}; p-value = {:.7f}'.format(lower, higher, tstat, pvalue))

From 1 to 2: t-stat = -11.87; p-value = 0.0000000
From 2 to 3: t-stat = -7.81; p-value = 0.0000000
From 3 to 4: t-stat = -7.91; p-value = 0.0000000
From 7 to 8: t-stat = -8.61; p-value = 0.0000000
From 8 to 9: t-stat = -4.67; p-value = 0.0000031
From 9 to 10: t-stat = -6.35; p-value = 0.0000000
