# EMO: PREPARE DATA
## The Association for Computational Linguistics
## WASSA 2023 Shared Task on Empathy Emotion and Personality Detection in Interactions
More details [here](https://codalab.lisn.upsaclay.fr/competitions/11167#learn_the_details)

In [1]:
import numpy as np
import pandas as pd
import re, os
import ftfy
import pycld2 as cld2
import time
from typing import List
from copy import deepcopy
import spacy
import pkg_resources
from symspellpy import SymSpell, Verbosity

import matplotlib.pyplot as plt
%matplotlib inline
# let displayed dataframes show up to 100 columns and 400 rows before pandas truncates them
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)
#os.path.join()

2023-04-11 17:17:03.887593: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# two or more consecutive whitespace characters (spaces, tabs, newlines, ...);
# raw string, so that `\s` is not read as an (invalid) string escape sequence
multi_spaces = re.compile(r'\s{2,}')

def clean_text(s):
    '''
        Normalise a raw text string.
        Non-string values (e.g. NaN) are returned unchanged.
        Steps: replace the characters '�' and '•' with a space, repair the text
        with ftfy.fix_text, collapse every run of 2+ whitespace characters into
        a single space and strip leading / trailing whitespace.
    '''
    if not isinstance(s, str):
        return s
    for char in ['�', '•']:
        s = s.replace(char, ' ')          # replace() is already a no-op when char is absent
    s = ftfy.fix_text(s)
    s = multi_spaces.sub(' ', s)
    return s.strip()

In [3]:
def detect_lang( t ):
    '''
        Detect the dominant language of string t.
        The text is first repaired with ftfy.fix_text and then passed to cld2.detect;
        only the first field of the first entry of the returned details is handed back
        (presumably the name of the top-ranked language - confirm against the pycld2 docs:
        https://pypi.org/project/pycld2/).
        Results are less certain for strings that are too short.
    '''
    repaired = ftfy.fix_text( t )
    _is_reliable, _n_bytes, ranked = cld2.detect( repaired )
    best_guess = ranked[0]
    return best_guess[0]

In [4]:
def get_target(emotions: List[str])->List[int]:
    '''
        Convert a list of category names into a multi-hot list of 0s and 1s
        with one slot per category in label2key (8 categories in this notebook);
        1 in the i-th position means that this essay belongs to the i-th category as in key2label[i].
        Raises KeyError for a name that is not a key of label2key.
    '''
    idxs = [label2key[e] for e in emotions]      # look everything up first: fail before building the vector
    res  = [0]*len(label2key)                    # sized from the mapping instead of a hard-coded 8
    for idx in idxs:
        res[idx] = 1
    return res

In [5]:
# target variables
# category name -> class index; the names are listed alphabetically and numbered 0..7
label2key = dict(zip(
    ['Anger', 'Disgust', 'Fear', 'Hope', 'Joy', 'Neutral', 'Sadness', 'Surprise'],
    range(8),
))
# reverse lookup: class index -> category name
key2label = dict(zip(label2key.values(), label2key.keys()))
print(key2label)

{0: 'Anger', 1: 'Disgust', 2: 'Fear', 3: 'Hope', 4: 'Joy', 5: 'Neutral', 6: 'Sadness', 7: 'Surprise'}


In [6]:
# new new version (Dec 2022)
def upsample_all( df_, labels_col='target', random_state=47 ):
    '''
        Upsample each class in column labels_col of pandas dataframe df_
        to the number of data points in the majority class.

        For every class the missing rows (max_len - class size) are drawn from
        that class itself: without replacement when the class has at least that
        many rows, with replacement otherwise.
        The result is shuffled and gets a fresh 0..n-1 index; df_ is not modified.
        Rows whose label is NaN never match a class and are therefore left out.
        An empty df_ yields an empty dataframe.
    '''
    # one sub-dataframe per class, in order of first appearance
    labels  = df_[labels_col].unique()
    dframes = [ df_[ df_[labels_col] == label ] for label in labels ]
    if not dframes:
        return df_.reset_index(drop=True)

    max_len = max( len(frame) for frame in dframes )

    upsampled = []
    for frame in dframes:
        n_missing = max_len - len(frame)                                         # how many rows to add
        # the majority class (nothing missing) and empty groups (nothing to draw from) are only shuffled
        if n_missing > 0 and len(frame) > 0:
            replace = len(frame) < n_missing                                     # not enough data points
            extra   = frame.sample( n_missing, replace=replace, random_state=random_state )
            frame   = pd.concat( [frame, extra] )                                # df len + (max_len-df len)
        upsampled.append( frame.sample( frac=1, random_state=random_state ) )    # shuffle

    # combine and reshuffle
    df_merged = pd.concat( upsampled )
    df_merged = df_merged.sample( frac=1, random_state=random_state ).reset_index(drop=True)

    return df_merged

In [7]:
# seed for reproducible sampling (same value as the default random_state of upsample_all)
# NOTE(review): not referenced again in the visible part of the notebook - confirm it is still needed
random_state = 47

## LOAD DATA
### a) Training set

In [14]:
# project files
# NOTE: later cells pick files by position - files[0] train essays, files[1] dev essays,
#       files[4] articles, files[6] dev gold labels - so do not reorder this list
wdir  = 'data'
files = [ 'WASSA23_essay_level_with_labels_train.tsv', 'WASSA23_essay_level_dev.tsv',
          'WASSA23_conv_level_with_labels_train.tsv', 'WASSA23_conv_level_dev.tsv',
          'articles_adobe_AMT.csv', 'goldstandard_CONV_dev.tsv', 'goldstandard_dev.tsv', ]

# only columns of interest
# put similar columns together
# ('essay_clean' is not read from the file - it is computed right after loading)
cols_train = [ 'article_id', 'conversation_id', 'speaker_number', 'essay_id', 'speaker_id',
               'essay', 'essay_clean', 'split', 'gender', 'education', 'race', 'age', 'income', 'emotion',
             ]

In [18]:
# load the essay-level training set (tab-separated) and add the cleaned essay text
df_train = pd.read_csv( os.path.join(wdir, files[0]), sep='\t' )
df_train['essay_clean'] = df_train['essay'].apply(clean_text)
# labels are '/'-separated strings; turn each one into a list of category names
df_train['emotion'] = df_train['emotion'].apply( lambda x: x.split('/'))
df_train = df_train[cols_train]

# quick sanity report: size, column types, missing values
print(df_train.shape, '\n')
print(df_train.dtypes, '\n')
print(df_train.isna().sum(), '\n')

print('\nTotal essays: ', df_train.shape[0])
print('Unique essays:', len(df_train['essay_clean'].unique()))
df_train.head(25)

(792, 14) 

article_id          int64
conversation_id     int64
speaker_number      int64
essay_id            int64
speaker_id          int64
essay              object
essay_clean        object
split              object
gender             object
education          object
race               object
age                object
income             object
emotion            object
dtype: object 

article_id         0
conversation_id    0
speaker_number     0
essay_id           0
speaker_id         0
essay              0
essay_clean        0
split              0
gender             0
education          0
race               0
age                0
income             0
emotion            0
dtype: int64 n

Total essays:  792
Unique essays: 792


Unnamed: 0,article_id,conversation_id,speaker_number,essay_id,speaker_id,essay,essay_clean,split,gender,education,race,age,income,emotion
0,35,2,1,1,30,It breaks my heart to see people living in tho...,It breaks my heart to see people living in tho...,train,1,6,3,37,40000,"[Hope, Sadness]"
1,35,3,1,2,19,I wonder why there aren't more people trying t...,I wonder why there aren't more people trying t...,train,1,6,2,32,35000,[Anger]
2,35,5,1,4,17,"After reading the article, you can't help but ...","After reading the article, you can't help but ...",train,1,6,1,29,85000,[Sadness]
3,213,6,1,5,16,It is so sad that someone who had such an amaz...,It is so sad that someone who had such an amaz...,train,2,5,1,28,50000,[Sadness]
4,213,8,1,7,30,"From reading the article, it looks like the wo...","From reading the article, it looks like the wo...",train,1,6,3,37,40000,[Neutral]
5,213,10,1,9,49,That's sad. Regardless of what they find out ...,That's sad. Regardless of what they find out h...,train,1,5,1,31,82000,[Sadness]
6,78,11,1,10,17,"After reading the article, my reaction is that...","After reading the article, my reaction is that...",train,1,6,1,29,85000,[Sadness]
7,78,13,1,12,24,It sounds like these boys had a really rough l...,It sounds like these boys had a really rough l...,train,2,7,1,38,42000,[Sadness]
8,78,14,1,13,43,This is a tragic and sad story about how some ...,This is a tragic and sad story about how some ...,train,2,6,1,33,110000,[Sadness]
9,336,17,1,16,31,Hello. I feel really terrible about the curren...,Hello. I feel really terrible about the curren...,train,unknown,unknown,unknown,unknown,unknown,"[Disgust, Sadness]"


__Based on the above stats, there are no duplicate essays, no NaN values, but there is missing data in the form of a string instead of a numerical value - the presence of `unknown` in some numerical columns changes their type from `integer` / `float` to `object`__

In [19]:
# count every emotion label; explode() returns a new dataframe,
# so df_train['emotion'] itself keeps its lists (it is not exploded)
df_train.explode('emotion')['emotion'].value_counts()

Sadness     383
Neutral     240
Anger       124
Disgust     100
Fear         33
Hope         32
Surprise     19
Joy          10
Name: emotion, dtype: int64

In [20]:
# multi-hot target vector for every essay, built by get_target from the list of labels
df_train['target_encoded'] = df_train['emotion'].apply(get_target)
df_train.head(25)

Unnamed: 0,article_id,conversation_id,speaker_number,essay_id,speaker_id,essay,essay_clean,split,gender,education,race,age,income,emotion,target_encoded
0,35,2,1,1,30,It breaks my heart to see people living in tho...,It breaks my heart to see people living in tho...,train,1,6,3,37,40000,"[Hope, Sadness]","[0, 0, 0, 1, 0, 0, 1, 0]"
1,35,3,1,2,19,I wonder why there aren't more people trying t...,I wonder why there aren't more people trying t...,train,1,6,2,32,35000,[Anger],"[1, 0, 0, 0, 0, 0, 0, 0]"
2,35,5,1,4,17,"After reading the article, you can't help but ...","After reading the article, you can't help but ...",train,1,6,1,29,85000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
3,213,6,1,5,16,It is so sad that someone who had such an amaz...,It is so sad that someone who had such an amaz...,train,2,5,1,28,50000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
4,213,8,1,7,30,"From reading the article, it looks like the wo...","From reading the article, it looks like the wo...",train,1,6,3,37,40000,[Neutral],"[0, 0, 0, 0, 0, 1, 0, 0]"
5,213,10,1,9,49,That's sad. Regardless of what they find out ...,That's sad. Regardless of what they find out h...,train,1,5,1,31,82000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
6,78,11,1,10,17,"After reading the article, my reaction is that...","After reading the article, my reaction is that...",train,1,6,1,29,85000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
7,78,13,1,12,24,It sounds like these boys had a really rough l...,It sounds like these boys had a really rough l...,train,2,7,1,38,42000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
8,78,14,1,13,43,This is a tragic and sad story about how some ...,This is a tragic and sad story about how some ...,train,2,6,1,33,110000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
9,336,17,1,16,31,Hello. I feel really terrible about the curren...,Hello. I feel really terrible about the curren...,train,unknown,unknown,unknown,unknown,unknown,"[Disgust, Sadness]","[0, 1, 0, 0, 0, 0, 1, 0]"


### b) Dev

In [21]:
# only columns of interest
# same list as cols_train without 'emotion' - the dev labels are added later from the gold file
cols_dev = [ 'article_id', 'conversation_id', 'speaker_number', 'essay_id', 'speaker_id',
             'essay', 'essay_clean', 'split', 'gender', 'education', 'race', 'age', 'income', ]
len(cols_dev)

13

In [22]:
# load the essay-level dev set (tab-separated) and add the cleaned essay text
df_dev = pd.read_csv( os.path.join(wdir, files[1]), sep='\t' )
df_dev['essay_clean'] = df_dev['essay'].apply(clean_text)
df_dev = df_dev[cols_dev]

# quick sanity report: size, column types, missing values
print(df_dev.shape, '\n')
print(df_dev.dtypes, '\n')
print(df_dev.isna().sum(), '\n')

print('\nTotal essays: ', df_dev.shape[0])
print('Unique essays:', len(df_dev['essay_clean'].unique()))

df_dev.head(25)

(208, 13) 

article_id          int64
conversation_id     int64
speaker_number      int64
essay_id            int64
speaker_id          int64
essay              object
essay_clean        object
split              object
gender              int64
education           int64
race                int64
age                 int64
income              int64
dtype: object 

article_id         0
conversation_id    0
speaker_number     0
essay_id           0
speaker_id         0
essay              0
essay_clean        0
split              0
gender             0
education          0
race               0
age                0
income             0
dtype: int64 n

Total essays:  208
Unique essays: 208


Unnamed: 0,article_id,conversation_id,speaker_number,essay_id,speaker_id,essay,essay_clean,split,gender,education,race,age,income
0,35,1,1,0,68,How sad is it that this kind of pain and suffe...,How sad is it that this kind of pain and suffe...,dev,2,2,1,21,20000
1,35,4,1,3,79,The article is kind of tragic and hits close t...,The article is kind of tragic and hits close t...,dev,1,6,3,33,64000
2,213,7,1,6,68,"I think that these kinds of stories, are sad, ...","I think that these kinds of stories, are sad, ...",dev,2,2,1,21,20000
3,213,9,1,8,84,It's crazy that random accidents like this hap...,It's crazy that random accidents like this hap...,dev,2,4,1,25,55000
4,78,12,1,11,68,This story makes me so so sad.... As someone w...,This story makes me so so sad.... As someone w...,dev,2,2,1,21,20000
5,78,15,1,14,70,"After reading the article, my first reaction a...","After reading the article, my first reaction a...",dev,1,6,1,29,85000
6,336,16,1,15,81,I didn't know coal mining had such adverse eff...,I didn't know coal mining had such adverse eff...,dev,1,4,1,30,27000
7,336,20,1,19,73,This is very sad. I can't imagine having elep...,This is very sad. I can't imagine having eleph...,dev,2,7,1,38,42000
8,281,23,1,22,63,"Guys, reading this article really hits home fo...","Guys, reading this article really hits home fo...",dev,1,4,1,25,29000
9,171,26,1,25,63,Hey guys. So I just read this article about Ir...,Hey guys. So I just read this article about Ir...,dev,1,4,1,25,29000


__No duplicate essays or missing / NaN values__

### c) Gold

In [23]:
# column names for the gold file; because names= is passed (and no header=),
# read_csv treats the first line of the file as data, not as a header
# NOTE(review): the spellings 'openess' / 'empathatic' are kept as they are - confirm nothing expects other names
colnames=[ 'empathy', 'distress', 'emotion', 'personality_conscientiousness', 'personality_openess',
           'personality_extraversion', 'personality_agreeableness',
           'personality_stability', 'iri_perspective_taking',
           'iri_personal_distress', 'iri_fantasy', 'iri_empathatic_concern', ] 
print(len(colnames))
df_gold = pd.read_csv( os.path.join(wdir, files[6]), names=colnames, sep='\t')
# print both shapes side by side: the row counts have to match for the merge by position
print(df_dev.shape, '\n')
print(df_gold.shape, '\n')
print(df_gold.isna().sum(), '\n')
print(df_gold.dtypes, '\n')
df_gold.head(25)

12
(208, 13) 

(208, 12) 

empathy                          0
distress                         0
emotion                          0
personality_conscientiousness    0
personality_openess              0
personality_extraversion         0
personality_agreeableness        0
personality_stability            0
iri_perspective_taking           0
iri_personal_distress            0
iri_fantasy                      0
iri_empathatic_concern           0
dtype: int64 

empathy                          float64
distress                         float64
emotion                           object
personality_conscientiousness    float64
personality_openess              float64
personality_extraversion         float64
personality_agreeableness        float64
personality_stability            float64
iri_perspective_taking           float64
iri_personal_distress            float64
iri_fantasy                      float64
iri_empathatic_concern           float64
dtype: object 



Unnamed: 0,empathy,distress,emotion,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern
0,3.833333,3.375,Sadness,5.0,3.0,5.0,4.0,3.5,2.714,3.0,3.143,3.286
1,3.0,1.0,Sadness,6.5,7.0,3.5,4.5,7.0,3.714,1.0,2.429,1.429
2,3.833333,4.25,Sadness,5.0,3.0,5.0,4.0,3.5,2.714,3.0,3.143,3.286
3,3.166667,2.375,Neutral,5.5,5.5,3.5,4.5,4.0,3.571,2.857,3.571,3.143
4,3.333333,3.5,Sadness,5.0,3.0,5.0,4.0,3.5,2.714,3.0,3.143,3.286
5,1.5,1.5,Sadness,6.75,6.75,6.75,6.75,7.0,4.643,2.0715,4.143,4.643
6,6.0,6.0,Neutral,3.0,4.0,6.0,6.0,6.5,3.429,2.857,4.571,4.0
7,2.5,1.0,Sadness,7.0,3.5,6.5,5.5,6.5,3.429,2.714,2.571,3.857
8,4.0,5.5,Sadness,6.0,6.0,5.5,6.5,3.0,4.857,3.143,2.571,4.857
9,3.666667,2.25,Neutral,6.0,6.0,5.5,6.5,3.0,4.857,3.143,2.571,4.857


### d) Merge dev and gold and check for data leakage

In [24]:
# attach the gold labels to the dev essays
# .values drops the index, so rows are matched purely by position -
# assumes both files list the essays in the same order (TODO confirm)
df_dev['emotion'] = df_gold['emotion'].values
# '/'-separated label string -> list of category names
df_dev['emotion'] = df_dev['emotion'].apply( lambda x: x.split('/'))

# label counts; explode() returns a new dataframe, df_dev['emotion'] keeps its lists
df_dev.explode('emotion')['emotion'].value_counts()

Sadness     101
Neutral      54
Anger        38
Disgust      24
Hope         16
Fear          8
Surprise      3
Joy           2
Name: emotion, dtype: int64

In [25]:
# any overlap with training set (data leakage)?
# set intersection: one hash lookup per essay instead of scanning the whole dev array
# for every training essay
overlap = list( set(df_train['essay_clean']) & set(df_dev['essay_clean']) )
df_train[ df_train['essay_clean'].isin(overlap) ]

Unnamed: 0,article_id,conversation_id,speaker_number,essay_id,speaker_id,essay,essay_clean,split,gender,education,race,age,income,emotion,target_encoded
746,7,443,2,942,30,Living in a war tore country must be horrible....,Living in a war tore country must be horrible....,train,1,6,3,37,40000,"[Hope, Sadness]","[0, 0, 0, 1, 0, 0, 1, 0]"


In [26]:
# the dev-side rows of the essays found in both sets
df_dev[ df_dev['essay_clean'].isin(overlap) ]

Unnamed: 0,article_id,conversation_id,speaker_number,essay_id,speaker_id,essay,essay_clean,split,gender,education,race,age,income,emotion
197,11,449,2,948,75,Living in a war tore country must be horrible....,Living in a war tore country must be horrible....,dev,1,6,3,37,40000,[Sadness]


* Exactly the same essay text
* Different article and essay id
* Different set of labels (one coincides)
* This can definitely cause [small] confusion
* Best solution: keep the overlapping essay in both sets - the training copy carries the `Hope` label (an underrepresented category), and the dev set has to stay complete because it is used to submit the dev results and compare with other participants

### e) Prepare target in dev set for classification report

In [27]:
# multi-hot target vector for every dev essay, built by get_target from the list of gold labels
df_dev['target_encoded'] = df_dev['emotion'].apply(get_target)
df_dev.head(25)

Unnamed: 0,article_id,conversation_id,speaker_number,essay_id,speaker_id,essay,essay_clean,split,gender,education,race,age,income,emotion,target_encoded
0,35,1,1,0,68,How sad is it that this kind of pain and suffe...,How sad is it that this kind of pain and suffe...,dev,2,2,1,21,20000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
1,35,4,1,3,79,The article is kind of tragic and hits close t...,The article is kind of tragic and hits close t...,dev,1,6,3,33,64000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
2,213,7,1,6,68,"I think that these kinds of stories, are sad, ...","I think that these kinds of stories, are sad, ...",dev,2,2,1,21,20000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
3,213,9,1,8,84,It's crazy that random accidents like this hap...,It's crazy that random accidents like this hap...,dev,2,4,1,25,55000,[Neutral],"[0, 0, 0, 0, 0, 1, 0, 0]"
4,78,12,1,11,68,This story makes me so so sad.... As someone w...,This story makes me so so sad.... As someone w...,dev,2,2,1,21,20000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
5,78,15,1,14,70,"After reading the article, my first reaction a...","After reading the article, my first reaction a...",dev,1,6,1,29,85000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
6,336,16,1,15,81,I didn't know coal mining had such adverse eff...,I didn't know coal mining had such adverse eff...,dev,1,4,1,30,27000,[Neutral],"[0, 0, 0, 0, 0, 1, 0, 0]"
7,336,20,1,19,73,This is very sad. I can't imagine having elep...,This is very sad. I can't imagine having eleph...,dev,2,7,1,38,42000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
8,281,23,1,22,63,"Guys, reading this article really hits home fo...","Guys, reading this article really hits home fo...",dev,1,4,1,25,29000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]"
9,171,26,1,25,63,Hey guys. So I just read this article about Ir...,Hey guys. So I just read this article about Ir...,dev,1,4,1,25,29000,[Neutral],"[0, 0, 0, 0, 0, 1, 0, 0]"


## Add articles, do spellcheck

In [28]:
# open the articles file (comma-separated, read with the read_csv defaults)
df_art = pd.read_csv( os.path.join(wdir, files[4]), )
print(df_art.shape, '\n')
# get_article relies on one row per article_id - compare this count with the number of rows above
print('Unique article IDs:', len(df_art['article_id'].unique()), '\n')
print(df_art.isna().sum(), '\n')
print(df_art.dtypes, '\n')
df_art.head(25)

(418, 2) 

Unique article IDs: 418 

article_id    0
text          0
dtype: int64 

article_id     int64
text          object
dtype: object 



Unnamed: 0,article_id,text
0,1,'Abhorrent' bottle attack on young Rangers fan...
1,2,'Afghan Girl' in iconic National Geographic ph...
2,3,'My whole family has been wiped out': Victims ...
3,4,'RHONY' STAR JULES WAINSTEIN Estranged Husband...
4,5,'Swam for their life': More survivors of Levia...
5,6,"'They Were Just Like Us, and They Lost Everyth..."
6,7,"'This isn't Pompeii, this is Aleppo' — As 200..."
7,8,'This was a sad but avoidable tragedy'; kennel...
8,9,"'Utter devastation' after major quake, aftersh..."
9,10,"1 Police Officer Dead, 1 Wounded After Western..."


In [29]:
def get_article(article_id):
    '''
        Return the text of the first row of the global df_art whose article_id equals article_id.
        Raises IndexError when df_art has no such article.
    '''
    matching_texts = df_art.loc[ df_art['article_id'] == article_id, 'text' ]
    return matching_texts.values[0]

# attach the article every essay reacts to, plus a cleaned copy of its text
# (get_article filters df_art once per row; an article_id missing from df_art raises IndexError)
df_train['article']       = df_train['article_id'].apply( get_article )
df_train['article_clean'] = df_train['article'].apply( clean_text )

df_dev['article']         = df_dev['article_id'].apply( get_article )
df_dev['article_clean']   = df_dev['article'].apply( clean_text )

In [30]:
def autocorrect(docs):
    '''
        Generator: for every spaCy doc in docs yield its text with each alphabetic
        out-of-vocabulary token replaced by the first symspellpy suggestion;
        all other tokens and the original whitespace are kept as they are.
        A replacement is passed through str.capitalize() (first letter upper, rest lower)
        unless the original token starts with a lowercase letter.
        Relies on the global sym_spell object.
    '''
    for doc in docs:
        pieces = []
        for tok in doc:
            if not (tok.is_alpha and tok.is_oov):
                pieces.append( tok.text_with_ws )
                continue

            candidates = sym_spell.lookup( tok.text, Verbosity.TOP, max_edit_distance=2,
                                           include_unknown=True,
                                           transfer_casing=True,)
            fixed = candidates[0].term
            # NOTE(review): capitalize() lowercases everything after the first letter,
            # so an all-caps token comes back in Title case - confirm this is intended
            if not tok.text[0].islower():
                fixed = fixed.capitalize()
            pieces.append( fixed + tok.whitespace_ )

        yield ''.join(pieces).strip()
        

# load spaCy with every listed component excluded - what is left is printed below
nlp = spacy.load("en_core_web_lg", exclude=['parser', 'ner', 'tagger', 'attribute_ruler', 'lemmatizer'])
print(nlp.pipe_names)

# load symspellpy
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename( "symspellpy", "frequency_dictionary_en_82_765.txt" )
# term_index - column of term; count_index - column of term frequency
# load_dictionary() reports a missing file by returning False instead of raising -
# stop here rather than spellcheck against an empty dictionary
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f'symspellpy dictionary not found: {dictionary_path}')
print('Total unique English words:', len(sym_spell.words.keys()))    # this is already a dict: keys=words, values=freqs

# sample usage of symspellpy
input_term = "memebers" 
# max_edit_distance_lookup <= max_dictionary_edit_distance
suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST, max_edit_distance=2)
print(input_term)
for suggestion in suggestions:
    print(suggestion)

['tok2vec']
Total unique English words: 82834
memebers
members, 1, 226656153


In [31]:
# make sure no leading / trailing whitespace is left before tokenising
# (clean_text already ends with strip(), so for string values this is only a safeguard)
df_train['essay_clean']   = df_train['essay_clean'].str.strip()
df_train['article_clean'] = df_train['article_clean'].str.strip()
df_dev['essay_clean']     = df_dev['essay_clean'].str.strip()
df_dev['article_clean']   = df_dev['article_clean'].str.strip()

In [32]:
def get_var_name(obj):
    '''
        Return the first module-level (global) name that is bound to obj.
        Objects are compared by identity, names are scanned in the order of globals().
        Raises IndexError when no global refers to obj.
    '''
    namespace = globals()
    matches = []
    for candidate in namespace:
        if namespace[candidate] is obj:
            matches.append(candidate)
    return matches[0]

# text columns to spellcheck, in both dataframes
cols = [ 'essay_clean', 'article_clean' ]
dfs  = [df_train, df_dev]

for df in dfs:
    for col in cols:
        print(f'Processing dataframe {get_var_name(df)}, column {col}')
        # run the texts through spaCy and keep the resulting Doc objects in the dataframe
        df[col + '_docs'] = list(nlp.pipe( df[col].tolist() ))
        # autocorrect() is a generator - list() runs it over all docs of the column
        df[col + '_spellchecked'] = list(autocorrect( df[col + '_docs'].tolist() ))

Processing dataframe df_train, column essay_clean
Processing dataframe df_train, column article_clean
Processing dataframe df_dev, column essay_clean
Processing dataframe df_dev, column article_clean


In [33]:
# True where the spellchecker left the text unchanged (compare1 - essays, compare2 - articles)
df_train['compare1'] = df_train['essay_clean'] == df_train['essay_clean_spellchecked']
print('df_train, essays:\n', df_train['compare1'].value_counts(), sep='')
df_train['compare2'] = df_train['article_clean'] == df_train['article_clean_spellchecked']
print('\ndf_train, articles:\n', df_train['compare2'].value_counts(), sep='')

df_train, essays:
True     690
False    102
Name: compare1, dtype: int64

df_train, articles:
False    436
True     356
Name: compare2, dtype: int64


In [34]:
# True where the spellchecker left the text unchanged (compare1 - essays, compare2 - articles)
df_dev['compare1'] = df_dev['essay_clean'] == df_dev['essay_clean_spellchecked']
print('df_dev, essays:\n', df_dev['compare1'].value_counts(), sep='')
df_dev['compare2'] = df_dev['article_clean'] == df_dev['article_clean_spellchecked']
print('\ndf_dev, articles:\n', df_dev['compare2'].value_counts(), sep='')

df_dev, essays:
True     171
False     37
Name: compare1, dtype: int64

df_dev, articles:
False    124
True      84
Name: compare2, dtype: int64


In [35]:
import difflib

# dev rows whose article text differs from its spellchecked version
col = 'compare2'
df_temp = df_dev[ df_dev[col].eq(False) ]
print(df_temp.shape)
for original, checked, is_same in df_temp[[ 'article_clean', 'article_clean_spellchecked', col ]].values:
    print(f'ORIGINAL:\n{original}\n')
    print(f'SPELLCHECKED:\n{checked}\n')
    print(f'IS SAME:\n{is_same}\n')

    print('ORIGINAL => SPELLCHECKED')
    print(f'ORIGINAL == SPELLCHECKED: {original.strip()==checked.strip()}')
    # character-level diff; the printed position is the index of the entry in the ndiff output
    for pos, delta in enumerate(difflib.ndiff(original, checked)):
        if delta[0] == '-':
            print(u'Delete "{}" from position {}'.format(delta[-1], pos))
        elif delta[0] == '+':
            print(u'Add "{}" to position {}'.format(delta[-1], pos))
    print()

    print('\n', '='*77, '\n', sep='')

(124, 23)
ORIGINAL:
A month after Hurricane Matthew, 800,000 Haitians urgently need food — FONDTOUTANU, Haiti — There is no food, so along the road through the mountains there are children begging for something to eat. Most of the trucks rumble past with donations for somewhere else. But one stopped here the other day with sacks of rice, beans and dried herring, setting off a stampede. Valleur Noel, a trim, short man with a checkered shirt and a shiny crucifix, climbed to the top of the tailgate and told everyone to calm down. It was futile. His organization, Pwoje Men Kontre, had 412 bags of food, a gift from the German ambassador and U.S. donors. Within minutes there were people pouring through a notch between the mountains, hollering and stumbling down the rocky hillside toward the truck. "No pushing, no pushing!" Noel yelled. "There is enough for everyone!" It wasn't true. The latecomers got nothing. But many others did, and Figaro Phito, 29, hugged his sack with both arms, like a 

Delete "z" from position 594
Add "t" to position 595
Delete "z" from position 597
Add "n" to position 598
Delete "z" from position 1407
Add "t" to position 1408
Delete "z" from position 1410
Add "n" to position 1411
Delete "w" from position 2251
Add "n" to position 2252
Delete "s" from position 2377
Delete "w" from position 2479
Add "n" to position 2480
Delete "w" from position 3088
Add "n" to position 3089
Delete "w" from position 3358
Add "n" to position 3359
Delete "u" from position 4652
Add "a" to position 4653
Delete "w" from position 4949
Add "n" to position 4950
Delete "w" from position 5192
Add "n" to position 5193
Delete "z" from position 8615
Add "t" to position 8616
Delete "z" from position 8618
Add "n" to position 8619
Delete "r" from position 8977
Add "l" to position 8978
Delete "u" from position 9741
Add "a" to position 9742



ORIGINAL:
At least 239 migrants believed drowned in Mediterranean, U.N. says — BRUSSELS — At least 239 migrants are believed to have drowned this 

Delete "h" from position 2720
Delete "l" from position 2726
Delete "h" from position 3332
Delete "l" from position 3338
Delete "t" from position 3789
Delete "z" from position 3790
Add "n" to position 3791
Add "a" to position 3792
Delete "L" from position 8142
Add "V" to position 8143
Delete "n" from position 8149
Delete "h" from position 8500
Delete "l" from position 8506
Delete "L" from position 8961
Add "V" to position 8962
Delete "n" from position 8968



ORIGINAL:
Burma's Aung San Suu Kyi under fire as alleged military abuse follows militant attack — SITTWE, Burma — A security crackdown following militant attacks has exacerbated the humanitarian situation in a predominantly Muslim region of Burma and focused international attention on the new government of Aung San Suu Kyi. Burmese troops launched a wide-ranging manhunt last month in a troubled area of northern Rakhine state populated largely by Rohingya Muslims, leaving scorched homes and displaced residents in their wake. The man

Delete "h" from position 5068
Delete "t" from position 5072
Delete "e" from position 8900
Add "i" to position 8901
Delete "o" from position 8903
Add "y" to position 8904



ORIGINAL:

SPELLCHECKED:

IS SAME:
False

ORIGINAL => SPELLCHECKED
ORIGINAL == SPELLCHECKED: False
Delete "h" from position 5068
Delete "t" from position 5072
Delete "e" from position 8900
Add "i" to position 8901
Delete "o" from position 8903
Add "y" to position 8904



ORIGINAL:

SPELLCHECKED:

IS SAME:
False

ORIGINAL => SPELLCHECKED
ORIGINAL == SPELLCHECKED: False
Delete "h" from position 5068
Delete "t" from position 5072
Delete "e" from position 8900
Add "i" to position 8901
Delete "o" from position 8903
Add "y" to position 8904



ORIGINAL:
TRAGEDY: Tampa man lost wife, 2 children, in horrific crash that killed 5 — TAMPA, Fla. (WFLA) — The heartache of losing a loved one in a sudden, unexpected moment is devastating. The shock is jarring and confusing. Families and friends are left with broken hearts and achi

Delete "C" from position 1071
Delete "I" from position 1072
Delete "E" from position 1073
Delete "H" from position 1074
Add "W" to position 1075
Add "i" to position 1076
Add "t" to position 1077
Add "h" to position 1078
Delete "C" from position 1435
Delete "I" from position 1436
Delete "E" from position 1437
Delete "H" from position 1438
Add "W" to position 1439
Add "i" to position 1440
Add "t" to position 1441
Add "h" to position 1442
Delete "g" from position 12545
Delete "a" from position 12549
Delete "g" from position 12843
Delete "a" from position 12847
Delete "i" from position 14364
Delete "n" from position 14365
Delete "I" from position 15679
Delete "T" from position 15680
Delete "S" from position 15681
Delete "A" from position 15682
Delete "N" from position 15683
Delete "T" from position 15684
Delete "O" from position 15685
Delete "N" from position 15686
Delete "I" from position 15687
Delete "S" from position 15688
Add "i" to position 15689
Add "t" to position 15690
Add "s" to p

In [36]:
# save the prepared training set
# NOTE: the whole dataframe is written, including the '*_docs' columns with spaCy Doc objects
#       and the compare1 / compare2 helper columns
file = 'data/df_train.pkl'
df_train.to_pickle(file)

In [38]:
# save the prepared dev set
# NOTE: the whole dataframe is written, including the '*_docs' columns with spaCy Doc objects
#       and the compare1 / compare2 helper columns
file = 'data/df_dev.pkl'
df_dev.to_pickle(file)