# Exploratory Data Analysis

In [2]:
import csv
import pandas as pd

In [3]:
path_train = 'data/sts-train.csv'
path_dev = 'data/sts-dev.csv'
path_test = 'data/sts-test.csv'

In [4]:
columns=['genre', 'file', 'year', 'index', 'score', 'sentence1', 'sentence2']

In [5]:
df_train = pd.read_csv(path_train, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')
df_dev = pd.read_csv(path_dev, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')
df_test = pd.read_csv(path_test, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')

In [6]:
df = pd.concat([df_train, df_dev, df_test])
df

Unnamed: 0,genre,file,year,index,score,sentence1,sentence2
0,main-captions,MSRvid,2012test,1,5.00,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.80,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.80,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.60,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.
...,...,...,...,...,...,...,...
1374,main-news,headlines,2016,1354,0.00,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks
1375,main-news,headlines,2016,1360,1.00,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe..."
1376,main-news,headlines,2016,1368,1.00,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...
1377,main-news,headlines,2016,1420,0.00,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...


## Data cleaning

Check for the presence of null values:

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8628 entries, 0 to 1378
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   genre      8628 non-null   object 
 1   file       8628 non-null   object 
 2   year       8628 non-null   object 
 3   index      8628 non-null   int64  
 4   score      8628 non-null   float64
 5   sentence1  8628 non-null   object 
 6   sentence2  8628 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 539.2+ KB


There are no null values.

In [8]:
df['genre'].value_counts()

genre
main-news        4299
main-captions    3250
main-forums       629
main-forum        450
Name: count, dtype: int64

In [9]:
df['genre'] = df['genre'].replace('main-', '', regex=True)
df['genre'] = df['genre'].replace('forum', 'forums')

In [10]:
df['genre'].value_counts()

genre
news        4299
captions    3250
forums      1079
Name: count, dtype: int64

In [11]:
df['year'].value_counts()

year
2014         2250
2015         1875
2012test     1500
2012train    1500
2013          750
2016          503
2017          250
Name: count, dtype: int64

In [12]:
# Remove everything in this column which is not a number
df['year'] = df['year'].replace(r'\D', '', regex=True)

In [13]:
df['year'].value_counts()

year
2012    3000
2014    2250
2015    1875
2013     750
2016     503
2017     250
Name: count, dtype: int64

## Dataset Analysis

In [48]:
df

Unnamed: 0,genre,file,year,index,score,sentence1,sentence2
0,captions,MSRvid,2012,1,5.00,A plane is taking off.,An air plane is taking off.
1,captions,MSRvid,2012,4,3.80,A man is playing a large flute.,A man is playing a flute.
2,captions,MSRvid,2012,5,3.80,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,captions,MSRvid,2012,6,2.60,Three men are playing chess.,Two men are playing chess.
4,captions,MSRvid,2012,9,4.25,A man is playing the cello.,A man seated is playing the cello.
...,...,...,...,...,...,...,...
1374,news,headlines,2016,1354,0.00,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks
1375,news,headlines,2016,1360,1.00,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe..."
1376,news,headlines,2016,1368,1.00,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...
1377,news,headlines,2016,1420,0.00,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...


In [49]:
print(f'Total number of sentences: {df.shape[0]*2}')

Total number of sentences: 17256


Lengths of sentences (n° characters):

In [51]:
s1_len = df['sentence1'].apply(lambda x : len(str(x)))
s2_len = df['sentence2'].apply(lambda x : len(str(x)))

print(f'Lengths of sentences 1: \n{s1_len}')
print(f'Lengths of sentences 2: \n{s2_len}')

Lengths of sentences 1: 
0       22
1       31
2       45
3       28
4       27
        ..
1374    53
1375    50
1376    42
1377    58
1378    41
Name: sentence1, Length: 8628, dtype: int64
Lengths of sentences 2: 
0       27
1       25
2       56
3       26
4       34
        ..
1374    39
1375    58
1376    59
1377    58
1378    56
Name: sentence2, Length: 8628, dtype: int64


Average length of sentences:

In [52]:
avg_s1_len = s1_len.mean()
avg_s2_len = s2_len.mean()

print(f'Avg length of sentence 1: {avg_s1_len}')
print(f'Avg length of sentence 2: {avg_s2_len}')

Avg length of sentence 1: 58.23528048215113
Avg length of sentence 2: 57.9598980064905


Processing sentences to extract vocabulary:

In [24]:
df_processed = pd.DataFrame(columns=['sentence1', 'sentence2', 'vocab1', 'vocab2'])

# lowercase
df_processed['sentence1'] = df['sentence1'].apply(lambda x : x.lower())
df_processed['sentence2'] = df['sentence2'].apply(lambda x : x.lower())

df_processed['vocab1'] = df_processed['sentence1'].apply(lambda x: sorted(str(x).split()))
df_processed['vocab2'] = df_processed['sentence2'].apply(lambda x: sorted(str(x).split()))

In [25]:
df_processed

Unnamed: 0,sentence1,sentence2,vocab1,vocab2
0,a plane is taking off.,an air plane is taking off.,"[a, is, off., plane, taking]","[air, an, is, off., plane, taking]"
1,a man is playing a large flute.,a man is playing a flute.,"[a, a, flute., is, large, man, playing]","[a, a, flute., is, man, playing]"
2,a man is spreading shreded cheese on a pizza.,a man is spreading shredded cheese on an uncoo...,"[a, a, cheese, is, man, on, pizza., shreded, s...","[a, an, cheese, is, man, on, pizza., shredded,..."
3,three men are playing chess.,two men are playing chess.,"[are, chess., men, playing, three]","[are, chess., men, playing, two]"
4,a man is playing the cello.,a man seated is playing the cello.,"[a, cello., is, man, playing, the]","[a, cello., is, man, playing, seated, the]"
...,...,...,...,...
1374,"philippines, canada pledge to further boost re...",philippines saves 100 after ferry sinks,"[boost, canada, further, philippines,, pledge,...","[100, after, ferry, philippines, saves, sinks]"
1375,israel bars palestinians from jerusalem's old ...,"two-state solution between palestinians, israe...","[bars, city, from, israel, jerusalem's, old, p...","[between, in, israel, palestinians,, pie, sky,..."
1376,how much do you know about secret service?,lawmakers from both sides express outrage at s...,"[about, do, how, know, much, secret, service?,...","[at, both, express, from, lawmakers, outrage, ..."
1377,obama struggles to soothe saudi fears as iran ...,myanmar struggles to finalize voter lists for ...,"[as, fears, iran, obama, resume, saudi, soothe...","[finalize, for, lists, myanmar, polls, struggl..."


Average n° of words per sentence:

In [26]:
avg_vocab1_size = df_processed['vocab1'].apply(lambda x: len(x)).mean()
avg_vocab2_size = df_processed['vocab2'].apply(lambda x: len(x)).mean()

print(f'Avg length of vocab 1: {avg_vocab1_size}')
print(f'Avg length of vocab 2: {avg_vocab2_size}')

Avg length of vocab 1: 10.183704218822438
Avg length of vocab 2: 10.159828465461288


Average n° of words per sentence without considering duplicates:

In [53]:
avg_vocab1 = df_processed['vocab1'].apply(lambda x: len(set(x))).mean()
avg_vocab2 = df_processed['vocab2'].apply(lambda x: len(set(x))).mean()

print(f'Avg length of vocab 1 (without duplicates): {avg_vocab1}')
print(f'Avg length of vocab 2 (without duplicates): {avg_vocab2}')

Avg length of vocab 1 (without duplicates): 9.59121464997682
Avg length of vocab 2 (without duplicates): 9.568382012053778


Vocabulary analysis, removing punctuation:

In [54]:
import string
import re

regex = '[' + string.punctuation + ']'
print(regex)

[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]


In [55]:
df_processed['vocab1'] = df_processed['sentence1'].apply(lambda x: sorted(str(re.sub(regex, '', x)).split()))
df_processed['vocab2'] = df_processed['sentence2'].apply(lambda x: sorted(str(re.sub(regex, '', x)).split()))

In [56]:
df_processed['vocab1'].apply(lambda x: len(x)).mean() # avg n° of words per sentence1

10.163885025498377

In [57]:
df_processed['vocab2'].apply(lambda x: len(x)).mean() # avg n° of words per sentence2

10.142095503013445

Analysis of the Vocabulary of all sentences 1 and 2:

In [37]:
# entire vocabulary of sentences 1
global_vocab1 = df_processed['vocab1'].sum()
print(global_vocab1[:20])
print(f'Vocab 1 size: {len(global_vocab1)}')

['a', 'is', 'off', 'plane', 'taking', 'a', 'a', 'flute', 'is', 'large', 'man', 'playing', 'a', 'a', 'cheese', 'is', 'man', 'on', 'pizza', 'shreded']
Vocab 1 size: 87694


In [58]:
# entire vocabulary of sentences 2
global_vocab2 = df_processed['vocab2'].sum()
print(global_vocab2[:20])
print(f'Vocab 2 size: {len(global_vocab2)}')

['air', 'an', 'is', 'off', 'plane', 'taking', 'a', 'a', 'flute', 'is', 'man', 'playing', 'a', 'an', 'cheese', 'is', 'man', 'on', 'pizza', 'shredded']
Vocab 2 size: 87506


In [59]:
import nltk
counts1 = nltk.Counter(global_vocab1)
print(counts1)

counts2 = nltk.Counter(global_vocab2)
print(counts2)



In [60]:
counts1.most_common(30)

[('a', 5283),
 ('the', 3949),
 ('in', 2446),
 ('is', 2037),
 ('to', 1778),
 ('of', 1625),
 ('and', 1289),
 ('on', 1233),
 ('man', 852),
 ('for', 655),
 ('with', 650),
 ('at', 579),
 ('woman', 525),
 ('are', 516),
 ('two', 415),
 ('that', 405),
 ('you', 354),
 ('playing', 345),
 ('it', 345),
 ('an', 338),
 ('said', 338),
 ('dog', 335),
 ('was', 314),
 ('as', 313),
 ('from', 311),
 ('us', 282),
 ('by', 279),
 ('white', 263),
 ('not', 258),
 ('i', 253)]

In [41]:
counts2.most_common(30)

[('a', 5533),
 ('the', 3848),
 ('in', 2567),
 ('is', 2076),
 ('to', 1787),
 ('of', 1570),
 ('and', 1292),
 ('on', 1242),
 ('man', 876),
 ('with', 667),
 ('for', 643),
 ('at', 592),
 ('woman', 538),
 ('are', 519),
 ('two', 405),
 ('that', 381),
 ('it', 365),
 ('dog', 360),
 ('said', 355),
 ('you', 329),
 ('playing', 315),
 ('was', 312),
 ('as', 306),
 ('an', 295),
 ('from', 294),
 ('by', 281),
 ('us', 281),
 ('white', 276),
 ('i', 272),
 ('black', 238)]

Removing stopwords:

In [43]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# vocab 1 without stopwords
vocab1_nostop = [w for w in global_vocab1 if w not in stopwords.words('english')]
counts1_nostop = nltk.Counter(vocab1_nostop)

In [45]:
# vocab 2 without stopwords
vocab2_nostop = [w for w in global_vocab2 if w not in stopwords.words('english')]
counts2_nostop = nltk.Counter(vocab2_nostop)

In [46]:
counts1_nostop.most_common(30)

[('man', 852),
 ('woman', 525),
 ('two', 415),
 ('playing', 345),
 ('said', 338),
 ('dog', 335),
 ('us', 282),
 ('white', 263),
 ('black', 240),
 ('people', 216),
 ('percent', 197),
 ('killed', 181),
 ('new', 180),
 ('water', 147),
 ('girl', 143),
 ('syria', 143),
 ('running', 139),
 ('person', 138),
 ('one', 135),
 ('boy', 134),
 ('three', 124),
 ('sitting', 124),
 ('brown', 122),
 ('standing', 118),
 ('china', 118),
 ('riding', 117),
 ('police', 117),
 ('guitar', 116),
 ('red', 116),
 ('cat', 109)]

In [47]:
counts2_nostop.most_common(30)

[('man', 876),
 ('woman', 538),
 ('two', 405),
 ('dog', 360),
 ('said', 355),
 ('playing', 315),
 ('us', 281),
 ('white', 276),
 ('black', 238),
 ('people', 197),
 ('percent', 192),
 ('killed', 180),
 ('new', 169),
 ('water', 141),
 ('syria', 136),
 ('girl', 135),
 ('running', 134),
 ('boy', 133),
 ('riding', 129),
 ('standing', 126),
 ('sitting', 123),
 ('china', 123),
 ('person', 119),
 ('guitar', 116),
 ('red', 116),
 ('police', 115),
 ('cat', 112),
 ('dogs', 111),
 ('horse', 109),
 ('one', 109)]

Vocabulary of entire dataset:

In [69]:
global_vocab = global_vocab1 + global_vocab2
print(f'Vocabulary size: {len(global_vocab)}')

Vocabulary size: 175200


In [70]:
counts = nltk.Counter(global_vocab)

In [71]:
counts.most_common(30)

[('a', 10816),
 ('the', 7797),
 ('in', 5013),
 ('is', 4113),
 ('to', 3565),
 ('of', 3195),
 ('and', 2581),
 ('on', 2475),
 ('man', 1728),
 ('with', 1317),
 ('for', 1298),
 ('at', 1171),
 ('woman', 1063),
 ('are', 1035),
 ('two', 820),
 ('that', 786),
 ('it', 710),
 ('dog', 695),
 ('said', 693),
 ('you', 683),
 ('playing', 660),
 ('an', 633),
 ('was', 626),
 ('as', 619),
 ('from', 605),
 ('us', 563),
 ('by', 560),
 ('white', 539),
 ('i', 525),
 ('not', 494)]

In [72]:
vocab_nostopwords = [w for w in global_vocab if w not in stopwords.words('english')]
counts_nostopwords = nltk.Counter(vocab_nostopwords)

In [73]:
counts_nostopwords.most_common(30)

[('man', 1728),
 ('woman', 1063),
 ('two', 820),
 ('dog', 695),
 ('said', 693),
 ('playing', 660),
 ('us', 563),
 ('white', 539),
 ('black', 478),
 ('people', 413),
 ('percent', 389),
 ('killed', 361),
 ('new', 349),
 ('water', 288),
 ('syria', 279),
 ('girl', 278),
 ('running', 273),
 ('boy', 267),
 ('person', 257),
 ('sitting', 247),
 ('riding', 246),
 ('one', 244),
 ('standing', 244),
 ('china', 241),
 ('guitar', 232),
 ('red', 232),
 ('police', 232),
 ('three', 229),
 ('brown', 227),
 ('cat', 221)]