# Exploratory Data Analysis

In [1]:
import csv
import pandas as pd

In [2]:
path_train = 'data/sts-train.csv'
path_dev = 'data/sts-dev.csv'
path_test = 'data/sts-test.csv'

In [3]:
columns=['genre', 'file', 'year', 'index', 'score', 'sentence1', 'sentence2']

In [4]:
df_train = pd.read_csv(path_train, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')
df_dev = pd.read_csv(path_dev, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')
df_test = pd.read_csv(path_test, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')

In [5]:
df = pd.concat([df_train, df_dev, df_test])
df

Unnamed: 0,genre,file,year,index,score,sentence1,sentence2
0,main-captions,MSRvid,2012test,1,5.00,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.80,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.80,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.60,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.
...,...,...,...,...,...,...,...
1374,main-news,headlines,2016,1354,0.00,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks
1375,main-news,headlines,2016,1360,1.00,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe..."
1376,main-news,headlines,2016,1368,1.00,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...
1377,main-news,headlines,2016,1420,0.00,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...


## Data cleaning

Check for the presence of null values:

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8628 entries, 0 to 1378
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   genre      8628 non-null   object 
 1   file       8628 non-null   object 
 2   year       8628 non-null   object 
 3   index      8628 non-null   int64  
 4   score      8628 non-null   float64
 5   sentence1  8628 non-null   object 
 6   sentence2  8628 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 539.2+ KB


There are no null values.

In [7]:
df['genre'].value_counts()

genre
main-news        4299
main-captions    3250
main-forums       629
main-forum        450
Name: count, dtype: int64

In [8]:
df['genre'] = df['genre'].replace('main-', '', regex=True)
df['genre'] = df['genre'].replace('forum', 'forums')

In [9]:
df['genre'].value_counts()

genre
news        4299
captions    3250
forums      1079
Name: count, dtype: int64

In [10]:
df['year'].value_counts()

year
2014         2250
2015         1875
2012test     1500
2012train    1500
2013          750
2016          503
2017          250
Name: count, dtype: int64

In [11]:
# Remove everything in this column which is not a number
df['year'] = df['year'].replace(r'\D', '', regex=True)

In [12]:
df['year'].value_counts()

year
2012    3000
2014    2250
2015    1875
2013     750
2016     503
2017     250
Name: count, dtype: int64