In [1]:
import pandas as pd
import numpy as np

In [2]:
# Integer ids for the punctuation classes (NOTHING=0, COMMA=1, PERIOD=2, QUESTION=3)
LABEL_NOTHING, LABEL_COMMA, LABEL_PERIOD, LABEL_QUESTION = range(4)

# Lookup table from the textual punctuation tag to its integer id
punctEncode = dict(
    [
        ('O', LABEL_NOTHING),
        ('COMMA', LABEL_COMMA),
        ('PERIOD', LABEL_PERIOD),
        ('QUESTION', LABEL_QUESTION),
    ]
)

In [3]:
# Load the data from the dataset into a dataframe
def loadData(path):
    """Read a tab-separated word/tag file into a DataFrame.

    Every non-blank line is split on tabs; the second field is looked up in
    ``punctEncode`` and replaced by its integer id.

    Parameters
    ----------
    path : str
        Location of the dataset file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, with columns ``word`` and ``token``.

    Raises
    ------
    IndexError
        If a non-blank line contains no tab separator.
    KeyError
        If a line's tag is not a key of ``punctEncode``.
    """
    data = []
    with open(path, "rb") as file:
        for line in file:
            # Undecodable bytes are dropped (errors='ignore'). The terminator is
            # stripped whether it is \r\n or a bare \n, so the tag never keeps a
            # stray newline that would miss the punctEncode lookup.
            text = line.decode('utf-8', errors='ignore').rstrip('\r\n')

            # Blank lines (e.g. a trailing empty line) carry no record.
            if not text.strip():
                continue

            row = text.split('\t')
            row[1] = punctEncode[row[1]]
            data.append(row)

    # return as dataframe
    df = pd.DataFrame(data=data, columns=["word", "token"])
    return df

def printDataStats(df):
    """Print a short summary of a word/token DataFrame.

    Emits the row count, the frequency of each value in the ``token``
    column, and then hands the first 12 rows to ``display``.
    """
    print("Number of words in dataset:", df.shape[0])
    print()

    print("Token frequency:")
    print(df["token"].value_counts())
    print()

    print("Head of data:")
    preview = df.head(12)
    display(preview)


In [4]:
def _loadAndReport(path, banner):
    """Load one dataset file, print its banner and summary, and return the frame."""
    frame = loadData(path)
    print(banner)
    printDataStats(frame)
    return frame


traindf = _loadAndReport("IWSLTche/RAW/train2012", "--------TRAINING DATA--------")

devdf = _loadAndReport("IWSLTche/RAW/dev2012", "--------DEVELOPMENT DATA--------")

testrefdf = _loadAndReport("IWSLTche/RAW/test2011", "--------TEST DATA (REF)--------")

testasrdf = _loadAndReport("IWSLTche/RAW/test2011asr", "--------TEST DATA (ASR)--------")

--------TRAINING DATA--------
Number of words in dataset: 2102417

Token frequency:
0    1801727
1     158392
2     132393
3       9905
Name: token, dtype: int64

Head of data:


Unnamed: 0,word,token
0,it,0
1,can,0
2,be,0
3,a,0
4,very,0
5,complicated,0
6,thing,1
7,the,0
8,ocean,2
9,and,0


--------DEVELOPMENT DATA--------
Number of words in dataset: 295800

Token frequency:
0    252922
1     22451
2     18910
3      1517
Name: token, dtype: int64

Head of data:


Unnamed: 0,word,token
0,adrian,0
1,kohler,1
2,well,1
3,we,0
4,'re,0
5,here,0
6,today,0
7,to,0
8,talk,0
9,about,0


--------TEST DATA (REF)--------
Number of words in dataset: 12626

Token frequency:
0    10943
1      830
2      807
3       46
Name: token, dtype: int64

Head of data:


Unnamed: 0,word,token
0,i,0
1,'m,0
2,a,0
3,savant,1
4,or,0
5,more,0
6,precisely,1
7,a,0
8,high-functioning,0
9,autistic,0


--------TEST DATA (ASR)--------
Number of words in dataset: 12822

Token frequency:
0    11180
2      809
1      798
3       35
Name: token, dtype: int64

Head of data:


Unnamed: 0,word,token
0,i,0
1,'m,0
2,as,0
3,a,0
4,font,1
5,or,0
6,more,0
7,precisely,1
8,a,0
9,high-functioning,0
