# Imports

In [1]:
import nbimporter
import Functions
import numpy as np
import pandas as pd
import re

Importing Jupyter notebook from Functions.ipynb


In [2]:
# Read the tab-separated dialogue dataset and preview its first rows.
data = pd.read_csv('bert_form_data.tsv', delimiter='\t')
data.head(5)

Unnamed: 0,id,label,alpha,text
0,0,3,a,all right jim . your quarterlies look very goo...
1,1,2,a,"oh , i told you . i couldn't close it . so . . ."
2,2,3,a,so you've come to the master for guidance ? is...
3,3,2,a,"actually , you called me in here , but yeah ."
4,4,3,a,"all right . well , let me show you how it's do..."


# Convert sentences to GloVe

In [3]:
# Load the pre-trained GloVe vectors from 'glove.6B.50d.txt' using the helper
# defined in Functions.ipynb. `glove` is indexed by word further down
# (glove[word]); presumably a word -> vector mapping -- confirm against
# Functions.loadGloveModel.
glove = Functions.loadGloveModel('glove.6B.50d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [4]:
# Length of the placeholder vector used for out-of-vocabulary tokens; it matches
# the 50-dimensional file loaded above ('glove.6B.50d.txt').
GLOVE_DIM = 50

# Build, for every utterance, a list with one vector per whitespace-separated
# token. The text is lower-cased before the lookup.
glovesentences = []
text = data['text']
for line in text:
    glovewords = []
    for word in line.lower().split():
        try:
            rep = glove[word]
        except KeyError:
            # Only a missing vocabulary entry falls back to the zero vector;
            # any other failure (e.g. `glove` not loaded) now surfaces instead
            # of being silently swallowed by a bare `except`.
            # NOTE(review): assumes `glove` raises KeyError for unknown words
            # (dict-like) -- confirm against Functions.loadGloveModel.
            rep = [0] * GLOVE_DIM
        glovewords.append(rep)
    glovesentences.append(glovewords)

In [5]:
# Attach the per-token vectors to the raw frame, then one-hot encode the label.
data['glove'] = glovesentences
hotmatrix = pd.get_dummies(data['label'])

# Keep only the columns needed downstream and append the one-hot columns,
# aligning both pieces on a fresh 0..n-1 index.
cleaned = pd.concat(
    [
        data[['label', 'text', 'glove']].reset_index(drop=True),
        hotmatrix.reset_index(drop=True),
    ],
    axis=1,
)
cleaned.head()

Unnamed: 0,label,text,glove,0,1,2,3,4
0,3,all right jim . your quarterlies look very goo...,"[[0.19253, 0.10006, 0.063798, -0.087664, 0.522...",0,0,0,1,0
1,2,"oh , i told you . i couldn't close it . so . . .","[[-0.070292, 1.6078, 0.64854, -0.4591, -0.1615...",0,0,1,0,0
2,3,so you've come to the master for guidance ? is...,"[[0.60308, -0.32024, 0.088857, -0.55176, 0.531...",0,0,0,1,0
3,2,"actually , you called me in here , but yeah .","[[0.42079, -0.12738, 0.36681, -0.57293, 0.7336...",0,0,1,0,0
4,3,"all right . well , let me show you how it's do...","[[0.19253, 0.10006, 0.063798, -0.087664, 0.522...",0,0,0,1,0


# Split train and test sets

In [6]:
# One frame per speaker, selected by the integer label (0-4).
label_column = cleaned['label']
andy = cleaned[label_column == 0]
dwight = cleaned[label_column == 1]
jim = cleaned[label_column == 2]
michael = cleaned[label_column == 3]
pam = cleaned[label_column == 4]

# Report the class sizes and confirm they add up to the full dataset.
print(f"Andy size: {len(andy)}")
print(f"Dwight size: {len(dwight)}")
print(f"Jim size: {len(jim)}")
print(f"Michael size: {len(michael)}")
print(f"Pam size: {len(pam)}")
total_size = sum(len(frame) for frame in (andy, dwight, jim, michael, pam))
print(f"total size: {total_size}")

Andy size: 3897
Dwight size: 7030
Jim size: 6531
Michael size: 11179
Pam size: 5155
total size: 33792


In [7]:
# Fixed seed so the train/test assignment is identical on every
# Restart & Run All; change SEED to draw a different split.
SEED = 42
TRAIN_FRACTION = 0.8
split_rng = np.random.default_rng(SEED)


def split_frame(frame, train_fraction=TRAIN_FRACTION, rng=split_rng):
    """Randomly split `frame` row-wise into a (train, test) pair.

    Each row is assigned to the training part independently with probability
    `train_fraction`, so the resulting sizes are only approximately
    `train_fraction` / `1 - train_fraction` of the input.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to split.
    train_fraction : float
        Probability that a row goes to the training part.
    rng : numpy.random.Generator
        Source of randomness; defaults to the seeded notebook-level generator.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The training rows and the remaining test rows, in original order.
    """
    msk = rng.random(len(frame)) < train_fraction
    return frame[msk], frame[~msk]


# Split each speaker separately so every class keeps roughly the same
# train/test ratio.
atrain, atest = split_frame(andy)
dtrain, dtest = split_frame(dwight)
jtrain, jtest = split_frame(jim)
mtrain, mtest = split_frame(michael)
ptrain, ptest = split_frame(pam)

In [8]:
# Gather the per-speaker pieces, then stack them into the final sets.
trainframes = [atrain, dtrain, jtrain, mtrain, ptrain]
testframes = [atest, dtest, jtest, mtest, ptest]

train = pd.concat(trainframes)
test = pd.concat(testframes)

# Sanity check: the two sets together should account for every row.
print(f"{len(test) + len(train)}")

33792


In [9]:
# Preview the first rows of the test set. The per-speaker frames were
# concatenated in order (Andy first), so these rows all carry label 0.
test.head()

Unnamed: 0,label,text,glove,0,1,2,3,4
5240,0,"hey , big tuna . you ever google google ? what...","[[-0.7001, 0.36781, 0.34424, -0.42318, -0.0460...",1,0,0,0,0
5544,0,are you playing for the other team ? !,"[[0.96193, 0.012516, 0.21733, -0.06539, 0.2684...",1,0,0,0,0
5546,0,saboteur ! i'm going to kill you for real . th...,"[[0.54062, -0.73204, -0.055612, -0.76395, 0.45...",1,0,0,0,0
5910,0,i don't care 'bout anything but you . . . what...,"[[0.11891, 0.15255, -0.082073, -0.74144, 0.759...",1,0,0,0,0
6068,0,"seriously , you guys , ask me .","[[0.24542, -0.74245, 0.32527, -0.53353, 0.1379...",1,0,0,0,0


In [10]:
# Preview the first rows of the training set. The per-speaker frames were
# concatenated in order (Andy first), so these rows all carry label 0.
train.head()

Unnamed: 0,label,text,glove,0,1,2,3,4
5157,0,"hey , big tuna ! you're single right ?","[[-0.7001, 0.36781, 0.34424, -0.42318, -0.0460...",1,0,0,0,0
5159,0,she's pretty hot huh ? she's completely crazy ...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1,0,0,0,0
5161,0,ooook .,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1,0,0,0,0
5188,0,ok . who put my calculator in jello ? good one...,"[[-0.53646, -0.072432, 0.24182, 0.099021, 0.18...",1,0,0,0,0
5243,0,i didn't mean you should do it . you were supp...,"[[0.11891, 0.15255, -0.082073, -0.74144, 0.759...",1,0,0,0,0
