In [46]:
import re
import pandas as pd
import spacy
from keras.utils import np_utils

# Load the spaCy English pipeline once; get_space_tokens reuses this module-level object.
# NOTE(review): the 'en' shortcut name is a spaCy 2.x convention; spaCy 3 expects a full
# package name (e.g. 'en_core_web_sm') -- confirm the environment before upgrading.
nlp = spacy.load('en')

# Borrowed from @PihtaHorse

In [47]:
def remove_characters_from_string(s, old_characters_list=['\n', '\t'], new_characters_list=None):
    """Replace each substring in ``old_characters_list`` with its counterpart.

    Replacements are applied one after another, in list order, so a later
    pair sees the output of the earlier ones.

    Args:
        s: The string to process.
        old_characters_list: Substrings to look for.
        new_characters_list: Replacement for each entry of
            ``old_characters_list``. When omitted or empty, every match is
            replaced by a single space.

    Returns:
        The string with all replacements applied.
    """
    if new_characters_list:
        replacements = new_characters_list
    else:
        # No explicit replacements given: map every target to one space.
        replacements = [' '] * len(old_characters_list)

    result = s
    for target, substitute in zip(old_characters_list, replacements):
        result = result.replace(target, substitute)
    return result

def remove_extra_spaces(s):
    """Collapse every run of whitespace in ``s`` to one space and trim both ends."""
    words = s.split()  # split() with no argument drops empty pieces and edge whitespace
    return ' '.join(words)

In [48]:
# A dollar sign followed by one or more digits, dots or commas ("$5", "$19.99", "$1,000").
# The digit class must cover 0-9: with 0-8 an amount such as "$9" was never matched and
# "$19.99" was cut after "$1", leaving "9.99" in the text.
money_regex = r'\$[0-9.,]+'

def money_replace(s):
    """Replace every dollar amount in ``s`` with a placeholder token.

    Args:
        s: The string to process.

    Returns:
        ``s`` with each match of ``money_regex`` replaced by the literal
        token ``' money_regex '`` (padded with spaces so it stands alone as
        a word once whitespace is normalized).
    """
    return re.sub(money_regex, ' money_regex ', s)

In [49]:
# Two or more consecutive dots.
dots_regex = r'\.[.]+'

def dots_replace(s):
    """Normalize every run of two or more dots in ``s`` to a spaced ``' ... '`` token."""
    ellipsis_token = ' ... '
    pattern = re.compile(dots_regex)
    return pattern.sub(ellipsis_token, s)

In [50]:
def clear_string(s):
    """Run the full text clean-up pipeline on ``s``.

    Steps, in order:
      1. Substring replacements: the two-character sequences backslash-n and
         backslash-quote and bare double quotes become spaces; remaining
         backslashes are dropped; hyphens, parentheses and slashes are
         surrounded by spaces.
      2. ``money_replace``
      3. ``dots_replace``
      4. ``remove_extra_spaces``

    Returns:
        The cleaned string.
    """
    # Order matters: the escaped sequences must be handled before the lone
    # backslash and the bare double quote are processed.
    targets = ['\\n', '\\"', '-', '"', "\\", ')', '(', '/']
    substitutes = [' ', ' ', ' - ', ' ', '', ' ) ', ' ( ', ' / ']

    cleaned = remove_characters_from_string(s, targets, substitutes)
    for step in (money_replace, dots_replace, remove_extra_spaces):
        cleaned = step(cleaned)
    return cleaned

In [51]:
def get_space_tokens(s):
    """Clean ``s`` with ``clear_string`` and return the result of ``nlp`` on the cleaned text."""
    return nlp(clear_string(s))

def text_from_spacy_tokens(tokens):
    """Join the ``lower_`` attribute of every token into one space-separated string."""
    return ' '.join(token.lower_ for token in tokens)

In [52]:
# Both splits are tab-separated files indexed by their 'Id' column.
read_options = {'sep': '\t', 'index_col': 'Id'}
train = pd.read_csv('../data/train.data', **read_options)
test = pd.read_csv('../data/test.data', **read_options)

In [53]:
# Preview the first rows of the training frame as loaded from disk.
train.head()

Unnamed: 0_level_0,Sentiment,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,Incredibly disappointing service. I mean reall...
1,2,I'm not really sure what just happened to me. ...
2,4,Yum yum. My bf and I came here to celebrate ou...
3,4,I wish Fujiya was closer to me now that I'm no...
4,4,I work down the street from this location and ...


In [54]:
# Rewrite the Text column of both splits: clean + tokenize each review, then
# join the lower-cased tokens back into a single space-separated string.
for frame in (train, test):
    frame['Text'] = frame['Text'].apply(get_space_tokens).apply(text_from_spacy_tokens)

In [55]:
# Preview the training frame after its Text column was tokenized and lower-cased.
train.head()

Unnamed: 0_level_0,Sentiment,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,incredibly disappointing service . i mean real...
1,2,i 'm not really sure what just happened to me ...
2,4,yum yum . my bf and i came here to celebrate o...
3,4,i wish fujiya was closer to me now that i 'm n...
4,4,i work down the street from this location and ...


In [56]:
# One-hot encode the Sentiment labels and append one indicator column per rating.
# The loop below reads columns 1..5 of the encoded matrix, so labels are assumed to be
# the integers 1-5 (column 0 is unused) -- TODO confirm against the dataset.
max_rating = 5

# Pin the matrix width explicitly. Without num_classes, to_categorical sizes the matrix
# from the largest label actually present, and the loop would raise IndexError whenever
# the highest rating is missing from the data.
Y_train = np_utils.to_categorical(train.Sentiment, num_classes=max_rating + 1)

for i in range(1, max_rating + 1):
    train.insert(loc=len(train.columns), column=str(i), value=Y_train[:, i])

In [58]:
# Preview the training frame with the one-hot rating columns '1'..'5' appended.
train.head()

Unnamed: 0_level_0,Sentiment,Text,1,2,3,4,5
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,incredibly disappointing service . i mean real...,1.0,0.0,0.0,0.0,0.0
1,2,i 'm not really sure what just happened to me ...,0.0,1.0,0.0,0.0,0.0
2,4,yum yum . my bf and i came here to celebrate o...,0.0,0.0,0.0,1.0,0.0
3,4,i wish fujiya was closer to me now that i 'm n...,0.0,0.0,0.0,1.0,0.0
4,4,i work down the street from this location and ...,0.0,0.0,0.0,1.0,0.0


In [60]:
# Persist the preprocessed splits next to the raw data.
for frame, destination in ((train, '../data/spacy_train.csv'),
                           (test, '../data/spacy_test.csv')):
    frame.to_csv(destination)