In [1]:
# --- Preprocessing ---
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
import string
from nltk.corpus import stopwords
import collections
import numpy as np

# --- Processing ---
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential#, Model, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, Dense, Dropout, Convolution1D, MaxPooling1D, SpatialDropout1D, Input 
from keras.layers import GlobalMaxPooling1D, concatenate, LSTM, Bidirectional
from keras.optimizers import Adam

# --- Postprocessing ---

Using TensorFlow backend.


In [3]:
# Loading the Dataset
df = pd.read_csv('1train_data.csv', 
                 encoding='ISO-8859-1')
pd.set_option('display.max_colwidth', -1)
df.tail()

Unnamed: 0,sentiment,content
29995,happiness,I had a great date last night...tried to find the CDCaves with Daniel it was HILARIOUSLY FUN!!!
29996,sadness,With alex
29997,happiness,@fureousangel that is comedy good luck my friend!
29998,fun,"stephs grad party gr8! shoved cake in her face, watchd sis bitch slap a boy, ate good food satisfied"
29999,happiness,@jesfive SWEEEEET - San Fran is awesome!!!! Love it there


In [5]:
# Targets
# 
# negative: 0, neutral: 1, positive: 2
le = LabelEncoder()
df['target'] = le.fit_transform(df['sentiment'])
df.tail()

Unnamed: 0,sentiment,content,target
29995,happiness,I had a great date last night...tried to find the CDCaves with Daniel it was HILARIOUSLY FUN!!!,5
29996,sadness,With alex,10
29997,happiness,@fureousangel that is comedy good luck my friend!,5
29998,fun,"stephs grad party gr8! shoved cake in her face, watchd sis bitch slap a boy, ate good food satisfied",4
29999,happiness,@jesfive SWEEEEET - San Fran is awesome!!!! Love it there,5


In [6]:
# Tweet Cleaning
# 
def clean_text(df,text_field):
    df[text_field] = df[text_field].str.lower()
    mystring = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
    # the first filter is capture everything eclosed that starts with an @
    # and that has any single character from the range A-Z or a-z or 0-9 with more
    # than one element inside of that
    # the second filter is capture everything that is not a number, a lower or upper
    # case letter after the last line
    return df[text_field].apply(lambda element: re.sub(mystring,"",element))

df['clean_tweet'] = clean_text(df,'content')
df.tail()

Unnamed: 0,sentiment,content,target,clean_tweet
29995,happiness,i had a great date last night...tried to find the cdcaves with daniel it was hilariously fun!!!,5,i had a great date last nighttried to find the cdcaves with daniel it was hilariously fun
29996,sadness,with alex,10,with alex
29997,happiness,@fureousangel that is comedy good luck my friend!,5,that is comedy good luck my friend
29998,fun,"stephs grad party gr8! shoved cake in her face, watchd sis bitch slap a boy, ate good food satisfied",4,stephs grad party gr8 shoved cake in her face watchd sis bitch slap a boy ate good food satisfied
29999,happiness,@jesfive sweeeeet - san fran is awesome!!!! love it there,5,sweeeeet san fran is awesome love it there


In [7]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()

In [8]:
df['tokenized'] = df['clean_tweet'].apply(lambda row: tokenize(row))
df.tail()

Unnamed: 0,sentiment,content,target,clean_tweet,tokenized
29995,happiness,i had a great date last night...tried to find the cdcaves with daniel it was hilariously fun!!!,5,i had a great date last nighttried to find the cdcaves with daniel it was hilariously fun,"[i, had, a, great, date, last, nighttried, to, find, the, cdcaves, with, daniel, it, was, hilariously, fun]"
29996,sadness,with alex,10,with alex,"[with, alex]"
29997,happiness,@fureousangel that is comedy good luck my friend!,5,that is comedy good luck my friend,"[that, is, comedy, good, luck, my, friend]"
29998,fun,"stephs grad party gr8! shoved cake in her face, watchd sis bitch slap a boy, ate good food satisfied",4,stephs grad party gr8 shoved cake in her face watchd sis bitch slap a boy ate good food satisfied,"[stephs, grad, party, gr8, shoved, cake, in, her, face, watchd, sis, bitch, slap, a, boy, ate, good, food, satisfied]"
29999,happiness,@jesfive sweeeeet - san fran is awesome!!!! love it there,5,sweeeeet san fran is awesome love it there,"[sweeeeet, san, fran, is, awesome, love, it, there]"


In [9]:
stop = set(stopwords.words('english'))
stop.update(['amp', 'rt', 'cc'])
stop = stop - set(['no', 'not'])

In [10]:
def remove_stopwords(row):
    return [t for t in row if t not in stop]

In [11]:
df['tokenized'] = df['tokenized'].apply(lambda row: remove_stopwords(row))
df.tail()

Unnamed: 0,sentiment,content,target,clean_tweet,tokenized
29995,happiness,i had a great date last night...tried to find the cdcaves with daniel it was hilariously fun!!!,5,i had a great date last nighttried to find the cdcaves with daniel it was hilariously fun,"[great, date, last, nighttried, find, cdcaves, daniel, hilariously, fun]"
29996,sadness,with alex,10,with alex,[alex]
29997,happiness,@fureousangel that is comedy good luck my friend!,5,that is comedy good luck my friend,"[comedy, good, luck, friend]"
29998,fun,"stephs grad party gr8! shoved cake in her face, watchd sis bitch slap a boy, ate good food satisfied",4,stephs grad party gr8 shoved cake in her face watchd sis bitch slap a boy ate good food satisfied,"[stephs, grad, party, gr8, shoved, cake, face, watchd, sis, bitch, slap, boy, ate, good, food, satisfied]"
29999,happiness,@jesfive sweeeeet - san fran is awesome!!!! love it there,5,sweeeeet san fran is awesome love it there,"[sweeeeet, san, fran, awesome, love]"


In [12]:
def update_vocab_counter(row):
    for word in row:
        vocab_counter[word] += 1
vocab_counter = collections.Counter()
df['tokenized'].apply(update_vocab_counter);
vocab = sorted(vocab_counter, key=vocab_counter.get, reverse=True)
max_words = 5000
w2id = {w:i for i, w in enumerate(vocab[:max_words])}
w2id['unk'] = 0
def transform_to_ids(row):
    return [w2id[w] if w in w2id else w2id['unk'] for w in row]
df['tokenized_int'] = df['tokenized'].apply(lambda x: transform_to_ids(x))
df.tail()

Unnamed: 0,sentiment,content,target,clean_tweet,tokenized,tokenized_int
29995,happiness,i had a great date last night...tried to find the cdcaves with daniel it was hilariously fun!!!,5,i had a great date last nighttried to find the cdcaves with daniel it was hilariously fun,"[great, date, last, nighttried, find, cdcaves, daniel, hilariously, fun]","[48, 633, 34, 0, 100, 0, 3970, 0, 64]"
29996,sadness,with alex,10,with alex,[alex],[2025]
29997,happiness,@fureousangel that is comedy good luck my friend!,5,that is comedy good luck my friend,"[comedy, good, luck, friend]","[2276, 4, 262, 177]"
29998,fun,"stephs grad party gr8! shoved cake in her face, watchd sis bitch slap a boy, ate good food satisfied",4,stephs grad party gr8 shoved cake in her face watchd sis bitch slap a boy ate good food satisfied,"[stephs, grad, party, gr8, shoved, cake, face, watchd, sis, bitch, slap, boy, ate, good, food, satisfied]","[0, 1823, 275, 2584, 0, 741, 414, 0, 1437, 1089, 4596, 454, 626, 4, 276, 0]"
29999,happiness,@jesfive sweeeeet - san fran is awesome!!!! love it there,5,sweeeeet san fran is awesome love it there,"[sweeeeet, san, fran, awesome, love]","[0, 1214, 0, 136, 19]"


In [13]:
lens = df['tokenized_int'].apply(lambda x: len(x))

In [14]:
min(lens), max(lens), np.mean(lens)

(0, 25, 7.495733333333333)

In [15]:
maxlen = 24

<h2> Processing </h2>

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df['tokenized_int'].values,
                                                    df['target'].values, 
                                                    test_size=0.25, 
                                                    random_state=0)

In [17]:
x_train = pad_sequences(X_train, maxlen=maxlen, value=0)
x_test = pad_sequences(X_test, maxlen=maxlen, value=0)

In [18]:
dummy_y = np_utils.to_categorical(y_train)
dummy_y_test = np_utils.to_categorical(y_test)

In [19]:
mydim = len(list(set(df['target'])))
print(mydim)
def conv_model():
    model = Sequential([Embedding(input_dim=max_words, output_dim=32, input_length=maxlen),
                        Convolution1D(10, mydim, padding='same', activation='relu'),
                        MaxPooling1D(),
                        Flatten(),
                        Dense(50, activation='relu'),
                        Dense(mydim, activation='softmax')])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

13


In [20]:
estimator = KerasClassifier(build_fn=conv_model, epochs=10, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [21]:
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [22]:
results.mean()*100, results.std()*100

(25.977919775026624, 0.5245573718838871)

In [23]:
results

array([0.26326304, 0.25005552, 0.26244445, 0.25861685, 0.26451613])