In [1]:
# --- Preprocessing ---
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
import string
from nltk.corpus import stopwords
import collections
import numpy as np

# --- Processing ---
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential#, Model, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, Dense, Dropout, Convolution1D, MaxPooling1D, SpatialDropout1D, Input 
from keras.layers import GlobalMaxPooling1D, concatenate, LSTM, Bidirectional
from keras.optimizers import Adam

# --- Postprocessing ---

Using TensorFlow backend.


In [2]:
# Loading the Dataset
df = pd.read_csv('train_tweets.csv', 
                 usecols=['tweet', 'label'], 
                 encoding='ISO-8859-1')
pd.set_option('display.max_colwidth', -1)
df.tail()

Unnamed: 0,label,tweet
31957,0,ate @user isz that youuu?Ã°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ¢ÂÂ¤Ã¯Â¸Â
31958,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm. #shame #imwithher
31959,0,listening to sad songs on a monday morning otw to work is sad
31960,1,"@user #sikh #temple vandalised in in #calgary, #wso condemns act"
31961,0,thank you @user for you follow


In [3]:
# Targets
# 
# negative: 0, neutral: 1, positive: 2
le = LabelEncoder()
df['target'] = le.fit_transform(df['label'])
df.tail()

Unnamed: 0,label,tweet,target
31957,0,ate @user isz that youuu?Ã°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ°ÂÂÂÃ¢ÂÂ¤Ã¯Â¸Â,0
31958,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm. #shame #imwithher,0
31959,0,listening to sad songs on a monday morning otw to work is sad,0
31960,1,"@user #sikh #temple vandalised in in #calgary, #wso condemns act",1
31961,0,thank you @user for you follow,0


In [4]:
# Tweet Cleaning
# 
def clean_text(df,text_field):
    df[text_field] = df[text_field].str.lower()
    mystring = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
    # the first filter is capture everything eclosed that starts with an @
    # and that has any single character from the range A-Z or a-z or 0-9 with more
    # than one element inside of that
    # the second filter is capture everything that is not a number, a lower or upper
    # case letter after the last line
    return df[text_field].apply(lambda element: re.sub(mystring,"",element))

df['clean_tweet'] = clean_text(df,'tweet')
df.tail()

Unnamed: 0,label,tweet,target,clean_tweet
31957,0,ate @user isz that youuu?ã°âââã°âââã°âââã°âââã°âââã°âââã°âââã°âââã°âââã¢ââ¤ã¯â¸â,0,ate isz that youuu
31958,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm. #shame #imwithher,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm shame imwithher
31959,0,listening to sad songs on a monday morning otw to work is sad,0,listening to sad songs on a monday morning otw to work is sad
31960,1,"@user #sikh #temple vandalised in in #calgary, #wso condemns act",1,sikh temple vandalised in in calgary wso condemns act
31961,0,thank you @user for you follow,0,thank you for you follow


In [5]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()

In [6]:
df['tokenized'] = df['clean_tweet'].apply(lambda row: tokenize(row))
df.tail()

Unnamed: 0,label,tweet,target,clean_tweet,tokenized
31957,0,ate @user isz that youuu?ã°âââã°âââã°âââã°âââã°âââã°âââã°âââã°âââã°âââã¢ââ¤ã¯â¸â,0,ate isz that youuu,"[ate, isz, that, youuu]"
31958,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm. #shame #imwithher,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm shame imwithher,"[to, see, nina, turner, on, the, airwaves, trying, to, wrap, herself, in, the, mantle, of, a, genuine, hero, like, shirley, chisolm, shame, imwithher]"
31959,0,listening to sad songs on a monday morning otw to work is sad,0,listening to sad songs on a monday morning otw to work is sad,"[listening, to, sad, songs, on, a, monday, morning, otw, to, work, is, sad]"
31960,1,"@user #sikh #temple vandalised in in #calgary, #wso condemns act",1,sikh temple vandalised in in calgary wso condemns act,"[sikh, temple, vandalised, in, in, calgary, wso, condemns, act]"
31961,0,thank you @user for you follow,0,thank you for you follow,"[thank, you, for, you, follow]"


In [7]:
stop = set(stopwords.words('english'))
stop.update(['amp', 'rt', 'cc'])
stop = stop - set(['no', 'not'])

In [8]:
def remove_stopwords(row):
    return [t for t in row if t not in stop]

In [9]:
df['tokenized'] = df['tokenized'].apply(lambda row: remove_stopwords(row))
df.tail()

Unnamed: 0,label,tweet,target,clean_tweet,tokenized
31957,0,ate @user isz that youuu?ã°âââã°âââã°âââã°âââã°âââã°âââã°âââã°âââã°âââã¢ââ¤ã¯â¸â,0,ate isz that youuu,"[ate, isz, youuu]"
31958,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm. #shame #imwithher,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm shame imwithher,"[see, nina, turner, airwaves, trying, wrap, mantle, genuine, hero, like, shirley, chisolm, shame, imwithher]"
31959,0,listening to sad songs on a monday morning otw to work is sad,0,listening to sad songs on a monday morning otw to work is sad,"[listening, sad, songs, monday, morning, otw, work, sad]"
31960,1,"@user #sikh #temple vandalised in in #calgary, #wso condemns act",1,sikh temple vandalised in in calgary wso condemns act,"[sikh, temple, vandalised, calgary, wso, condemns, act]"
31961,0,thank you @user for you follow,0,thank you for you follow,"[thank, follow]"


In [10]:
def update_vocab_counter(row):
    for word in row:
        vocab_counter[word] += 1
vocab_counter = collections.Counter()
df['tokenized'].apply(update_vocab_counter);
vocab = sorted(vocab_counter, key=vocab_counter.get, reverse=True)
max_words = 5000
w2id = {w:i for i, w in enumerate(vocab[:max_words])}
w2id['unk'] = 0
def transform_to_ids(row):
    return [w2id[w] if w in w2id else w2id['unk'] for w in row]
df['tokenized_int'] = df['tokenized'].apply(lambda x: transform_to_ids(x))
df.tail()

Unnamed: 0,label,tweet,target,clean_tweet,tokenized,tokenized_int
31957,0,ate @user isz that youuu?ã°âââã°âââã°âââã°âââã°âââã°âââã°âââã°âââã°âââã¢ââ¤ã¯â¸â,0,ate isz that youuu,"[ate, isz, youuu]","[2247, 0, 0]"
31958,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm. #shame #imwithher,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm shame imwithher,"[see, nina, turner, airwaves, trying, wrap, mantle, genuine, hero, like, shirley, chisolm, shame, imwithher]","[20, 0, 0, 0, 386, 3725, 0, 2864, 1043, 8, 0, 0, 909, 3650]"
31959,0,listening to sad songs on a monday morning otw to work is sad,0,listening to sad songs on a monday morning otw to work is sad,"[listening, sad, songs, monday, morning, otw, work, sad]","[949, 59, 1161, 207, 50, 0, 27, 59]"
31960,1,"@user #sikh #temple vandalised in in #calgary, #wso condemns act",1,sikh temple vandalised in in calgary wso condemns act,"[sikh, temple, vandalised, calgary, wso, condemns, act]","[1408, 1173, 1446, 1261, 1447, 1448, 552]"
31961,0,thank you @user for you follow,0,thank you for you follow,"[thank, follow]","[92, 85]"


In [11]:
lens = df['tokenized_int'].apply(lambda x: len(x))

In [12]:
min(lens), max(lens), np.mean(lens)

(0, 24, 7.931637569613916)

In [23]:
maxlen = 24

<h2> Processing </h2>

In [24]:
X_train, X_test, y_train, y_test = train_test_split(df['tokenized_int'].values,
                                                    df['target'].values, 
                                                    test_size=0.25, 
                                                    random_state=0)

In [25]:
x_train = pad_sequences(X_train, maxlen=maxlen, value=0)
x_test = pad_sequences(X_test, maxlen=maxlen, value=0)

In [26]:
dummy_y = np_utils.to_categorical(y_train)
dummy_y_test = np_utils.to_categorical(y_test)

In [37]:
mydim = len(list(set(df['target'])))
print(mydim)
def conv_model():
    model = Sequential([Embedding(input_dim=max_words, output_dim=32, input_length=maxlen),
                        Convolution1D(10, mydim, padding='same', activation='relu'),
                        MaxPooling1D(),
                        Flatten(),
                        Dense(50, activation='relu'),
                        Dense(mydim, activation='softmax')])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

2


In [38]:
estimator = KerasClassifier(build_fn=conv_model, epochs=10, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [39]:
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

Instructions for updating:
Use tf.cast instead.


In [40]:
results.mean()*100, results.std()*100

(94.57260587432825, 0.1998905012979528)

In [41]:
results

array([0.9464025 , 0.94263663, 0.94472257, 0.94618273, 0.94868586])