In [1]:
import nltk
#nltk.download() #check that everything is installed on the PC and uptodate

In [2]:
# toggles
# stack_to_save is for saving the stack model classifier or Training Model once it is run
# stack_to_load is for using a prior saved stack classifier or Training Model instead of training it again

stack_to_save = 1 # must be 0 or 1
stack_to_load = 0 # must be 0 or 1, cant be same as stack_to_save


In [3]:
# pandas dataframes to hold the tweets
import pandas as pd
import numpy as np
from IPython.display import display

%matplotlib inline
pd.options.display.max_columns = None

# use nltk for the natural language processing
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import pickle

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

# deep learning
import keras
from keras import layers
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, GlobalAveragePooling1D
from keras.models import load_model


# # filter out all warnings, leave this commented out, interested to see the warnings
# import warnings
# warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [4]:
# use random_state of 0 to stay consistent when training models are run
# and therefore able to detect changes in the output
random_state = 0

In [5]:
# check on the panda version and its dependencies
# i run this from time to time to ensure all is up to date
pd.__version__
#pd.show_versions()

'0.25.3'

In [6]:
# upload the training data into a dataframe, kaggle https://www.kaggle.com/kazanova/sentiment140
# add the column names
df_train = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = "ISO-8859-1",header=None)
df_train.columns = ["target", "id", "date", "flag", "user", "text"]

In [7]:
#
# exploring the training data
#

In [8]:
df_train.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
# there is no missing data
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
target    1600000 non-null int64
id        1600000 non-null int64
date      1600000 non-null object
flag      1600000 non-null object
user      1600000 non-null object
text      1600000 non-null object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [10]:
# target: 0 = negative, 4 = positive
df_train['target'].unique()

array([0, 4], dtype=int64)

In [11]:
# seems there is exactly 800k positive and 800k negative tweets, total tweets 1.6m
# i will change the 4 into a 1 later before i train the model
df_train['target'].value_counts()

4    800000
0    800000
Name: target, dtype: int64

In [12]:
#
# set up training for bag of words as a first try
#

In [13]:
# as a first step, i will need to clean the text
# step 1, clean each tweet to get rid of labels, @names, urls and put into lower case
# Step 2, filter out the stopwords and stem the rest
# step 3, convert the words to numbers for use in the training model
# step 4, pull out X and y for the model and pad them to make them all the same length
# step 5, split the data into training and testing date
# step 6, build the model
# step 7, train the model
# step 8, evaluate the model


In [14]:
with pd.option_context('display.max_colwidth', 140):
    print(df_train['text'][0:5])

0    @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
1        is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
2                              @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
3                                                                        my whole body feels itchy and like its on fire 
4        @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
Name: text, dtype: object


In [15]:
# step 1

text_cleaned = []

for i in range(len(df_train.index)):
    tweet_to_clean = df_train['text'][i]
    tweet_to_clean = re.sub(r'#([^\s]+)', r'\1', tweet_to_clean) # all #hashtag goes to hashtag
    tweet_to_clean = re.sub('@[^\s]+','', tweet_to_clean) # all @name deleted
    tweet_to_clean = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet_to_clean) # all urls deleted
    tweet_to_clean = re.sub('[^a-zA-z0-9\s]','',tweet_to_clean) # punctuation and special characters now cleared
    tweet_to_clean = re.sub('[\s]+', ' ', tweet_to_clean) # converts all the resulting whitespace to one space only
    tweet_to_clean = tweet_to_clean.lower() # convert all to lower case
    
    text_cleaned.append(tweet_to_clean)

df_train['text_cleaned'] = text_cleaned # new dataframe column with the cleaned text

In [16]:
with pd.option_context('display.max_colwidth', 140):
    print (df_train['text_cleaned'][0:5])


0                                      awww thats a bummer you shoulda got david carr of third day to do it d
1    is upset that he cant update his facebook by texting it and might cry as a result school today also blah
2                                i dived many times for the ball managed to save 50 the rest go out of bounds
3                                                             my whole body feels itchy and like its on fire 
4                      no its not behaving at all im mad why am i here because i cant see you all over there 
Name: text_cleaned, dtype: object


In [17]:
# i will also need to change the column headers in case they conflict with the feature columns later
# i.e. if a feature word is 'text' or 'date' or 'flag' etc
# add'__' to each column header

for i in range(6):
    df_train.rename(columns={df_train.columns[i]: df_train.columns[i]+'__'}, inplace=True)
    
df_train.head()

Unnamed: 0,target__,id__,date__,flag__,user__,text__,text_cleaned
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww thats a bummer you shoulda got david car...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sa...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i he...


In [18]:
# step 2

stop_words = stopwords.words("english")
stemmer = PorterStemmer()

In [19]:
tweets_filtStem = []

for i in range(len(df_train.index)):
    tweets_toFilter = df_train['text_cleaned'][i]
    words = word_tokenize(tweets_toFilter)

    new_tweet = []
    for w in words:
        if w not in stop_words:
            new_tweet.append(stemmer.stem(w))
    tweets_filtStem.append(new_tweet)

df_train['text_stopwordsRemoved'] = tweets_filtStem # new dataframe column with the filtered text

In [20]:
with pd.option_context('display.max_colwidth', 140):
    print(df_train['text_stopwordsRemoved'][0:5])

0                            [awww, that, bummer, shoulda, got, david, carr, third, day]
1    [upset, cant, updat, facebook, text, might, cri, result, school, today, also, blah]
2                             [dive, mani, time, ball, manag, save, 50, rest, go, bound]
3                                                 [whole, bodi, feel, itchi, like, fire]
4                                                            [behav, im, mad, cant, see]
Name: text_stopwordsRemoved, dtype: object


In [21]:
df_train['text_stopwordsRemoved'].map(len).sum()

11697422

In [22]:
# step 3

max_features = 5000 # keep the 5000 most common words to use as features
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df_train['text_stopwordsRemoved'].values)
df_train['tokenized'] = tokenizer.texts_to_sequences(df_train['text_stopwordsRemoved'].values)


In [23]:
# and save the tokenizer for use when loading the model to use later

with open('tokenizer_keras_twitter.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# step 4

max_len = 70 # this should cover all tweets

X = df_train['tokenized']
X = pad_sequences(X, maxlen=max_len)

df_train['model_target'] = df_train['target__'].replace([4],[1])

y = df_train['model_target']


In [25]:
len(X[0]), X[0]

(70, array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,  399,   51, 1081, 3056,
           9,  720, 1671,    4]))

In [26]:
# step 5

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
X_train.shape,y_train.shape, X_test.shape,y_test.shape

((1200000, 70), (1200000,), (400000, 70), (400000,))

In [27]:
# step 6

#Define Keras sequential classifier
model = keras.Sequential()

In [28]:
# # 120 batch size, 5 epochs gets 52% accuracy

# batch_size = 120
# epochs = 5

# model.add(Dense(max_features, activation='relu', input_dim=X.shape[1]))
# model.add(Dropout(0.1))
# model.add(Dense(1, activation='sigmoid'))


In [29]:
# LSTM model
# from the Keras documentation
# gets 79% accuracy with 120 batch and 5 epochs.  32 batch = 79%, 10 epochs with 120 batch gets 79%

embed_dim = 128
#lstm_out = 20
batch_size = 120
epochs = 5

model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

model.add(Dense(1, activation='sigmoid'))

In [30]:
# # Convolution1D model - enhanced
# # gets 79% accuracy with 34 batch and 5 epochs

# # set parameters:
# max_features = max_features
# maxlen = maxlen
# batch_size = 34
# embedding_dims = 50
# filters = 250
# kernel_size = 3
# hidden_dims = 250
# #hidden_dims = 150

# epochs = 5

# # we start off with an efficient embedding layer which maps
# # our vocab indices into embedding_dims dimensions
# model.add(Embedding(max_features,
#                     embedding_dims,
#                     input_length=maxlen))
# model.add(Dropout(0.2))

# # we add a Convolution1D, which will learn filters
# # word group filters of size filter_length:
# model.add(Conv1D(filters,
#                  kernel_size,
#                  padding='valid',
#                  activation='relu',
#                  strides=1))

# model.add(Conv1D(filters, kernel_size, activation='relu'))

# model.add(MaxPooling1D(3))
# model.add(Conv1D(160, kernel_size, activation='relu'))
# model.add(Conv1D(160, kernel_size, activation='relu'))

# model.add(GlobalAveragePooling1D())

# # We project onto a single unit output layer, and squash it with a sigmoid:
# model.add(Dense(1))
# model.add(Activation('sigmoid'))


In [31]:
# # Convolution1D model
# # from the Keras documentation
# # gets 79% accuracy with 120 batch and 5 epochs

# # set parameters:
# max_features = max_features
# maxlen = maxlen
# batch_size = 120
# embedding_dims = 50
# filters = 250
# kernel_size = 3
# #hidden_dims = 250
# hidden_dims = 150

# epochs = 5

# # we start off with an efficient embedding layer which maps
# # our vocab indices into embedding_dims dimensions
# model.add(Embedding(max_features,
#                     embedding_dims,
#                     input_length=maxlen))
# model.add(Dropout(0.2))

# # we add a Convolution1D, which will learn filters
# # word group filters of size filter_length:
# model.add(Conv1D(filters,
#                  kernel_size,
#                  padding='valid',
#                  activation='relu',
#                  strides=1))
# # we use max pooling:
# model.add(GlobalMaxPooling1D())

# # We add a vanilla hidden layer:
# model.add(Dense(hidden_dims))
# model.add(Dropout(0.2))
# model.add(Activation('relu'))

# # We project onto a single unit output layer, and squash it with a sigmoid:
# model.add(Dense(1))
# model.add(Activation('sigmoid'))


In [32]:
# # Sentiment classification CNN-LSTM
# # from the Keras documentation
# # gets 70% accuracy with 120 batch and 5 epochs

# # Embedding
# max_features = max_features
# maxlen = max_len
# embedding_size = 128

# # Convolution
# kernel_size = 5
# filters = 64
# pool_size = 4

# # LSTM
# lstm_output_size = 70

# # Training
# batch_size = 120
# epochs = 5

# model.add(Embedding(max_features, embedding_size, input_length=maxlen))
# model.add(Dropout(0.25))
# model.add(Conv1D(filters,
#                  kernel_size,
#                  padding='valid',
#                  activation='relu',
#                  strides=1))
# model.add(MaxPooling1D(pool_size=pool_size))
# model.add(LSTM(lstm_output_size))
# model.add(Dense(1))
# model.add(Activation('sigmoid'))

In [33]:
# # fasttext for text classification
# # from the Keras documentation
# # batch size 32, 5 epochs gets 77% accuracy

# # Set parameters:
# # ngram_range = 2 will add bi-grams features
# ngram_range = 1
# max_features = 20000
# #maxlen = 400
# batch_size = 32
# embedding_dims = 50
# epochs = 5


# # we start off with an efficient embedding layer which maps
# # our vocab indices into embedding_dims dimensions
# model.add(Embedding(max_features,
#                     embedding_dims))

# # we add a GlobalAveragePooling1D, which will average the embeddings
# # of all words in the document
# model.add(GlobalAveragePooling1D())

# # We project onto a single unit output layer, and squash it with a sigmoid:
# model.add(Dense(1, activation='sigmoid'))


In [34]:
#summary report of classifier we have just built
print("Summary report of Keras classifier:") 
model.summary()

Summary report of Keras classifier:
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         640000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 771,713
Trainable params: 771,713
Non-trainable params: 0
_________________________________________________________________


In [35]:
# step 7

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [36]:
keras_result = model.fit(X_train, y_train, epochs = epochs, batch_size=batch_size)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [37]:
# Step 8

score = model.evaluate(X_test, y_test, batch_size=batch_size)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])


ACCURACY: 0.7938525080680847
LOSS: 0.4415608602821827


In [38]:
# save the model

if stack_to_save == 1:
    print('saving the keras model')
    model.save('keras_twitter_model.h5')
    print('keras model saved')
else:
    print('model not saved')


saving the keras model
keras model saved
