## Install necessary libraries

In [13]:
# Need this to read parquet file format into pandas dataframe
!pip install pyarrow

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow_p36/bin/python -m pip install --upgrade pip' command.[0m


In [14]:
!pip install imbalanced-learn

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow_p36/bin/python -m pip install --upgrade pip' command.[0m


## Import necessary libraries

In [16]:
import numpy as np
import pandas as pd
import boto3
import tensorflow as tf
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import time
import warnings

#from keras.preprocessing import sequence

In [2]:
# List the local compute devices reported by TensorFlow; the cell output below
# shows the device entries (CPU / GPU) that were found.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 17037567901941032771,
 name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 10537375238087818236
 physical_device_desc: "device: XLA_CPU device",
 name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 13540715296108806658
 physical_device_desc: "device: XLA_GPU device",
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 11338085172
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 10512267807107589793
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0, compute capability: 3.7"]

## Read the labeled data into pandas dataframe

In [35]:
# Read the labeled data into the pandas dataframe.
# The dataset is stored as eight snappy-compressed parquet part files under one
# S3 prefix; the part paths differ only in their zero-padded index, so build
# them from the index instead of spelling each one out.
PARQUET_PREFIX = 's3://bauka-big-tweets/tweets_2class.parquet'
PARQUET_RUN_ID = '0b07c410-f74a-444d-b425-a1e1bbcb1684-c000'
N_PARTS = 8

part_paths = [
    '%s/part-%05d-%s.snappy.parquet' % (PARQUET_PREFIX, part_idx, PARQUET_RUN_ID)
    for part_idx in range(N_PARTS)
]

# Read the parts in index order (0..7) and stack them row-wise; each part keeps
# its own row index, exactly as a plain pd.concat of the individual frames does.
labeled_data = pd.concat(
    [pd.read_parquet(part_path, engine='pyarrow') for part_path in part_paths]
)

In [19]:
# Load the tidy (lemmatized) tweets from S3 into a pandas dataframe.
# The JSON file was written in pandas' 'table' orientation, so it is read back the same way.
path = 's3://bauka-big-tweets/lemmatized.json'
client = boto3.client('s3')  # S3 client handle; it is not passed to read_json below
df = pd.read_json(path, orient='table')

In [21]:
# Select the text column that is tokenized below (per its name, the lemmatized
# tokens joined back into a string — TODO confirm), then display its type.
tweets = df['tokens_back_to_text']
type(tweets)

pandas.core.series.Series

## Pre-processing

### Feature space

In [99]:
# Fit the tokenizer on the full tweet corpus.
# No `num_words` cap is passed: the previous cap was `max_features`, which is only
# defined in a later cell (NameError on a fresh kernel) and, being
# len(word_index) + 1, never excluded any word anyway.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets)

In [100]:
# word_index is the tokenizer's word -> integer-id mapping built by fit_on_texts;
# its length is the number of distinct tokens seen in the corpus.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 65125 unique tokens.


In [101]:
# Vocabulary size plus one; used further down as the Embedding layer's input_dim.
# (Presumably the +1 leaves room for id 0, which padding uses — TODO confirm.)
max_features = len(word_index)+1

In [102]:
# Convert every tweet into its list of integer word ids.
sequences = tokenizer.texts_to_sequences(tweets)

In [120]:
# Fixed sequence length (in tokens) passed to pad_sequences and to the Embedding layer.
maxlen = 12

In [121]:
# Bring every id sequence to exactly `maxlen` entries, giving one fixed-width row per tweet.
data = pad_sequences(sequences, maxlen = maxlen)

### Target space

In [122]:
# Target vector: the 'prediction' column of the labeled parquet data.
# NOTE(review): assumes these rows are in the same order as `tweets` — confirm.
labels = labeled_data['prediction']

In [123]:
# Class balance before resampling (the output below shows a heavy skew towards class 0).
labels.value_counts()

0    407662
1       104
Name: prediction, dtype: int64

### SMOTE - balance the data set

In [124]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Fixed seed so the synthetic samples and the under-sampled subset are identical
# on every run; both samplers draw randomly when random_state is left unset.
RESAMPLE_SEED = 43

# define pipeline:
#   1. SMOTE over-samples the minority class to 30% of the majority class size
#   2. RandomUnderSampler then trims the majority class until minority/majority = 0.8
over = SMOTE(sampling_strategy=0.3, random_state=RESAMPLE_SEED)
under = RandomUnderSampler(sampling_strategy=0.8, random_state=RESAMPLE_SEED)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
# NOTE(review): resampling is done before the train/test split, so synthetic rows
# interpolated from a tweet can end up on both sides of the split — confirm this
# is intended, since it can inflate the reported test accuracy.
X, y = pipeline.fit_resample(data, labels)

In [125]:
# Count how many rows of each class are present after resampling.
from collections import Counter
counter = Counter(y)
print(counter)

Counter({0: 152872, 1: 122298})


In [126]:
# One-hot encoding of labels
# NOTE(review): this rebinds `y`, so running the cell a second time would encode
# the already-encoded array again — re-run the resampling cell first if this
# cell has to be executed again.
from keras.utils.np_utils import to_categorical
y = to_categorical(y)

In [127]:
# Sanity check: features and one-hot labels should have the same number of rows.
X.shape, y.shape

((275170, 12), (275170, 2))

### Split the dataset into training and test sets

In [128]:
# Shuffle the data set first.
# A seeded generator makes the permutation repeatable across kernel restarts;
# the bare np.random.shuffle call depended on the global, unseeded RNG state.
shuffle_rng = np.random.RandomState(43)
indices = shuffle_rng.permutation(X.shape[0])
# Apply the same permutation to features and labels so rows stay paired.
X = X[indices]
y = y[indices]

In [129]:
# Hold out half of the rows as the test set; stratifying on y keeps the class
# ratio the same in both halves, and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=43,
    test_size=0.5,
)

In [130]:
# Shapes of the four arrays produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((137585, 12), (137585, 12), (137585, 2), (137585, 2))

## Modeling - deep learning model

In [131]:
# SimpleRNN classifier: embedding -> four stacked SimpleRNN layers -> 2-way softmax.
warnings.filterwarnings("ignore")
model = Sequential()
# Map each of the `maxlen` word ids in a row to a 100-dimensional vector.
model.add(Embedding(input_dim=max_features, output_dim=100, input_length=maxlen))
# `units` is the Keras 2 name for the layer size (`output_dim` is the deprecated
# Keras 1 spelling). The first three layers return the full sequence so the next
# recurrent layer receives one vector per timestep; the last returns only its
# final state for the classifier head.
model.add(SimpleRNN(units=100, return_sequences=True))
model.add(SimpleRNN(units=100, return_sequences=True))
model.add(SimpleRNN(units=100, return_sequences=True))
model.add(SimpleRNN(units=100))
model.add(Dense(2, activation='softmax'))
# One-hot targets with a softmax output call for categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [132]:
# Train for three epochs; validation_split=0.6 holds out 60% of the training
# rows for per-epoch validation, so only the remaining 40% are used for fitting.
warnings.filterwarnings("ignore")
model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_split=0.6,
)
# Score the trained model on the held-out test set.
# scores[1] is the accuracy — the only metric requested when the model was compiled.
warnings.filterwarnings("ignore")
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Train on 55034 samples, validate on 82551 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 100.00%


In [135]:
# LSTM classifier: embedding -> single LSTM layer -> 2-way softmax.
warnings.filterwarnings("ignore")
model = Sequential()
# Map each of the `maxlen` word ids in a row to a 100-dimensional vector.
model.add(Embedding(input_dim=max_features, output_dim=100, input_length=maxlen))
# `units` is the Keras 2 name for the layer size (`output_dim` is the deprecated
# Keras 1 spelling).
model.add(LSTM(units=100))
model.add(Dense(2, activation='softmax'))
# One-hot targets with a softmax output call for categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [136]:
# Train for three epochs; validation_split=0.3 holds out 30% of the training
# rows for per-epoch validation.
warnings.filterwarnings("ignore")
model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_split=0.3,
)
# Score the trained model on the held-out test set.
# scores[1] is the accuracy — the only metric requested when the model was compiled.
warnings.filterwarnings("ignore")
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Train on 96309 samples, validate on 41276 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 100.00%


In [137]:
# Shell out to nvidia-smi to show the GPU's current memory use and utilisation.
!nvidia-smi

Tue Nov  3 20:38:38 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   49C    P0    57W / 149W |  10963MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------