## Install necessary libraries

In [2]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.3.1-cp36-cp36m-manylinux2010_x86_64.whl (320.4 MB)
[K     |████████████████████████████    | 279.9 MB 132.7 MB/s eta 0:00:01    | 107.4 MB 136.1 MB/s eta 0:00:02

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 320.4 MB 21 kB/s 
[?25hCollecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 9.4 MB/s s eta 0:00:01
Collecting astunparse==1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting tensorboard<3,>=2.3.0
  Downloading tensorboard-2.3.0-py3-none-any.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 87.6 MB/s eta 0:00:01
[?25hCollecting tensorflow-estimator<2.4.0,>=2.3.0
  Downloading tensorflow_estimator-2.3.0-py2.py3-none-any.whl (459 kB)
[K     |████████████████████████████████| 459 kB 84.4 MB/s eta 0:00:01
[?25hCollecting termcolor>=1.1.0
  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
Collecting absl-py>=0.7.0
  Downloading absl_py-0.11.0-py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 116.3 MB/s eta 0:00:01
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting grpcio

In [3]:
!pip install keras

Collecting keras
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Installing collected packages: keras
Successfully installed keras-2.4.3
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


## Import necessary libraries

In [4]:
import numpy as np
import pandas as pd
import boto3
import tensorflow as tf
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
#from keras.layers.embeddings import Embedding
#from keras.preprocessing import sequence

## Read the tidy (lemmatized) data into pandas dataframe

In [5]:
# S3 location of the lemmatized tweets export.
path = 's3://bauka-big-tweets/lemmatized.json'
# S3 client handle (not passed to the read call below).
client = boto3.client('s3')
# Parse the JSON file, using the 'table' orient, into a pandas DataFrame.
df = pd.read_json(path, orient='table')

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,lemmatized,tokens_back_to_text
0,"[rudygiuliani, competing, town, halls, joke, s...","[rudygiuliani, compete, town, hall, joke, step...",rudygiuliani compete town hall joke stephanopo...
1,"[trump, machismo]","[trump, machismo]",trump machismo
2,"[briantylercohen, biden, like, encyclopedia, t...","[briantylercohen, biden, like, encyclopedia, t...",briantylercohen biden like encyclopedia tonigh...
3,"[bradleywhitford, yo, semites, qanon, hate, de...","[bradleywhitford, yo, semite, qanon, hate, dem...",bradleywhitford yo semite qanon hate democrat ...
4,"[actbrigitte, president, trump, debate]","[actbrigitte, president, trump, debate]",actbrigitte president trump debate


In [7]:
# Collect the 'tokens_back_to_text' column into a plain Python list of
# documents, which is the input format sklearn's CountVectorizer expects.
docs = list(df['tokens_back_to_text'])

In [8]:
# Number of documents collected (one per row of df).
len(docs)

407766

## Bag-of-words vectorization (CountVectorizer)

In [9]:
# Work on a small batch (the first 10,000 documents) of the large data set.
small_batch = docs[0:10000]
# Bag-of-words vectorization with scikit-learn's CountVectorizer. This yields
# raw term counts per document; it is not a word2vec embedding.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()  # instantiate the vectorizer with default settings
# Learn the vocabulary and build the document-term matrix in one pass
# (equivalent to fit() followed by transform() on the same data).
dtm = vect.fit_transform(small_batch)
dtm  # sparse document-term matrix: one row per document, one column per term

<10000x8239 sparse matrix of type '<class 'numpy.int64'>'
	with 96000 stored elements in Compressed Sparse Row format>

In [10]:
# Convert the sparse document-term matrix into a dense NumPy array; the
# KMeans and train/test split cells below take `tidy` as their input.
tidy = dtm.toarray()
type(tidy)

numpy.ndarray

In [11]:
# Shape is (documents, vocabulary terms).
tidy.shape

(10000, 8239)

## KMeans clustering - label the data 

In [12]:
# Cluster the dense count vectors into 5 groups; the resulting cluster ids
# are used as class labels by the cells that follow.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=43).fit(tidy)
labels = kmeans.predict(tidy)

In [13]:
# Check the container type returned by kmeans.predict.
type(labels)

numpy.ndarray

In [14]:
# There should be one cluster id per document in the batch.
len(labels)

10000

In [15]:
# One-hot encode the integer cluster ids for use with categorical cross-entropy.
# `to_categorical` is imported from its public location, `keras.utils`; the
# private `keras.utils.np_utils` module path is not available in newer Keras
# releases.
from keras.utils import to_categorical
# Encode only while `labels` is still the 1-D vector of cluster ids, so that
# re-running this cell does not one-hot encode the already-encoded matrix again.
if labels.ndim == 1:
    labels = to_categorical(labels)

In [16]:
# Confirm the encoded labels are still a NumPy array.
type(labels)

numpy.ndarray

In [17]:
# After one-hot encoding the shape is (documents, clusters).
labels.shape

(10000, 5)

In [18]:
# Number of feature columns in the dense matrix; used below as the width of
# the model's input layer.
n_cols = tidy.shape[1]

## Deep Learning model

In [19]:
# Hold out 30% of the samples for evaluation; the fixed random_state keeps
# the partition repeatable between runs.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    tidy,
    labels,
    test_size=0.3,
    random_state=43,
)

In [20]:
# Seed TensorFlow's global random generator so that random ops such as weight
# initialisation are repeatable across runs (same seed value as the sklearn
# steps in this notebook).
tf.random.set_seed(43)

# Architecture: two 200-unit ReLU hidden layers over the count features,
# followed by a softmax layer with one unit per class.
model = Sequential()
model.add(Dense(200, activation='relu', input_shape=(n_cols,)))
model.add(Dense(200, activation='relu'))
# Take the output width from the one-hot targets instead of hard-coding it,
# so it always matches the number of label columns.
model.add(Dense(y_train.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

# Fit the model to the training set
model.fit(x_train, y_train, epochs=3)

# Final evaluation on the held-out set; with metrics=['accuracy'] the result
# is [loss, accuracy].
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
# NOTE(review): the targets are KMeans cluster ids computed from these same
# features, so this accuracy measures how well the network reproduces the
# clustering — confirm that this is the intended metric.

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 200)               1648000   
_________________________________________________________________
dense_1 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 1005      
Total params: 1,689,205
Trainable params: 1,689,205
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 99.57%
