In [70]:
from typing import List, Tuple
from functools import partial

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from process import *
from nn.MLP import MLPNetwork
from coding import Coding
from utils import get_train_data, simplify
from sklearn.model_selection import train_test_split 
from keras.utils import to_categorical
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import data and preprocess it by applying stemming and lemming

In [53]:
train_df = get_train_data()
train_df['processed_tokens'] = train_df.full_text.apply(tokenize_and_remove_punkt).apply(stem).apply(lem)

Loading from file: tweets_apple.csv


## Make dictionary

In [54]:
coder = Coding()
_ = train_df.processed_tokens.apply(coder.update)

## Set unified number of words to pad or truncate each twitt

In [55]:
target_num_words = 20

## Set number of miminal occurrences of word in dict to encode

In [56]:
min_occ = 5

## Create dict based on aceding occurrences value

In [57]:
_ = coder.compile(min_threshold=min_occ)

## Encode and normalize data

In [58]:
train_df['coded_tokens'] = (
    train_df
    .processed_tokens
    .apply(lambda l: [coder.encode_final(tok) for tok in l])
    .apply(partial(pad_or_truncate, target_len=target_num_words, end=True, pad_value=0))
    .apply(np.array)
)

## Get probe of data

In [59]:
train_df.sample(n=10)

Unnamed: 0,full_text,score,processed_tokens,coded_tokens
233,RT @the_best_daily: 42% OFF #sale #save #apple...,3.0,"[RT, the_best_daili, 42, off, sale, save, appl...","[199, 0, 0, 126, 72, 73, 205, 170, 0, 30, 164,..."
95,When Tim Cook claimed using personal data wasn...,1.0,"[when, tim, cook, claim, use, person, data, wa...","[0, 0, 0, 0, 89, 117, 187, 0, 204, 0, 194, 205..."
180,"#Facebook removes trending news, #Apple’s upco...",3.0,"[facebook, remov, trend, news, appl, s, upcom,...","[191, 0, 0, 49, 205, 196, 0, 182, 24, 196, 179..."
218,enjoy #Apple spotlights young #developers ahe...,4.0,"[enjoy, appl, spotlight, young, develop, ahead...","[0, 205, 91, 92, 153, 141, 194, 195, 203, 204,..."
209,RT @JanRomes: If Apple iBooks is how you read ...,3.0,"[RT, janrom, If, appl, ibook, be, how, you, re...","[199, 0, 0, 205, 0, 198, 84, 171, 32, 0, 0, 85..."
123,5% OFF #sale #save #ipad #apple @amazon @apple...,3.0,"[5, off, sale, save, ipad, appl, amazon, appl,...","[115, 126, 72, 73, 97, 205, 164, 205, 205, 97,..."
81,Time to set 15 alarms to ensure I wake up 🚨 #W...,4.0,"[time, to, set, 15, alarm, to, ensur, I, wake,...","[158, 201, 0, 0, 0, 201, 0, 189, 0, 29, 182, 2..."
77,RT @BGRIndia: #WWDC2018: #Apple to introduce s...,3.0,"[RT, bgrindia, wwdc2018, appl, to, introduc, s...","[199, 0, 175, 205, 201, 0, 145, 6, 0, 15, 0, 0..."
98,#WWDC2018: #Apple to introduce shared AR platf...,3.0,"[wwdc2018, appl, to, introduc, share, AR, plat...","[175, 205, 201, 0, 145, 6, 0, 15, 0, 0, 113, 0..."
239,RT @DannyCheng: Apple Shares Animoji Karaoke A...,3.0,"[RT, dannycheng, appl, share, animoji, karaok,...","[199, 0, 205, 145, 0, 0, 0, 193, 174, 0, 0, 0,..."


# Words embeddings

In [60]:
from keras.models import Sequential
from keras.layers import Embedding

## Define embeddings sizes

In [61]:
embedding_size = 64

In [62]:
model = Sequential()
model.add(Embedding(coder.len_between(threshold_min=min_occ)+1, 
                    embedding_size, 
                    input_length=target_num_words))
model.compile('rmsprop', 'mse')

  if d.decorator_argspec is not None), _inspect.getargspec(target))


In [66]:
words_with_embeddings_arr = model.predict(np.array(train_df.coded_tokens.tolist()))
embeddings = [i[0] for i in words_with_embeddings_arr]


# Training

## Decide if reduce number of classes from 5 to 3

In [71]:
reduce = True

In [72]:
if reduce:
    scores = [simplify(s) for s in train_df.score.tolist()]
else:
    scores = train_df.score.tolist()    

## Apply one-hot encoding on classes

In [73]:
X = np.array(embeddings)

# one hot encoding
y = to_categorical(scores)

## Split dataset to train and test sets

In [75]:
test_size_percent = 0.15

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=test_size_percent, 
                                                    # ensure same split every run
                                                    random_state=42)

X_train = np.array(X_train)
y_train = np.array(y_train)

In [82]:
print("Number of images in training_set: {}. Features: {}"
      .format(X_train.shape[0], X_train.shape[1]))

Number of images in training_set: 213. Features: 64


In [83]:
print("Number of classes: {}".format(y_train.shape[1]))

Number of classes: 3


## Training parameters
* num_features -> this will mean number of neurons in input layer as well as number of coded tokens in each input tweet passed to network
* num_classes -> number of classes, number of neurons in output layer of network
* num_hidden_neurons -> number of neurons in hidden layer
* num_expamles -> number of examples in dataset

In [22]:
num_features = X_train.shape[1]
num_classes = y_train.shape[1]
num_hidden_neurons = 1000
num_examples = len(X_train)

In [31]:
net = MLPNetwork(n_classes=num_classes,n_features=num_features,
                   n_hidden_units=num_hidden_neurons, epochs=10000)

In [32]:
net.fit(X_train, y_train)

100%|██████████| 10000/10000 [01:32<00:00, 108.00it/s]


<nn.MLP.MLPNetwork at 0x12b2626a0>

In [33]:
net.evaluate(X_train, y_train)

0.6171428571428571

In [34]:
net.evaluate(X_test, y_test)

0.6052631578947368