In [1]:
from typing import List, Tuple
from functools import partial

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from process import *
from nn.MLP import MLPNetwork
from coding import Coding
from utils import get_train_data
%load_ext autoreload
%autoreload 2

In [2]:
train_df = get_train_data()
train_df['processed_tokens'] = train_df.full_text.apply(tokenize_and_remove_punkt).apply(stem).apply(lem)

Loading from file: tweets_apple.csv


## Make dictionary

In [3]:
coder = Coding()
_ = train_df.processed_tokens.apply(coder.update)

## Encode twitts using dictionary and unify lenghts to num_features

In [4]:
# target number of words of after resizing every tweet 
target_num_words = 20
min_occ = 5

In [5]:
coder.compile(min_threshold=min_occ)

{'1': 132,
 '10': 148,
 '12': 94,
 '14': 17,
 '2': 133,
 '2018': 178,
 '3': 160,
 '32gb': 98,
 '4': 114,
 '42mm': 83,
 '5': 115,
 '58': 5,
 'A': 58,
 'AI': 28,
 'AR': 6,
 'I': 189,
 'It': 140,
 'RT': 199,
 'To': 64,
 'We': 37,
 'a': 192,
 'about': 166,
 'access': 172,
 'account': 31,
 'ahead': 141,
 'air': 3,
 'airpow': 2,
 'all': 90,
 'allow': 59,
 'amazon': 164,
 'amp': 41,
 'an': 66,
 'and': 197,
 'android': 88,
 'announc': 78,
 'anoth': 42,
 'app': 136,
 'appl': 205,
 'applemus': 40,
 'around': 16,
 'as': 82,
 'at': 184,
 'automobil': 7,
 'band': 45,
 'be': 198,
 'becaus': 13,
 'blackberri': 24,
 'breach': 4,
 'broad': 70,
 'by': 150,
 'can': 74,
 'car': 27,
 'co': 202,
 'coffe': 104,
 'compani': 10,
 'confer': 1,
 'core': 60,
 'could': 124,
 'data': 187,
 'day': 163,
 'develop': 153,
 'devic': 151,
 'donut': 12,
 'doubt': 35,
 'download': 100,
 'enough': 123,
 'expect': 159,
 'facebook': 191,
 'fix': 14,
 'food': 109,
 'for': 190,
 'free': 121,
 'from': 156,
 'full': 63,
 'get': 9

In [6]:
train_df['coded_tokens'] = (
    train_df
    .processed_tokens
    .apply(lambda l: [coder.encode_final(tok) for tok in l])
    .apply(partial(pad_or_truncate, target_len=target_num_words, end=True, pad_value=0))
    .apply(np.array)
)

In [6]:
train_df.sample(n=10)

Unnamed: 0,full_text,score,processed_tokens,coded_tokens
24,Amittrajit Ghosh And Ashwin Naik Become First ...,3.0,"[amittrajit, ghosh, and, ashwin, naik, becom, ...","[0, 0, 35, 0, 0, 0, 0, 0, 38, 281, 0, 0, 0, 0,..."
203,Hailee Steinfeld To Play Comedy Legend Emily D...,3.0,"[haile, steinfeld, To, play, comedi, legend, e...","[0, 0, 281, 0, 0, 0, 0, 0, 0, 7, 219, 9, 10, 1..."
143,RT @fabsanchezp: #WWDC2018 #iOS12 #macOS14\r\n...,3.0,"[RT, fabsanchezp, wwdc2018, ios12, macos14, th...","[16, 0, 41, 117, 0, 0, 0, 86, 93, 7, 123, 58, ..."
187,RT @AR72014: #wallpaper #homescreen #lockscree...,3.0,"[RT, ar72014, wallpap, homescreen, lockscreen,...","[16, 0, 0, 0, 0, 91, 7, 41, 123, 21, 58, 4, 31..."
65,RT @arthr: Will there be a multi-core/32gb ram...,4.0,"[RT, arthr, will, there, be, a, multi, core, 3...","[16, 0, 211, 588, 58, 179, 589, 199, 202, 204,..."
68,✪ WWDC 2018: How to Watch #Apple's Keynote on ...,3.0,"[wwdc, 2018, how, to, watch, appl, s, keynot, ...","[31, 32, 352, 57, 241, 7, 38, 39, 6, 0, 0, 129..."
76,"#Apple security updates, #iOS and #macOS now s...",3.0,"[appl, secur, updat, io, and, maco, now, suppo...","[7, 584, 36, 91, 35, 417, 107, 663, 545, 105, ..."
243,Apple Shares Animoji Karaoke Ad on its Main Yo...,3.0,"[appl, share, animoji, karaok, Ad, on, it, mai...","[7, 75, 0, 0, 0, 6, 87, 0, 0, 0, 900, 70, 31, ..."
247,RT @JurassicApps: Dinosaur Assassin: I-Evoluti...,3.0,"[RT, jurassicapp, dinosaur, assassin, I, evolu...","[16, 0, 0, 0, 234, 0, 83, 488, 489, 107, 57, 4..."
105,🌹🌹🌹🌹I give these flowers to #Apple 🌹🌹🌹🌹🌹🌹🌹🌹🌹🌹🌹...,5.0,"[I, give, these, flower, to, appl, http, t, co...","[234, 518, 0, 0, 57, 7, 9, 10, 11, 0, 0, 0, 0,..."


In [7]:
train_df.coded_tokens.tolist()[5]

array([ 0, 35, 68,  4,  0, 70, 71,  0,  0,  0, 75, 76, 71,  0,  0,  9, 10,
       11,  0, 70])

# Embeddings

In [8]:
from keras.models import Sequential
from keras.layers import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [16]:
len(set(final_dict.values()))

206

In [17]:
coder.len_between(threshold_min=min_occ)

206

In [19]:
model = Sequential()
model.add(Embedding(coder.len_between(threshold_min=min_occ)+1, 64))
# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch dimension.

# input_array = np.random.randint(1000, size=(32, 10))
model.compile('rmsprop', 'mse')

In [18]:
output_array = model.predict()
assert output_array.shape == (32, 10, 64)

NameError: name 'input_array' is not defined

## Training

In [23]:
from sklearn.model_selection import train_test_split 

In [296]:
X_train, X_test, y_train, y_test = train_test_split(train_df.coded_tokens.tolist(), 
                                                    train_df.score.tolist(), test_size=0.3)

X_train = np.array(X_train)
y_train = np.array(y_train)

In [297]:
def simplify(score):
    """ Reduce number of classes """
    if score < 3.0:
        return 0
    elif score > 3.0:
        return 2
    else: return 1

## Set training parameters
* num_features -> this will mean number of neurons in input layer as well as number of coded tokens in each input tweet passed to network
* num_classes -> number of classes, number of neurons in output layer of network
* num_hidden_neurons -> number of neurons in hidden layer
* num_expamles -> number of examples in dataset

In [7]:
num_features = 30
num_classes = len(train_df.score.unique())
num_hidden_neurons = 400
num_examples = len(train_df)

In [298]:
net = MLPNetwork(num_classes=3,
                num_examples=len(X_train),
                num_features=num_features,
                num_hidden_neurons={1 : 300})

In [299]:
y_train

array([4., 3., 3., 2., 5., 3., 4., 3., 3., 2., 5., 3., 3., 4., 4., 3., 1.,
       3., 4., 1., 3., 3., 4., 5., 3., 5., 2., 2., 2., 3., 3., 3., 1., 3.,
       3., 5., 3., 5., 3., 4., 3., 3., 2., 4., 3., 5., 3., 4., 3., 3., 1.,
       4., 3., 3., 3., 1., 1., 3., 3., 3., 4., 3., 4., 4., 4., 5., 3., 3.,
       3., 4., 4., 3., 3., 5., 5., 3., 2., 3., 3., 3., 3., 4., 4., 3., 5.,
       3., 4., 5., 3., 1., 3., 2., 3., 4., 5., 3., 2., 3., 4., 2., 3., 3.,
       3., 2., 5., 4., 4., 3., 3., 5., 3., 2., 2., 3., 4., 3., 4., 3., 3.,
       5., 3., 4., 3., 1., 5., 3., 4., 4., 1., 3., 4., 2., 4., 3., 4., 3.,
       3., 4., 3., 3., 3., 3., 3., 4., 3., 3., 4., 3., 4., 3., 5., 3., 3.,
       4., 3., 1., 3., 2., 3., 3., 4., 4., 3., 4., 4., 3., 3., 3., 3., 3.,
       3., 4., 3., 1., 5.])

In [300]:
print(X_train.shape)

(175, 30)


In [313]:
# models = net.fit(X_train,y_train, batches=10000, print_loss=True)
mod = MLPClassifier(hidden_layer_sizes=(1000,1000), )

t = mod.fit(X_train, y_train)

In [314]:
good_sklearn = 0
good_my = 0
for i in range(len(y_test)):
    pred_sklearn = t.predict([X_test[i]])
#     pred_my = net.predict(X_train[i])
    if pred_sklearn == y_test[i]:
#         print(True)
        good_sklearn+=1
    if pred_my == y_train[i]:
        good_my += 1
   
# print(float(good_my)/len(y_train))        
print(float(good_sklearn)/len(y_train))

0.21142857142857144


In [173]:
good = 0
for i in range(len(y_test)):
    if net.predict(X_test[i]) == y_test[i]:
        print(True)
        good+=1
    else:
        print(False)
        
print(float(good)/len(y_test))

False
False
True
False
True
False
False
False
True
False
False
False
True
0.3076923076923077


In [126]:
train_df.loc[0]

full_text           MASAKI YODA -Carrying the future- on #Apple mu...
score                                                               4
processed_tokens    [masaki, yoda, carri, the, futur, on, appl, mu...
coded_tokens        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 8, 13,...
Name: 0, dtype: object

In [224]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_iris
import numpy as np
from keras.lay

In [226]:
X,Y = load_iris().data, load_iris().target

mlp = MLPClassifier()
mlp.fit(X, Y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [232]:
Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [230]:
print(mlp.predict(np.array([3.1,  2.5,  8.4,  2.2]).reshape(-1,1)))
      

ValueError: shapes (4,1) and (4,100) not aligned: 1 (dim 1) != 4 (dim 0)

In [None]:
print(mlp.predict_proba([3.1,  2.5,  8.4,  2.2]))

In [228]:

print("sum: %f"%np.sum(mlp.predict_proba([3.1,  2.5,  8.4,  2.2])))

SyntaxError: invalid syntax (<ipython-input-228-2829967ea018>, line 2)