In [16]:
from typing import List, Tuple
from functools import partial

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from process import *
from nn.MLP import MLPNetwork
from coding import Coding
from utils import get_train_data
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
train_df = get_train_data()
train_df['processed_tokens'] = train_df.full_text.apply(tokenize_and_remove_punkt).apply(stem).apply(lem)

Loading from file: tweets_apple.csv


## Make dictionary

In [18]:
coder = Coding()
_ = train_df.processed_tokens.apply(coder.update)

## Encode twitts using dictionary and unify lenghts to num_features

In [19]:
# target number of words of after resizing every tweet 
target_num_words = 20
min_occ = 5

In [20]:
train_df['coded_tokens'] = (
    train_df
    .processed_tokens
    .apply(lambda l: [coder.encode(tok, threshold_min=min_occ) for tok in l])
    .apply(partial(pad_or_truncate, target_len=target_num_words, end=True, pad_value=0))
    .apply(np.array)
)

In [21]:
train_df.sample(n=10)

Unnamed: 0,full_text,score,processed_tokens,coded_tokens
124,Apple Leaks Video of macOS 10.14 Showing Xcode...,3.0,"[appl, leak, video, of, maco, 10, 14, show, xc...","[7, 455, 363, 70, 417, 327, 466, 0, 0, 327, 22..."
188,"RT @Letitbrew1: #MondayMotivation ""Given enoug...",4.0,"[RT, letitbrew1, mondaymotiv, give, enough, co...","[16, 516, 517, 518, 519, 520, 234, 521, 522, 4..."
35,Wondering how to delete photos from your Mac w...,3.0,"[wonder, how, to, delet, photo, from, your, ma...","[0, 352, 57, 0, 0, 37, 80, 127, 414, 0, 0, 105..."
215,#Apple spotlights young #developers ahead of W...,3.0,"[appl, spotlight, young, develop, ahead, of, w...","[7, 898, 899, 95, 900, 70, 31, 9, 10, 11, 0, 9..."
128,Strengthening our global #AI R&amp;D capabilit...,3.0,"[strengthen, our, global, AI, R, amp, D, capab...","[0, 71, 0, 648, 0, 136, 0, 0, 9, 10, 11, 0, 70..."
247,RT @JurassicApps: Dinosaur Assassin: I-Evoluti...,3.0,"[RT, jurassicapp, dinosaur, assassin, I, evolu...","[16, 0, 0, 0, 234, 0, 83, 488, 489, 107, 57, 4..."
80,Hope iOS 12 will be great... few hours left.\r...,2.0,"[hope, io, 12, will, be, great, few, hour, lea...","[0, 91, 92, 211, 58, 621, 0, 0, 0, 123, 7, 117..."
206,Preparation and Evaluation of #Carrot and #App...,3.0,"[prepar, and, evalu, of, carrot, and, appl, bl...","[0, 35, 0, 70, 0, 35, 7, 0, 0, 799, 275, 0, 0,..."
0,MASAKI YODA -Carrying the future- on #Apple mu...,4.0,"[masaki, yoda, carri, the, futur, on, appl, mu...","[1, 2, 0, 4, 0, 6, 7, 8, 9, 10, 11, 0, 8, 13, ..."
67,RT @iOSBetaReleases: The BIG day!! #Apple #WWDC18,5.0,"[RT, iosbetareleas, the, big, day, appl, wwdc18]","[16, 0, 4, 0, 316, 7, 123, 0, 0, 0, 0, 0, 0, 0..."


In [22]:
train_df.coded_tokens.tolist()[5]

array([ 0, 35, 68,  4,  0, 70, 71,  0,  0,  0, 75, 76, 71,  0,  0,  9, 10,
       11,  0, 70])

# Embeddings

In [24]:
from keras.models import Sequential
from keras.layers import Embedding

In [None]:
model = Sequential()
model.add(Embedding(1000, 64, input_length=10))
# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch dimension.

input_array = np.random.randint(1000, size=(32, 10))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
assert output_array.shape == (32, 10, 64)

## Training

In [23]:
from sklearn.model_selection import train_test_split 

In [296]:
X_train, X_test, y_train, y_test = train_test_split(train_df.coded_tokens.tolist(), 
                                                    train_df.score.tolist(), test_size=0.3)

X_train = np.array(X_train)
y_train = np.array(y_train)

In [297]:
def simplify(score):
    """ Reduce number of classes """
    if score < 3.0:
        return 0
    elif score > 3.0:
        return 2
    else: return 1

## Set training parameters
* num_features -> this will mean number of neurons in input layer as well as number of coded tokens in each input tweet passed to network
* num_classes -> number of classes, number of neurons in output layer of network
* num_hidden_neurons -> number of neurons in hidden layer
* num_expamles -> number of examples in dataset

In [7]:
num_features = 30
num_classes = len(train_df.score.unique())
num_hidden_neurons = 400
num_examples = len(train_df)

In [298]:
net = MLPNetwork(num_classes=3,
                num_examples=len(X_train),
                num_features=num_features,
                num_hidden_neurons={1 : 300})

In [299]:
y_train

array([4., 3., 3., 2., 5., 3., 4., 3., 3., 2., 5., 3., 3., 4., 4., 3., 1.,
       3., 4., 1., 3., 3., 4., 5., 3., 5., 2., 2., 2., 3., 3., 3., 1., 3.,
       3., 5., 3., 5., 3., 4., 3., 3., 2., 4., 3., 5., 3., 4., 3., 3., 1.,
       4., 3., 3., 3., 1., 1., 3., 3., 3., 4., 3., 4., 4., 4., 5., 3., 3.,
       3., 4., 4., 3., 3., 5., 5., 3., 2., 3., 3., 3., 3., 4., 4., 3., 5.,
       3., 4., 5., 3., 1., 3., 2., 3., 4., 5., 3., 2., 3., 4., 2., 3., 3.,
       3., 2., 5., 4., 4., 3., 3., 5., 3., 2., 2., 3., 4., 3., 4., 3., 3.,
       5., 3., 4., 3., 1., 5., 3., 4., 4., 1., 3., 4., 2., 4., 3., 4., 3.,
       3., 4., 3., 3., 3., 3., 3., 4., 3., 3., 4., 3., 4., 3., 5., 3., 3.,
       4., 3., 1., 3., 2., 3., 3., 4., 4., 3., 4., 4., 3., 3., 3., 3., 3.,
       3., 4., 3., 1., 5.])

In [300]:
print(X_train.shape)

(175, 30)


In [313]:
# models = net.fit(X_train,y_train, batches=10000, print_loss=True)
mod = MLPClassifier(hidden_layer_sizes=(1000,1000), )

t = mod.fit(X_train, y_train)

In [314]:
good_sklearn = 0
good_my = 0
for i in range(len(y_test)):
    pred_sklearn = t.predict([X_test[i]])
#     pred_my = net.predict(X_train[i])
    if pred_sklearn == y_test[i]:
#         print(True)
        good_sklearn+=1
    if pred_my == y_train[i]:
        good_my += 1
   
# print(float(good_my)/len(y_train))        
print(float(good_sklearn)/len(y_train))

0.21142857142857144


In [173]:
good = 0
for i in range(len(y_test)):
    if net.predict(X_test[i]) == y_test[i]:
        print(True)
        good+=1
    else:
        print(False)
        
print(float(good)/len(y_test))

False
False
True
False
True
False
False
False
True
False
False
False
True
0.3076923076923077


In [126]:
train_df.loc[0]

full_text           MASAKI YODA -Carrying the future- on #Apple mu...
score                                                               4
processed_tokens    [masaki, yoda, carri, the, futur, on, appl, mu...
coded_tokens        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 8, 13,...
Name: 0, dtype: object

In [224]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_iris
import numpy as np
from keras.lay

In [226]:
X,Y = load_iris().data, load_iris().target

mlp = MLPClassifier()
mlp.fit(X, Y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [232]:
Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [230]:
print(mlp.predict(np.array([3.1,  2.5,  8.4,  2.2]).reshape(-1,1)))
      

ValueError: shapes (4,1) and (4,100) not aligned: 1 (dim 1) != 4 (dim 0)

In [None]:
print(mlp.predict_proba([3.1,  2.5,  8.4,  2.2]))

In [228]:

print("sum: %f"%np.sum(mlp.predict_proba([3.1,  2.5,  8.4,  2.2])))

SyntaxError: invalid syntax (<ipython-input-228-2829967ea018>, line 2)