In [8]:
from typing import List, Tuple
from functools import partial

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from process import *
from nn.MLP import MLPNetwork
from coding import Coding
from utils import get_train_data
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
train_df = get_train_data()
train_df['processed_tokens'] = train_df.full_text.apply(tokenize_and_remove_punkt).apply(stem).apply(lem)

Loading from file: tweets_apple.csv


## Make dictionary

In [10]:
coder = Coding()
_ = train_df.processed_tokens.apply(coder.update)

## Encode twitts using dictionary and unify lenghts to num_features

In [11]:
# target number of words of after resizing every tweet 
target_num_words = 20


In [12]:
coder.occurrences

Counter({'masaki': 11,
         'yoda': 11,
         'carri': 2,
         'the': 111,
         'futur': 3,
         'on': 42,
         'appl': 291,
         'music': 31,
         'http': 239,
         't': 247,
         'co': 237,
         'vdlefwyuul': 1,
         'listen': 13,
         'japan': 11,
         'inform': 12,
         'RT': 71,
         'johnybeextrem': 1,
         'We': 6,
         're': 5,
         'bloominthepark': 3,
         'today': 26,
         'with': 20,
         'bigbagjelli': 2,
         'we': 15,
         'have': 23,
         'tasti': 2,
         'treat': 2,
         'avail': 3,
         'bloom2018': 2,
         'bloomin': 1,
         'wwdc': 54,
         '2018': 26,
         'liveblog': 1,
         'new': 26,
         'and': 61,
         'updat': 8,
         'from': 16,
         's': 57,
         'keynot': 7,
         '5hp87khsho': 1,
         'wwdc2018': 24,
         'gear': 3,
         'sweet': 1,
         'novelti': 1,
         'johnybe': 1,
         'xtre

In [5]:
train_df['coded_tokens'] = (
    train_df
    .processed_tokens
    .apply(lambda l: [coder.encode(tok) for tok in l])
    .apply(partial(pad_or_truncate, target_len=num_features, end=True, pad_value=0))
    .apply(np.array)
)

NameError: name 'num_features' is not defined

In [99]:
train_df.sample(n=10)

Unnamed: 0,full_text,score,processed_tokens,coded_tokens
71,RT @the_best_offer: 17% OFF #sale #save #apple...,3.0,"[RT, the_best_off, 17, off, sale, save, appl, ...","[16, 624, 625, 626, 627, 628, 7, 348, 214, 500..."
103,***10% Off June iPhone 8 Repairs***\r\n\r\nWe ...,4.0,"[10, off, june, iphon, 8, repair, We, be, expe...","[327, 626, 595, 83, 140, 408, 18, 58, 821, 105..."
99,Heading to the Apple Worldwide Developers Conf...,3.0,"[head, to, the, appl, worldwid, develop, confe...","[789, 57, 4, 7, 94, 95, 96, 332, 264, 790, 57,..."
199,RT @AR72014: #wallpaper #homescreen #lockscree...,3.0,"[RT, ar72014, wallpap, homescreen, lockscreen,...","[16, 1237, 1238, 1239, 1240, 91, 7, 41, 123, 2..."
150,MASAKI YODA -thought- on #Apple music https://...,3.0,"[masaki, yoda, think, on, appl, music, http, t...","[1, 2, 618, 6, 7, 8, 9, 10, 11, 1077, 8, 13, 1..."
175,New #MacBookPro with 6 cores &amp; 32GB RAM to...,3.0,"[new, macbookpro, with, 6, core, amp, 32gb, ra...","[34, 201, 22, 134, 199, 136, 202, 204, 57, 312..."
53,iOS 11.4 is already rolling out with iCloud Me...,4.0,"[io, 11, 4, be, alreadi, roll, out, with, iclo...","[91, 138, 129, 58, 235, 403, 264, 22, 82, 545,..."
2,WWDC 2018 Liveblog: News and Updates From Appl...,3.0,"[wwdc, 2018, liveblog, new, and, updat, from, ...","[31, 32, 33, 34, 35, 36, 37, 7, 38, 39, 9, 10,..."
86,"After @CambridgeAnalytica, #Facebook said it h...",2.0,"[after, cambridgeanalytica, facebook, say, it,...","[715, 716, 146, 547, 87, 25, 717, 718, 232, 62..."
36,Today is the day!! Join us this evening to wat...,4.0,"[today, be, the, day, join, us, thi, even, to,...","[21, 58, 4, 316, 317, 318, 220, 319, 57, 241, ..."


In [106]:
train_df.coded_tokens.tolist()[5]

array([67, 35, 68,  4, 69, 70, 71, 72, 73, 74, 75, 76, 71, 77, 78,  9, 10,
       11, 79, 70, 80, 81,  7, 82, 83, 84,  0,  0,  0,  0])

## Training

In [276]:
from sklearn.model_selection import train_test_split 

In [296]:
X_train, X_test, y_train, y_test = train_test_split(train_df.coded_tokens.tolist(), 
                                                    train_df.score.tolist(), test_size=0.3)

X_train = np.array(X_train)
y_train = np.array(y_train)

In [297]:
def simplify(score):
    """ Reduce number of classes """
    if score < 3.0:
        return 0
    elif score > 3.0:
        return 2
    else: return 1

## Set training parameters
* num_features -> this will mean number of neurons in input layer as well as number of coded tokens in each input tweet passed to network
* num_classes -> number of classes, number of neurons in output layer of network
* num_hidden_neurons -> number of neurons in hidden layer
* num_expamles -> number of examples in dataset

In [7]:
num_features = 30
num_classes = len(train_df.score.unique())
num_hidden_neurons = 400
num_examples = len(train_df)

In [298]:
net = MLPNetwork(num_classes=3,
                num_examples=len(X_train),
                num_features=num_features,
                num_hidden_neurons={1 : 300})

In [299]:
y_train

array([4., 3., 3., 2., 5., 3., 4., 3., 3., 2., 5., 3., 3., 4., 4., 3., 1.,
       3., 4., 1., 3., 3., 4., 5., 3., 5., 2., 2., 2., 3., 3., 3., 1., 3.,
       3., 5., 3., 5., 3., 4., 3., 3., 2., 4., 3., 5., 3., 4., 3., 3., 1.,
       4., 3., 3., 3., 1., 1., 3., 3., 3., 4., 3., 4., 4., 4., 5., 3., 3.,
       3., 4., 4., 3., 3., 5., 5., 3., 2., 3., 3., 3., 3., 4., 4., 3., 5.,
       3., 4., 5., 3., 1., 3., 2., 3., 4., 5., 3., 2., 3., 4., 2., 3., 3.,
       3., 2., 5., 4., 4., 3., 3., 5., 3., 2., 2., 3., 4., 3., 4., 3., 3.,
       5., 3., 4., 3., 1., 5., 3., 4., 4., 1., 3., 4., 2., 4., 3., 4., 3.,
       3., 4., 3., 3., 3., 3., 3., 4., 3., 3., 4., 3., 4., 3., 5., 3., 3.,
       4., 3., 1., 3., 2., 3., 3., 4., 4., 3., 4., 4., 3., 3., 3., 3., 3.,
       3., 4., 3., 1., 5.])

In [300]:
print(X_train.shape)

(175, 30)


In [313]:
# models = net.fit(X_train,y_train, batches=10000, print_loss=True)
mod = MLPClassifier(hidden_layer_sizes=(1000,1000), )

t = mod.fit(X_train, y_train)

In [314]:
good_sklearn = 0
good_my = 0
for i in range(len(y_test)):
    pred_sklearn = t.predict([X_test[i]])
#     pred_my = net.predict(X_train[i])
    if pred_sklearn == y_test[i]:
#         print(True)
        good_sklearn+=1
    if pred_my == y_train[i]:
        good_my += 1
   
# print(float(good_my)/len(y_train))        
print(float(good_sklearn)/len(y_train))

0.21142857142857144


In [173]:
good = 0
for i in range(len(y_test)):
    if net.predict(X_test[i]) == y_test[i]:
        print(True)
        good+=1
    else:
        print(False)
        
print(float(good)/len(y_test))

False
False
True
False
True
False
False
False
True
False
False
False
True
0.3076923076923077


In [126]:
train_df.loc[0]

full_text           MASAKI YODA -Carrying the future- on #Apple mu...
score                                                               4
processed_tokens    [masaki, yoda, carri, the, futur, on, appl, mu...
coded_tokens        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 8, 13,...
Name: 0, dtype: object

In [224]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_iris
import numpy as np
from keras.lay

In [226]:
X,Y = load_iris().data, load_iris().target

mlp = MLPClassifier()
mlp.fit(X, Y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [232]:
Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [230]:
print(mlp.predict(np.array([3.1,  2.5,  8.4,  2.2]).reshape(-1,1)))
      

ValueError: shapes (4,1) and (4,100) not aligned: 1 (dim 1) != 4 (dim 0)

In [None]:
print(mlp.predict_proba([3.1,  2.5,  8.4,  2.2]))

In [228]:

print("sum: %f"%np.sum(mlp.predict_proba([3.1,  2.5,  8.4,  2.2])))

SyntaxError: invalid syntax (<ipython-input-228-2829967ea018>, line 2)