# Quick usage guide:
### 1. This is a Jupyter notebook
### 2. To run the active cell, press Shift + Enter
### 3. To move between cells, use the Up / Down arrow keys

In [None]:
import sys
# Add the parent directory to the import search path so the project-local
# imports used later in the notebook can be resolved (presumably the
# `sentiment_analyser` package lives there -- confirm against the repo layout).
sys.path.append('..')

In [None]:
from typing import List, Tuple
from functools import partial

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 
from keras.utils import to_categorical

from sentiment_analyser.process import *
from sentiment_analyser.MLP import MLPNetwork
from sentiment_analyser.coding import Coding
from sentiment_analyser.utils import get_train_data, simplify

%load_ext autoreload
%autoreload 2

## Import the data and preprocess it by applying stemming and lemmatization

In [None]:
# Load the raw tweets, then derive the `processed_tokens` column by passing
# each tweet's text through tokenization/punctuation removal, stemming and
# lemmatization, in that order.
train_df = get_train_data(csv_file="../data/tweets_apple.csv")
train_df['processed_tokens'] = (
    train_df['full_text']
    .apply(tokenize_and_remove_punkt)
    .apply(stem)
    .apply(lem)
)

## Make dictionary

In [None]:
# Build the vocabulary: start from a fresh Coding instance and feed it every
# tweet's processed token sequence through `update`.
coder = Coding()
# The Series returned by `apply` is not needed; binding it to `_` keeps the
# notebook from echoing it as the cell output.
_ = train_df.processed_tokens.apply(coder.update)

## Set the uniform number of words to which each tweet is padded or truncated

In [None]:
# Fixed per-tweet sequence length; used below as the pad/truncate target
# length and as the embedding layer's input length.
target_num_words = 20

## Set the minimal number of occurrences a word needs in the dictionary to be encoded

In [None]:
# Occurrence threshold passed to `coder.compile` and `coder.len_between` below.
min_occ = 5

## Compile the dictionary using the minimum-occurrence threshold

In [None]:
# Compile the dictionary with the occurrence threshold chosen above.
# The return value is discarded; binding it to `_` keeps the notebook from
# echoing it as the cell output.
_ = coder.compile(min_threshold=min_occ)

## Encode and normalize data

In [None]:
# Build the `coded_tokens` column. For every tweet:
#   1. encode each token with the compiled dictionary,
#   2. pad (with 0) or truncate the id list to `target_num_words`,
#   3. wrap the result in a NumPy array.
train_df['coded_tokens'] = train_df.processed_tokens.apply(
    lambda tokens: np.array(
        pad_or_truncate(
            [coder.encode_final(tok) for tok in tokens],
            target_len=target_num_words,
            end=True,
            pad_value=0,
        )
    )
)

## Preview a random sample of the data

In [None]:
# Show 10 randomly selected rows as a quick sanity check of the preprocessing.
train_df.sample(n=10)

# Word embeddings

In [None]:
from keras.models import Sequential
from keras.layers import Embedding

## Define the embedding size

In [None]:
# Length of the dense vector produced for each token by the Embedding layer.
embedding_size = 50

In [None]:
# Embedding-only model: maps every token id of a padded tweet to a dense
# vector of length `embedding_size`.
# The vocabulary size is the coder's count for `min_occ` plus one
# (presumably the extra slot covers the padding id 0 -- confirm against Coding).
model = Sequential([
    Embedding(
        coder.len_between(threshold_min=min_occ) + 1,
        embedding_size,
        input_length=target_num_words,
    ),
])
model.compile(optimizer='rmsprop', loss='mse')

In [None]:
# Run every padded id sequence through the embedding model.
# NOTE(review): no `model.fit` call appears before this point in the visible
# cells, so the embedding weights are presumably still at their initial
# values -- confirm this is intended.
words_with_embeddings_arr = model.predict(np.array(train_df.coded_tokens.tolist()))
# Keep element 0 of each per-tweet result.
# NOTE(review): with an Embedding output of (tweets, positions, embedding_size)
# this keeps only the first token position's vector for each tweet -- confirm
# this is intended rather than using all positions.
embeddings = [i[0] for i in words_with_embeddings_arr]


# Training

## Decide whether to reduce the number of classes from 5 to 3

In [None]:
# When True, the scores are passed through `simplify` in the next cell.
reduce = True

In [None]:
# Class labels for training: the raw scores, mapped through `simplify`
# when `reduce` is set.
scores = train_df.score.tolist()
if reduce:
    scores = [simplify(raw_score) for raw_score in scores]

## Decide whether to use embeddings or dictionary IDs

In [None]:
# When True, the embedding vectors are used as features; otherwise the padded
# dictionary ids are used (see the next cell).
embed = False

In [None]:
# Feature matrix: the per-tweet embedding vectors when `embed` is set,
# otherwise the padded dictionary ids of each tweet.
X = np.array(embeddings if embed else train_df.coded_tokens.tolist())

## Apply one-hot encoding on classes

In [None]:
# One-hot encode the class labels.
# NOTE(review): to_categorical expects non-negative integer class ids --
# confirm the values produced by `simplify` / the raw scores satisfy this.
y = to_categorical(scores)

## Split the dataset into train and test sets

In [None]:
# Value passed as `test_size` to train_test_split below: despite the name it
# is a fraction (0.15 = 15% of the samples), not a percentage.
test_size_percent = 0.15

In [None]:
# Hold out `test_size_percent` of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=test_size_percent,
    random_state=42,  # fixed seed: same split on every run
)

# Make sure the training portions are NumPy arrays.
X_train, y_train = np.array(X_train), np.array(y_train)

In [None]:
# Report the size of the training split: rows are tweets, columns are the
# per-tweet features. (The message previously said "images", which does not
# match this dataset.)
print("Number of tweets in training_set: {}. Features: {}"
      .format(X_train.shape[0], X_train.shape[1]))

In [None]:
# The one-hot label matrix has one column per class.
print("Number of classes: {}".format(y_train.shape[1]))

## Training parameters
* num_features -> number of neurons in the input layer, which equals the number of coded tokens in each input tweet passed to the network
* num_classes -> number of classes, i.e. the number of neurons in the output layer of the network
* num_hidden_neurons -> number of neurons in the hidden layer
* num_examples -> number of examples in the training set
* epochs -> number of training epochs passed to the network

In [None]:
# Input width: number of feature columns per training sample.
num_features = X_train.shape[1]
# Output width: number of columns in the one-hot label matrix.
num_classes = y_train.shape[1]
# Size of the hidden layer.
num_hidden_neurons = 1500
# Number of samples in the training split (not passed to the network in the
# cells visible here).
num_examples = len(X_train)
# Value handed to MLPNetwork as `epochs`.
epochs = 10000

## Declare Multi Layer Perceptron model

In [None]:
# Instantiate the project's MLP with the dimensions and epoch count chosen in
# the training-parameters cell.
net = MLPNetwork(n_classes=num_classes,n_features=num_features,
                   n_hidden_units=num_hidden_neurons, epochs=epochs)

## Fit the model to the training data

In [None]:
# Train the network on the training split (features and one-hot labels).
net.fit(X_train, y_train)

## Compute model accuracy on the training set

In [None]:
# Evaluate the fitted network on the data it was trained on and print the result.
train_acc = net.evaluate(X_train, y_train)
print("Training accurrancy: {}".format(train_acc))

## Compute model accuracy on the test set

In [None]:
# Evaluate the fitted network on the held-out test split and print the result.
test_acc = net.evaluate(X_test, y_test)
print("Test accurrancy: {}".format(test_acc))