### Wczytanie niezbędnych bibliotek

In [None]:
import re
import time

import numpy as np
import pandas as pd
import nltk
import gensim

import scikitplot as skplt
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

from tqdm import tqdm

from pyMorfologik import Morfologik
from pyMorfologik.parsing import ListParser

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers import Conv1D, Flatten, Dropout, Dense, LSTM
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras.utils import np_utils

%matplotlib inline

### Definicja klasy procesującej dokumenty.

In [None]:
class TokenizedDataFrame():
    """Clean, tokenize and stem one text column of a pandas DataFrame.

    Instance attributes (declared through ``__slots__``):

    * ``stopwords`` -- container of words dropped from every document,
    * ``stemmed_dict`` -- mapping token -> stem; ``None`` until
      :meth:`apply_stemming` has been run.
    """
    __slots__ = ['stopwords', 'stemmed_dict']

    def __init__(self, stopwords):
        """Store the stop-word container; the stem dictionary is built lazily."""
        self.stopwords = stopwords
        self.stemmed_dict = None

    def prepare_text(self, doc):
        """Turn a raw document string into a list of filtered tokens.

        Strips every character that is neither a word character nor
        whitespace, lowercases the text, tokenizes it with
        ``nltk.word_tokenize`` and removes tokens found in ``self.stopwords``.
        """
        doc = re.sub(r'[^\w\s]', '', doc)
        doc = doc.lower()
        tokens = nltk.word_tokenize(doc)
        return [word for word in tokens if word not in self.stopwords]

    @staticmethod
    def _build_stem_dict(unique_words, stem_pairs):
        """Build the token -> stem mapping from the stemmer output.

        ``stem_pairs`` is an iterable of ``(original, stems)`` pairs where
        ``stems`` is an iterable of candidate stems; the first candidate is
        used. A word with no candidates, or a word the stemmer did not
        return at all, maps to itself.
        """
        stem_dict = {}
        for item in stem_pairs:
            original = item[0]
            candidates = list(item[1])
            # No candidate stem: keep the word unchanged instead of reusing
            # whatever stem the previous word happened to produce.
            stem_dict[original] = candidates[0] if candidates else original

        for word in unique_words:
            stem_dict.setdefault(word, word)
        return stem_dict

    def apply_stemming(self, df):
        """Populate ``self.stemmed_dict`` for every token found in ``df``.

        ``df`` is an iterable of token lists (e.g. the tokenized column).
        Each distinct token is stemmed once with Morfologik.
        """
        parser = ListParser()
        stemmer = Morfologik()

        unique_words = list({word for doc in df for word in doc})
        stem_pairs = stemmer.stem(unique_words, parser)
        self.stemmed_dict = self._build_stem_dict(unique_words, stem_pairs)

    @staticmethod
    def remove_empty(df, col):
        """Return a copy of ``df`` without rows whose ``col`` value is empty."""
        # .copy() makes the result an independent frame, so later column
        # assignments do not trigger pandas' SettingWithCopy warning.
        return df.loc[df[col].apply(len) > 0, :].copy()

    def transform(self, X, col, **kwargs):
        """Return a tokenized and stemmed copy of ``X``.

        Column ``col`` is replaced by lists of stemmed tokens, rows that end
        up with no tokens are dropped and the index is reset. The stem
        dictionary is built on the first call and reused afterwards.

        Raises:
            KeyError: if ``col`` is not a column of ``X``.
        """
        X_local = X.copy()
        # Only the column lookup is guarded, so a KeyError raised while
        # processing a document is not misreported as a missing column.
        try:
            raw_docs = X_local[col]
        except KeyError:
            raise KeyError("{} not present in dataframe".format(col))

        X_local[col] = raw_docs.apply(self.prepare_text)
        X_local = self.remove_empty(X_local, col)

        if not self.stemmed_dict:
            self.apply_stemming(X_local[col])

        stem_of = self.stemmed_dict.get
        # Tokens missing from an already-built dictionary stay unchanged
        # rather than turning into None.
        X_local[col] = X_local[col].apply(
            lambda doc: [stem_of(word, word) for word in doc])
        X_local = X_local.reset_index(drop=True)

        return X_local

### Zdefiniowanie stałych

In [None]:
# Input files: labelled learning set, stop-word list, word2vec embeddings.
DATA_PATH = './data/learning_set.csv'
STOPWORDS_PATH = 'data/polish_stopwords.csv'
W2V_PATH = 'data/nkjp+wiki-forms-all-100-cbow-hs.txt'

# Single seed shared by the split, the CV folds and the model.
RANDOM_STATE = 23032019

# Metric and parallelism passed to the scikit-learn helpers below.
SCORING = 'accuracy'
N_JOBS = -1

# 10-fold stratified, shuffled cross-validation splitter.
C_V = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=RANDOM_STATE,
)

### Wczytanie dokumentów

In [None]:
# Load the learning set from DATA_PATH; the file is semicolon-delimited.
df = pd.read_csv(DATA_PATH, sep = ';')

In [None]:
# Share of rows per sentiment value: per-group counts divided by the row total.
df.groupby('sentiment').count() / len(df)

In [None]:
# Stop words come from the first column of the header-less CSV. They are kept
# in a set because TokenizedDataFrame.prepare_text tests every token for
# membership, which is O(1) on a set and O(n) on a list.
stopwords = set(pd.read_csv(STOPWORDS_PATH, engine='python', header=None).iloc[:, 0])


# Tokenize, filter and stem the 'token' column; rows left without tokens are dropped.
df = TokenizedDataFrame(stopwords).transform(df, 'token')

### Podział na zbiór treningowy/testowy

#### y_train/y_test używamy do sklearn API, dummy do Kerasa

In [None]:
# Hold out 15% of the documents as the test split (shuffled, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(df.token,
                                                    df.sentiment,
                                                    test_size=0.15,
                                                    shuffle=True,
                                                    random_state=RANDOM_STATE)

# Integer-encode the labels; the encoder is fitted on the training labels only.
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_y_train = encoder.transform(y_train)
encoded_y_test = encoder.transform(y_test)

# One-hot targets for Keras. The column count is pinned to the number of
# classes known to the encoder; otherwise to_categorical infers it from the
# largest index present, and the two matrices could end up with different widths.
n_classes = len(encoder.classes_)
dummy_y_train = np_utils.to_categorical(encoded_y_train, num_classes=n_classes)
dummy_y_test = np_utils.to_categorical(encoded_y_test, num_classes=n_classes)

In [None]:
# TF-IDF over the already-tokenized documents: tokenizer and preprocessor are
# identity functions, features are uni- and bi-grams with sublinear tf scaling.
tfidf_step = TfidfVectorizer(
    analyzer='word',
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    ngram_range=(1, 2),
    sublinear_tf=True,
)

# L2-penalized multinomial logistic regression fitted with lbfgs.
lr_step = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    multi_class='multinomial',
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

model_lr = Pipeline([('tfidf', tfidf_step), ('lr', lr_step)])

model_lr.fit(x_train, y_train)


### Ile wynosi null accuracy?

In [None]:
# Null accuracy: share of the most frequent label in the training split.
majority_count = y_train.value_counts().max()
print("{:.1f}%".format(100 * majority_count / len(y_train)))

### Jakie accuracy na 10-krotnej CV osiąga regresja logistyczna?

In [None]:
# Mean accuracy of the logistic-regression pipeline over the C_V test folds.
# Only the test scores are read, so scoring the training folds is switched off
# instead of being computed and thrown away.
cv_results = cross_validate(model_lr,
                            x_train,
                            y_train,
                            cv=C_V,
                            scoring=SCORING,
                            n_jobs=N_JOBS,
                            return_train_score=False)
cv_results['test_score'].mean()

### Accuracy na zbiorze testowym

In [None]:
# Accuracy of the fitted pipeline on the held-out test split.
test_predictions = model_lr.predict(x_test)
accuracy_score(y_test, test_predictions)

### Wykres krzywej uczenia

In [None]:
# Learning curve of the pipeline, using the same folds, seed, parallelism and
# metric as the cross-validation above.
learning_curve_options = dict(
    cv=C_V,
    random_state=RANDOM_STATE,
    n_jobs=N_JOBS,
    scoring=SCORING,
)
skplt.estimators.plot_learning_curve(model_lr, x_train, y_train,
                                     **learning_curve_options)
plt.show()

### Wykres słów najbardziej stymulujących skrajny sentyment według regresji

In [None]:
lr_step = model_lr.named_steps.get('lr')
tfidf_step = model_lr.named_steps.get('tfidf')

coefficients = lr_step.coef_
index = coefficients.argsort()  # ascending order within every coefficient row

feature_names = np.array(tfidf_step.get_feature_names())

# Only row 0 of coef_ is plotted: its 30 lowest and 31 highest coefficients.
# NOTE(review): row 0 presumably belongs to the first class in lr.classes_ -- confirm.
lowest = index[0][:30]
highest = index[0][-31:]
selected = np.concatenate([lowest, highest])

feature_names_comb = list(feature_names[selected])
index_comb = list(coefficients[0][selected])

positions = list(range(61))

plt.figure(figsize=(25, 10))
barlist = plt.bar(positions, index_comb)
plt.xticks(positions, feature_names_comb, rotation=75, size=15)
plt.ylabel('Coefficient magnitude', size=20)
plt.xlabel('Features', size=20)

# color the last 31 bars (the highest coefficients) red
for position in positions[30:]:
    barlist[position].set_color('red')

plt.show()

### Zdefiniowanie funkcji wczytującej embeddingi, klasy procesującej dane do sieci neuronowej i samej sieci.

In [None]:
def init(): # function to load word embedding only once 
    """Load the embeddings from W2V_PATH into the module-level ``w2vModel``.

    The result of ``load_w2v_embeddings`` is stored in a global so that other
    code can reuse it without loading the file again.
    """
    global w2vModel
    w2vModel = load_w2v_embeddings(W2V_PATH)

        
def get_embeddings():
    """Return the global ``w2vModel``, calling ``init()`` first if it is not set yet."""
    global w2vModel
    # The global only exists once init() has run (or it was assigned elsewhere).
    if 'w2vModel' not in globals():
        init()
    return w2vModel


def load_w2v_embeddings(path):
    """Read word vectors stored in the text (non-binary) word2vec format.

    Args:
        path: location of the embeddings file.

    Returns:
        The object returned by gensim's ``KeyedVectors.load_word2vec_format``.
    """
    return gensim.models.KeyedVectors.load_word2vec_format(path, binary=False)


def plot_training(history): # plot training history
    """Plot training and validation accuracy and loss per epoch.

    Draws two 12x12 figures, 'model accuracy' and 'model loss', each with the
    training curve and its ``val_``-prefixed counterpart.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by Keras ``model.fit``).

    Raises:
        KeyError: if the accuracy, loss or matching ``val_`` series is missing.
    """
    recorded = history.history

    # Keras records the metric under the name given at compile time, so the
    # accuracy series may be stored as 'acc' or as 'accuracy'.
    acc_key = 'acc' if 'acc' in recorded else 'accuracy'

    panels = (
        ('model accuracy', 'accuracy', acc_key),
        ('model loss', 'loss', 'loss'),
    )
    for title, y_label, key in panels:
        plt.figure(figsize=(12, 12))
        plt.plot(recorded[key])
        plt.plot(recorded['val_' + key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

### Jak wczytać dane do challenge'u?

In [None]:
# Read the challenge set and push it through the same tokenization pipeline.
# A fresh TokenizedDataFrame is used, so its stem dictionary is built from the
# challenge documents; rows left without tokens are dropped by transform.
scoring_tokenizer = TokenizedDataFrame(stopwords)
scoring_set = pd.read_csv("./data/challenge_set_warsztat.csv", sep=';')
scoring_set = scoring_tokenizer.transform(scoring_set, 'token')

### Jak stworzyć i zapisać predykcję?

In [None]:
# for the sklearn API
pred = model_lr.predict(scoring_set['token'])

In [None]:
# for Keras
# The arg-max column index is mapped back to the original sentiment label
# through the fitted LabelEncoder, instead of a hard-coded "-1" offset that is
# only right when the labels are exactly -1, 0 and 1.
class_indices = model.predict(scoring_set.token).argmax(axis=1)
pred = encoder.inverse_transform(class_indices)

In [None]:
# Sanity check on the length of the prediction vector before it is saved.
# NOTE(review): 5022 is presumably the number of rows in the challenge set -- confirm.
# TokenizedDataFrame.transform drops rows whose token list ends up empty, so a
# shorter vector here means some challenge documents were removed.
assert len(pred) == 5022

### Wektor zapisany przez np.save należy przesłać do nas na e-mail

In [None]:
# Write the prediction vector to disk with NumPy.
# NOTE(review): 'nickname' looks like a placeholder to be replaced with the
# participant's own nickname -- confirm.
np.save('nickname', pred)