# Import packages

In [5]:
import pandas as pd

from keras.layers import Input, Dense
from keras.models import Model
from keras import optimizers
from keras.callbacks import EarlyStopping

from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.model_selection import train_test_split

#from nltk import word_tokenize
from nltk.stem.snowball import FrenchStemmer

#import string
import string
import spacy

import numpy as np 
from nltk.tokenize import word_tokenize
from collections import Counter

import time
import numpy as np
from scipy import stats

# Data Import 

In [39]:
# Split the raw data into training and validation sets
def data_splite(datapath, test_size=0.2, random_state=None):
    """Split the raw training CSV into a training file and a validation file.

    Reads ``<datapath>/row_data/train.csv`` with explicit column names
    (polarity, headline, content), so its first line is treated as data, and
    writes ``<datapath>/train.csv`` and ``<datapath>/valid.csv``.

    The two output files are written without the DataFrame index and without
    a header row, so they keep the same three-column layout as the input and
    can be read back with ``names=headers`` (which is how ``load_dataset``
    reads them). Writing the index/header would add a fourth column and turn
    the header line into a spurious data row on re-read.

    Args:
        datapath: Directory containing ``row_data/train.csv``; the split
            files are written directly into it.
        test_size: Fraction (or absolute number) of rows placed in the
            validation file; forwarded to ``train_test_split``.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. ``None`` keeps the original behaviour.
    """
    headers = ['polarity', 'headline', 'content']
    df = pd.read_csv(datapath + '/row_data/train.csv', encoding="utf-8", sep=",", names=headers)
    train_part, valid_part = train_test_split(df, test_size=test_size, random_state=random_state)
    train_part.to_csv(datapath + '/train.csv', sep=',', encoding='utf-8', index=False, header=False)
    valid_part.to_csv(datapath + '/valid.csv', sep=',', encoding='utf-8', index=False, header=False)

# Runs the split when this cell executes: the raw CSV under this directory is
# re-read and train.csv / valid.csv are written next to it.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data_splite('C:/Users/aso/Documents/WPy-376/notebooks/NLP - Python/data')


def load_dataset(filename):
    """Load a CSV of labelled texts and return a two-column DataFrame.

    The file is read with explicit column names (polarity, headline,
    content), so its first line is treated as data.

    The headline and the content are joined with a single space into a
    ``text`` column. Missing headline/content values are treated as empty
    strings, so a row with only one of the two fields still yields usable
    text instead of NaN, and the last word of the headline is not glued to
    the first word of the content.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        DataFrame with exactly the columns ``polarity`` and ``text``, in that
        order, indexed like the rows that were read.
    """
    headers = ['polarity', 'headline', 'content']
    sentences = pd.read_csv(filename, encoding="utf-8", sep=",", names=headers)
    # Empty CSV fields are parsed as NaN; normalise them before concatenating.
    headline = sentences['headline'].fillna('').astype(str)
    content = sentences['content'].fillna('').astype(str)
    # strip() removes the separator when one of the two parts is empty.
    text = (headline + ' ' + content).str.strip()
    return pd.DataFrame({'polarity': sentences['polarity'], 'text': text})

# Data processing

### Classifier class

In [40]:
# Load the spaCy pipeline through its installed model package; the older
# shortcut call is kept commented out below for reference.
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
#nlp = spacy.load('en')
import en_core_web_sm
nlp = en_core_web_sm.load()

    def __init__(self):
        """Set up an untrained classifier with its fixed hyper-parameters.

        The label set and the network stay ``None`` until training assigns
        them. The TF-IDF vectorizer is configured for word unigrams and
        bigrams, capped at ``max_features`` entries, and tokenizes with
        ``self.mytokenize``.
        """
        # Training hyper-parameters.
        self.epochs = 22
        self.batchsize = 64
        self.max_features = 8000

        # State filled in during training.
        self.labelset = None
        self.model = None
        self.label_binarizer = LabelBinarizer()

        # Vectorizer options gathered in one place before construction.
        tfidf_options = dict(
            max_features=self.max_features,
            analyzer="word",
            tokenizer=self.mytokenize,
            stop_words=None,
            ngram_range=(1, 2),
            binary=False,
            preprocessor=None,
        )
        self.vectorizer = TfidfVectorizer(**tfidf_options)

### Tokenizer

In [41]:
def mytokenize(self, text, max_occurrence=14):
    """Turn one document into the list of stemmed tokens used as features.

    Steps: word-tokenize, lower-case, strip punctuation, keep only
    alphabetic tokens longer than one character that are not in a small
    ignore list, stem, and finally drop every token that occurs more than
    ``max_occurrence`` times in this document.

    The returned list keeps the original token order and repetitions.
    Returning only the distinct tokens would flatten every term frequency
    to 1 (although the vectorizer is configured with ``binary=False``) and
    would build bigrams from words that were never adjacent.

    Args:
        text: The raw document string.
        max_occurrence: Highest per-document count a token may have and
            still be kept.

    Returns:
        List of stemmed tokens, in document order.
    """
    # split into lower-cased words
    tokens = [w.lower() for w in word_tokenize(text)]

    # remove punctuation from each word; the two slices skip index 6 of
    # string.punctuation (the apostrophe), so apostrophes survive this step
    table = str.maketrans('', '', string.punctuation[0:6] + string.punctuation[7:])
    tokens = [w.translate(table) for w in tokens]

    # keep alphabetic tokens longer than one character, minus specific tokens
    ignored = {'the', 's'}
    tokens = [w for w in tokens if w.isalpha() and len(w) > 1 and w not in ignored]

    # stemming of words
    # NOTE(review): a French stemmer is applied although the ignore list above
    # holds English words — confirm the corpus language.
    stemmer = FrenchStemmer()
    tokens = [stemmer.stem(w) for w in tokens]

    # drop tokens that are too frequent in this document, keeping the order
    # and multiplicity of the remaining ones
    counts = Counter(tokens)
    return [w for w in tokens if counts[w] <= max_occurrence]
    

    def feature_count(self):
        """Return the number of entries in the vectorizer's vocabulary."""
        vocabulary = self.vectorizer.vocabulary_
        return len(vocabulary)

### Model creation

In [42]:
       
    def create_model(self):
        """Build, compile and return the feed-forward network.

        The input width is ``self.feature_count()``; a single 32-unit tanh
        layer feeds a softmax layer with one unit per entry of
        ``self.labelset``. The model is compiled with Adam and categorical
        cross-entropy, and its summary is printed before it is returned.
        """
        features = Input((self.feature_count(),))
        hidden = Dense(32, activation='tanh')(features)
        probabilities = Dense(len(self.labelset), activation='softmax')(hidden)

        network = Model(inputs=features, outputs=probabilities)
        network.compile(
            optimizer=optimizers.Adam(),
            loss='categorical_crossentropy',
            metrics=['accuracy'],
        )
        network.summary()
        return network


    def vectorize(self, texts):
        """Transform ``texts`` with the vectorizer and return the result of its ``toarray()``."""
        transformed = self.vectorizer.transform(texts)
        return transformed.toarray()


    def train_on_data(self, texts, labels, valtexts=None, vallabels=None):
        """Fit the label encoder, the vectorizer and the network on the given data.

        Args:
            texts: Training documents.
            labels: Training labels, one per document.
            valtexts: Optional validation documents.
            vallabels: Optional validation labels; validation is only used
                when both ``valtexts`` and ``vallabels`` are given.

        Side effects:
            Sets ``self.labelset`` and ``self.model``, fits
            ``self.label_binarizer`` and ``self.vectorizer``, and prints the
            label set.
        """
        Y_train = self.label_binarizer.fit_transform(labels)
        self.labelset = set(self.label_binarizer.classes_)
        print("LABELS: %s" % self.labelset)
        two_class = len(self.label_binarizer.classes_) == 2

        def as_one_hot(Y):
            # The binarizer encodes a two-class problem as a single 0/1
            # column, while the network has one softmax unit per label.
            # Expand to two columns (first class, second class) so the
            # target shape matches the network output.
            if two_class and Y.ndim == 2 and Y.shape[1] == 1:
                return np.hstack([1 - Y, Y])
            return Y

        Y_train = as_one_hot(Y_train)
        self.vectorizer.fit(texts)
        # create a model to train (needs the fitted vocabulary size)
        self.model = self.create_model()
        # for each text example, build its vector representation
        X_train = self.vectorize(texts)

        if valtexts is not None and vallabels is not None:
            X_val = self.vectorize(valtexts)
            Y_val = as_one_hot(self.label_binarizer.transform(vallabels))
            valdata = (X_val, Y_val)
            # Early stopping watches 'val_loss', which only exists when
            # validation data is passed to fit().
            my_callbacks = [
                EarlyStopping(monitor='val_loss', min_delta=0, patience=3,
                              verbose=0, mode='auto', baseline=None)
            ]
        else:
            valdata = None
            my_callbacks = []

        # Train the model!
        self.model.fit(
            X_train, Y_train,
            epochs=self.epochs,
            batch_size=self.batchsize,
            callbacks=my_callbacks,
            validation_data=valdata,
            verbose=2)

    def predict_on_X(self, X):
        """Return the model's predictions for the already-vectorized input ``X``."""
        predictions = self.model.predict(X)
        return predictions

    def predict_on_data(self, texts):
        """Vectorize ``texts``, run the model, and map its output back to labels.

        The model output is passed to the label binarizer's
        ``inverse_transform`` and that result is returned.
        """
        scores = self.model.predict(self.vectorize(texts))
        return self.label_binarizer.inverse_transform(scores)

### Training and prediction

In [43]:
    def train(self, trainfile, valfile=None):
        """Train on the dataset stored in ``trainfile``.

        Args:
            trainfile: Path passed to ``load_dataset`` for the training data.
            valfile: Optional path of a validation dataset; when it is falsy
                no validation texts or labels are passed on.
        """
        train_frame = load_dataset(trainfile)

        # Validation inputs default to "absent" and are filled only on request.
        val_texts, val_labels = None, None
        if valfile:
            val_frame = load_dataset(valfile)
            val_texts, val_labels = val_frame['text'], val_frame['polarity']

        self.train_on_data(train_frame['text'], train_frame['polarity'], val_texts, val_labels)


    def predict(self, datafile):
        """Load ``datafile`` with ``load_dataset`` and return predictions for its ``text`` column."""
        texts = load_dataset(datafile)['text']
        return self.predict_on_data(texts)

# Fit model

In [47]:

def set_reproducible():
    """Fix the seeds this notebook controls so repeated runs behave the same.

    Sets the ``PYTHONHASHSEED`` environment variable to ``'0'``, seeds
    NumPy's global generator with 17 and Python's ``random`` module with
    12345.
    """
    import os
    import random

    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up —
    # confirm that assigning it here has the intended effect.
    os.environ.update({'PYTHONHASHSEED': '0'})
    random.seed(12345)
    np.random.seed(17)

def eval_list(glabels, slabels):
    """Return the accuracy, in percent, of system labels against gold labels.

    Both inputs are compared position by position. They are first
    materialised as plain lists, so a pandas Series whose index is not
    0..n-1 is still compared in row order instead of being looked up by
    index label.

    Args:
        glabels: Gold labels (any iterable, e.g. list, array or Series).
        slabels: System labels (any iterable).

    Returns:
        Percentage of the first ``min(len(glabels), len(slabels))`` positions
        whose labels agree; ``0.0`` when there is nothing to compare. A
        warning is printed when the two lengths differ.
    """
    gold = list(glabels)
    system = list(slabels)
    if len(gold) != len(system):
        print("\nWARNING: label count in system output (%d) is different from gold label count (%d)\n" % (
            len(system), len(gold)))
    n = min(len(system), len(gold))
    if n == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    # zip() stops at the shorter list, i.e. after exactly n pairs.
    incorrect_count = sum(1 for g, s in zip(gold, system) if s != g)
    acc = (n - incorrect_count) / n
    return acc * 100


def train_and_eval_dev_test(trainfile, devfile, testfile, run_id, classifier=None):
    """Train once, then report accuracy on the dev set and optionally the test set.

    Args:
        trainfile: Path of the training CSV.
        devfile: Path of the dev/validation CSV (also used as validation
            data during training).
        testfile: Path of the test CSV, or ``None`` to skip test evaluation.
        run_id: Identifier printed in the progress messages.
        classifier: Optional object exposing ``train(trainfile, valfile)``
            and ``predict(datafile)``. When omitted, the module-level
            ``train`` and ``predict`` names are called as before.

    Returns:
        Tuple ``(devacc, testacc)`` of accuracies in percent; ``testacc`` is
        ``-1`` when ``testfile`` is ``None``.
    """
    # NOTE(review): the `train` / `predict` definitions visible in this
    # notebook take `self` as their first parameter, so calling them unbound
    # binds the training path to `self` — this matches the AttributeError in
    # the saved output. Pass a classifier instance to use its bound methods.
    fit = train if classifier is None else classifier.train
    infer = predict if classifier is None else classifier.predict

    print("\n")
    # Training
    print("RUN: %s" % str(run_id))
    print("  %s.1. Training the classifier..." % str(run_id))
    fit(trainfile, devfile)
    print()
    print("  %s.2. Evaluation on the dev dataset..." % str(run_id))
    slabels = infer(devfile)
    glabels = load_dataset(devfile)['polarity']
    devacc = eval_list(glabels, slabels)
    print("       Acc.: %.2f" % devacc)
    testacc = -1
    if testfile is not None:
        # Evaluation on the test data
        print("  %s.3. Evaluation on the test dataset..." % str(run_id))
        slabels = infer(testfile)
        # Gold labels must come from the test file itself, not the dev file.
        glabels = load_dataset(testfile)['polarity']
        testacc = eval_list(glabels, slabels)
        print("       Acc.: %.2f" % testacc)
    print()
    return (devacc, testacc)


In [48]:
# Seed the random generators before any training run.
set_reproducible()
# NOTE(review): absolute, machine-specific data directory; the relative
# alternative is kept commented out on the next line.
#datadir = "../data/"
datadir = "C:/Users/aso/Documents/WPy-376/notebooks/NLP - Python/data/"
# Dataset paths handed to train_and_eval_dev_test.
trainfile =  datadir + "train.csv"
devfile =  datadir + "valid.csv"
testfile =  datadir + "test.csv"
# Uncomment to skip the test-set evaluation (it only runs when testfile is not None).
#testfile = None

In [49]:
# Basic checking
start_time = time.perf_counter()
n = 2
devaccs = []
testaccs = []
# Train and evaluate n times, collecting the accuracies of every run.
for i in range(n):
    res = train_and_eval_dev_test(trainfile, devfile, testfile, i+1)
    devaccs.append(res[0])
    testaccs.append(res[1])

# Summary is reported once, after all runs have finished, so the
# "Completed n runs" line and the means/timing describe the full experiment.
print('\nCompleted %d runs.' % n)
print("Dev accs:", devaccs)
print("Test accs:", testaccs)
print()
print("Mean Dev Acc.: %.2f (%.2f)\tMean Test Acc.: %.2f (%.2f)" % (np.mean(devaccs), np.std(devaccs), np.mean(testaccs), np.std(testaccs)))
print("\nExec time: %.2f s." % (time.perf_counter()-start_time))




RUN: 1
  1.1. Training the classifier...


AttributeError: 'str' object has no attribute 'train_on_data'