## Models training helper functions

This notebook contains the functions required to train new models

As it is right now, it reads the file:
```
"Data/Iterative-models-building/Training data/Conventions/training_aggregated_conventions.tsv"
```
and stores its content in a dataframe variable named 'training_df'.

Files in GATHERED_DATA_FOLDER are combined with the content of 'training_df' to build the vocabulary of the Tokenizer because, by design, the Keras Tokenizer silently drops at tokenization time (`texts_to_sequences`) any word it did not see when it was fitted (`fit_on_texts`).




In [1]:
# Required Python utilities
import numpy as np
import pandas as pd

from collections import Counter
import re
from langdetect import detect
from bs4 import BeautifulSoup
from markdown import markdown
from lxml import etree
import os
import random
import tqdm
import itertools 
import pickle

from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

## Deep Learning imports for the classifiers
os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Concatenate
from keras.models import Model

## ML required imports (for clustering)
from sklearn import metrics
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.preprocessing import scale, StandardScaler
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# Topic modeling imports
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## NLP related imports
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer


# visualization imports
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import base64
import io
%matplotlib inline
sns.set() 


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Module-level flag intended for callers that load this notebook from another
# notebook. It is not referenced by any of the cells shown here.
imported_notebook = True ## Set this flag to true if importing this code from another notebook

In [3]:
# Folder holding the pre-computed GloVe vectors (glove.6B.<dim>d.txt), read by read_glove_embeddings().
GLOVE_DIR = "Data/Iterative-models-building/Training data/"
# Folder scanned for "gathered_*" TSV files whose sentences extend the tokenizer vocabulary.
GATHERED_DATA_FOLDER = "Data/Iterative-models-building/Gathered_data/Conventions/"
# Folder where the pickled models/tokenizers are written to and read from.
MODELS_DIR = "./Data/Iterative-models-building/Models/"

In [43]:
## Classification networks configuration parameters
# Length every token sequence is padded/truncated to (pad_sequences maxlen, Embedding input_length).
MAX_SEQUENCE_LENGTH = 40
# Default `num_words` handed to the Keras Tokenizer in create_tokenizer().
MAX_NB_WORDS = 10000
# Width of the embedding vectors; also selects which glove.6B.<dim>d.txt file is read.
EMBEDDING_DIM = 100 ## 100, 200 or 300
# Fraction of each group held out for validation when no validation frame is supplied.
VALIDATION_SPLIT = 0.2

# `max_features` of the CountVectorizer used by the Naive-Bayes pipeline.
TFIDF_MAX_FEATURES=10000

# Default number of training epochs for the deep-learning models.
NUM_EPOCHS = 20

In [5]:
## Vocabulary loading
def read_glove_embeddings():
    """Load the pre-computed GloVe word vectors into a dictionary.

    Reads ``glove.6B.<EMBEDDING_DIM>d.txt`` from ``GLOVE_DIR``. Every line is
    split on whitespace: the first field is the word, the remaining fields are
    parsed as its float32 coefficients.

    Returns:
        dict: word (str) -> 1-D ``numpy.ndarray`` of dtype float32.

    Raises:
        OSError: if the GloVe file cannot be opened.
    """
    embeddings_index = {}
    glove_path = os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(EMBEDDING_DIM))

    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; `with` guarantees the handle is closed even if parsing fails.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    return embeddings_index

# Loaded once at module level; the model-building functions look words up in
# this dictionary when filling their embedding matrix.
embeddings_index = read_glove_embeddings()

In [6]:
## Stop words list 
def get_stop_words(stop_file_path):
    """Read a UTF-8 stop-word file (one entry per line) into a frozenset.

    Leading and trailing whitespace is stripped from every line.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as handle:
        return frozenset(entry.strip() for entry in handle.readlines())
# Load the project's stop-word set once at module level; it is passed as
# `stop_words` to the CountVectorizer of the Naive-Bayes pipeline.
stopwords=get_stop_words("Data/Iterative-models-building/Training data/resources/stopwords.txt")

In [7]:
## Text cleaning functions
def clean_line(X):
    """Normalise one piece of text for the classifiers.

    Steps, in order: replace every non-word character with a space, drop
    isolated single letters, collapse runs of whitespace, lower-case, and
    lemmatise each remaining token with NLTK's ``WordNetLemmatizer``.

    Args:
        X: the text to clean; it is converted with ``str(X)`` first, so
            non-string values are accepted.

    Returns:
        str: the cleaned tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()

    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X))

    # Remove all single characters surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single character at the start of the text. The caret must be an
    # unescaped anchor: the previous pattern r'\^...' looked for a literal '^',
    # which the \W substitution above has already removed, so it never matched.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)

    # Removing prefixed 'b' (left over from str() of a bytes object)
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization, token by token
    return ' '.join(lemmatizer.lemmatize(word) for word in document.split())

def clean_str(text):
    """Clean a single string or an iterable of strings with ``clean_line``.

    Args:
        text: either one string, or an iterable (e.g. a list) of items that
            ``clean_line`` can handle.

    Returns:
        str:
          * for a string: the cleaned string;
          * for an iterable with exactly one item: that item, cleaned;
          * for an iterable with several items: every cleaned item followed
            by a newline, concatenated (so the result ends with a newline);
          * for an empty iterable: the empty string.
    """
    # The previous check `type(text == "list")` was always truthy, so a plain
    # string was iterated character by character. Test for str explicitly.
    if isinstance(text, str):
        return clean_line(text)

    documents = [clean_line(X) for X in text]
    if not documents:
        # Previously raised IndexError on documents[0].
        return ""
    if len(documents) > 1:
        return "".join(d + "\n" for d in documents)
    return documents[0]


In [8]:
## Helper function to create a tokenizer given a data frame with a column "text"
def create_tokenizer(df, max_words=MAX_NB_WORDS):
    """Return a Keras ``Tokenizer`` fitted on the ``text`` column of ``df``.

    ``max_words`` is forwarded as the tokenizer's ``num_words``; the listed
    punctuation characters, tabs and newlines are filtered out.
    """
    punctuation_filter = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    fitted_tokenizer = Tokenizer(num_words=max_words, filters=punctuation_filter)
    fitted_tokenizer.fit_on_texts(df['text'].values)
    return fitted_tokenizer

In [9]:
## Calculate AUC for validation when training the model
## As explained here: https://stackoverflow.com/questions/41032551/how-to-compute-receiving-operating-characteristic-roc-and-auc-in-keras
class roc_callback(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after every epoch.

    Args:
        validation_data: a pair ``(x_val, y_val)``; ``x_val`` is fed to
            ``model.predict`` and ``y_val`` is compared with the predictions
            through ``sklearn.metrics.roc_auc_score``.
    """

    def __init__(self, validation_data):
        # Let the Keras base class initialise its own attributes first.
        super(roc_callback, self).__init__()
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    # The other hooks (train/epoch/batch begin, train/batch end) were no-op
    # overrides; the inherited Callback implementations are used instead.

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the rounded ROC-AUC."""
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)

        print("Roc-AUC on validation: {}".format(str(round(roc_val,4))))
        return

In [10]:
## Training a deep learning model given training, validation data
def train_DL_model(x_train, y_train, x_val, y_val, tokenizer, num_epochs=NUM_EPOCHS):
    """Build and fit the multi-filter 1-D convolutional classifier, with validation.

    The network is: frozen embedding layer -> three parallel Conv1D/MaxPooling
    branches (kernel sizes 3, 4 and 5) concatenated along the sequence axis ->
    Conv1D -> MaxPooling -> Flatten -> Dense(128) -> Dense(2, softmax).

    Args:
        x_train, x_val: padded token-id sequences of length MAX_SEQUENCE_LENGTH.
        y_train, y_val: label matrices; presumably one-hot with two columns,
            to match the 2-unit softmax output -- confirm against the caller.
        tokenizer: fitted tokenizer; only its ``word_index`` is read here.
        num_epochs: number of epochs passed to ``model.fit``.

    Returns:
        tuple: ``(model, tokenizer, x_val, y_val, train_history)``.
    """
    word_index = tokenizer.word_index

    print('\nNumber of elements from each class in traing and validation set ')
    print(y_train.sum(axis=0))
    print(y_val.sum(axis=0))

    # Every row starts with uniform random values; rows of words that have a
    # GloVe vector are overwritten with it. Words missing from the embedding
    # index therefore keep their random row (they are NOT all-zeros).
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    convs = []
    filter_sizes = [3,4,5]

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    for fsz in filter_sizes:
        # Keras 2 keyword names (filters / kernel_size), consistent with the
        # positional Conv1D(128, 5) call below; nb_filter / filter_length are
        # the deprecated Keras 1 spellings.
        l_conv = Conv1D(filters=128, kernel_size=fsz, activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)

    l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
    l_pool1 = MaxPooling1D(5)(l_cov1)

    l_flat = Flatten()(l_pool1)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(2, activation='softmax')(l_dense)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    print("model fitting - more complex convolutional neural network")
    model.summary()
    train_history = model.fit(x_train, y_train,
                              validation_data=(x_val, y_val),
                              epochs=num_epochs,
                              batch_size=50,
                              callbacks=[roc_callback(validation_data=(x_val, y_val))])

    return model,tokenizer,x_val,y_val, train_history


In [11]:
## Trains a DL model without using validation dataset
def train_DL_model_not_validation(x_train, y_train, tokenizer, num_epochs=NUM_EPOCHS):
    """Build and fit the multi-filter 1-D convolutional classifier without validation.

    Same architecture as ``train_DL_model``: frozen embedding layer -> three
    parallel Conv1D/MaxPooling branches (kernel sizes 3, 4 and 5) concatenated
    along the sequence axis -> Conv1D -> MaxPooling -> Flatten -> Dense(128)
    -> Dense(2, softmax).

    Args:
        x_train: padded token-id sequences of length MAX_SEQUENCE_LENGTH.
        y_train: label matrix; presumably one-hot with two columns, to match
            the 2-unit softmax output -- confirm against the caller.
        tokenizer: fitted tokenizer; only its ``word_index`` is read here.
        num_epochs: number of epochs passed to ``model.fit``.

    Returns:
        tuple: ``(model, tokenizer, train_history)``.
    """
    word_index = tokenizer.word_index

    # There is no validation set in this function: the former
    # `print(y_val.sum(axis=0))` referenced an undefined name and raised
    # NameError, so only the training class counts are reported.
    print('\nNumber of elements from each class in the training set ')
    print(y_train.sum(axis=0))

    # Every row starts with uniform random values; rows of words that have a
    # GloVe vector are overwritten with it. Words missing from the embedding
    # index therefore keep their random row (they are NOT all-zeros).
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    convs = []
    filter_sizes = [3,4,5]

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    for fsz in filter_sizes:
        # Keras 2 keyword names (filters / kernel_size), consistent with the
        # positional Conv1D(128, 5) call below; nb_filter / filter_length are
        # the deprecated Keras 1 spellings.
        l_conv = Conv1D(filters=128, kernel_size=fsz, activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)

    l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
    l_pool1 = MaxPooling1D(5)(l_cov1)

    l_flat = Flatten()(l_pool1)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(2, activation='softmax')(l_dense)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    print("model fitting - more complex convolutional neural network")
    model.summary()
    train_history = model.fit(x_train, y_train,epochs=num_epochs, batch_size=50)
    return model,tokenizer,train_history


In [12]:
## Trains one DL model for each group of sentences (within each convention)
def _encode_texts_and_labels(df, tokenizer, data_label_column):
    """Turn a frame into (padded token-id matrix, categorical label matrix)."""
    sequences = tokenizer.texts_to_sequences(df['text'].values)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(df[data_label_column].values))
    return data, labels


def train_DL_models(df_train,
                    data_class_column="convention",
                    data_label_column="label",
                    df_val=None,
                    tokenizer=None,
                    random_seed=None,
                    use_validation=True, num_epochs=NUM_EPOCHS):
    """Train one deep-learning model per distinct value of ``data_class_column``.

    Args:
        df_train: training frame with a ``text`` column plus the class and
            label columns named below.
        data_class_column: column whose distinct values define the groups;
            one model is trained per group.
        data_label_column: column holding the labels of each sentence.
        df_val: optional validation frame with the same columns. When omitted
            (and ``use_validation`` is true) the last VALIDATION_SPLIT fraction
            of each shuffled group is held out instead.
        tokenizer: optional fitted tokenizer. When omitted, one is created
            from ``df_train`` (plus ``df_val`` if given).
        random_seed: if not None, seeds ``np.random`` before each shuffle.
        use_validation: when false, every row of a group is used for training
            and no validation data is produced.
        num_epochs: forwarded to the per-group training function.

    Returns:
        tuple of dicts keyed by group value:
        ``(_models, _tokenizers, _data_val_x, _data_val_y, _train_histories)``.
        The two validation dicts stay empty when ``use_validation`` is false.
    """
    _models = {}
    _tokenizers = {}
    _data_val_x = {}
    _data_val_y = {}
    _train_histories = {}

    if tokenizer is None:
        if df_val is None:
            tokenizer = create_tokenizer(df_train)
        else:
            tokenizer = create_tokenizer(pd.concat([df_train, df_val]))

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    # Iterate over the groups of the *training* frame. The loop used to read an
    # undefined name `df`, which raised NameError (or silently picked up an
    # unrelated global of that name).
    for convention in df_train[data_class_column].unique():

        print("----------------------------------------------------------------")
        print("            {}                  ".format(convention))
        print("----------------------------------------------------------------")

        tmp_df_train = df_train[df_train[data_class_column] == convention]

        if use_validation:
            if df_val is None:
                data, labels = _encode_texts_and_labels(tmp_df_train, tokenizer, data_label_column)
                print('Shape of data tensor:', data.shape)
                print('Shape of label tensor:', labels.shape)

                # Shuffle rows, then hold out the tail as validation data.
                indices = np.arange(data.shape[0])
                if random_seed is not None:
                    np.random.seed(random_seed)
                np.random.shuffle(indices)
                data = data[indices]
                labels = labels[indices]
                nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

                # NOTE(review): for a group so small that nb_validation_samples
                # is 0, `[:-0]` yields an empty training set and `[-0:]` puts
                # every row in validation -- confirm groups are large enough.
                x_train = data[:-nb_validation_samples]
                y_train = labels[:-nb_validation_samples]
                x_val = data[-nb_validation_samples:]
                y_val = labels[-nb_validation_samples:]
            else:
                tmp_df_val = df_val[df_val[data_class_column] == convention]

                x_train, y_train = _encode_texts_and_labels(tmp_df_train, tokenizer, data_label_column)
                x_val, y_val = _encode_texts_and_labels(tmp_df_val, tokenizer, data_label_column)

            _model, _tokenizer, _x_val, _y_val, _train_h = train_DL_model(
                x_train, y_train, x_val, y_val, tokenizer, num_epochs=num_epochs)

            _models[convention] = _model
            _tokenizers[convention] = _tokenizer
            _data_val_x[convention] = _x_val
            _data_val_y[convention] = _y_val
            _train_histories[convention] = _train_h
        else:
            x_train, y_train = _encode_texts_and_labels(tmp_df_train, tokenizer, data_label_column)
            print('Shape of data tensor:', x_train.shape)
            print('Shape of label tensor:', y_train.shape)

            _model, _tokenizer, _train_h = train_DL_model_not_validation(
                x_train, y_train, tokenizer, num_epochs=num_epochs)

            _models[convention] = _model
            _tokenizers[convention] = _tokenizer
            _train_histories[convention] = _train_h

        print("\n\n\n")

    return (_models, _tokenizers, _data_val_x, _data_val_y, _train_histories)

In [13]:
def store_DL_models_in_picke(pickle_f_name, _models, _tokenizers, _val_x, _val_y, _train_histories):
    """Pickle the trained models and their companion data into MODELS_DIR.

    The five arguments after the file name are stored in one dictionary under
    the keys ``'model'``, ``'tokenizer'``, ``'_x_val'``, ``'_y_val'`` and
    ``'train_history'`` -- the layout ``read_DL_models_from_pickle`` expects.

    Args:
        pickle_f_name: file name (relative to MODELS_DIR) to write.
        _models, _tokenizers, _val_x, _val_y, _train_histories: the objects
            to store; they are pickled as given.
    """
    # Store the *arguments*: the previous body read undefined globals
    # (_conventions_models, ...) and ignored every parameter.
    _convnet_items = {
        'model': _models,
        'tokenizer': _tokenizers,
        '_x_val': _val_x,
        '_y_val': _val_y,
        'train_history': _train_histories,
    }

    # The file name was missing from the join, so the directory itself was opened.
    with open(os.path.join(MODELS_DIR, pickle_f_name), 'wb') as f:
        # Pickle the dictionary using the highest protocol available.
        pickle.dump(_convnet_items, f, pickle.HIGHEST_PROTOCOL)
        

In [14]:
def read_DL_models_from_pickle(pickle_f_name):
    """Load models and companion data previously pickled into MODELS_DIR.

    Args:
        pickle_f_name: file name (relative to MODELS_DIR) to read.

    Returns:
        tuple: ``(_models, _tokenizers, _val_x, _val_y, _train_histories)``,
        taken from the keys ``'model'``, ``'tokenizer'``, ``'_x_val'``,
        ``'_y_val'`` and ``'train_history'`` of the pickled dictionary.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if the pickled dictionary lacks one of the keys above.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were produced by this project.
    # `with` closes the handle even when unpickling raises.
    with open(os.path.join(MODELS_DIR, pickle_f_name), 'rb') as f:
        _convnet_items = pickle.load(f)

    return (_convnet_items['model'],
            _convnet_items['tokenizer'],
            _convnet_items['_x_val'],
            _convnet_items['_y_val'],
            _convnet_items['train_history'])

#_models, _tokenizers, _val_x, _val_y, _train_histories = read_DL_models_from_pickle('conv_models_items.pickle')

In [16]:
## Trains a TF-IDF, Naive-Bayes based classifier
def train_new_text_pipelineNB(texts, labels1):
    """Fit a bag-of-words -> TF-IDF -> multinomial Naive-Bayes pipeline.

    Args:
        texts: the training documents.
        labels1: the target label of each document.

    Returns:
        The fitted scikit-learn ``Pipeline`` with steps ``'vect'``,
        ``'tfidf'`` and ``'clf'``.
    """
    # CountVectorizer documents `stop_words` as 'english', a list, or None;
    # recent scikit-learn releases validate this and reject a frozenset, so
    # the module-level stop-word set is handed over as a (sorted) list.
    if stopwords is None or isinstance(stopwords, str):
        stop_list = stopwords
    else:
        stop_list = sorted(stopwords)

    text_clf = Pipeline([
        ('vect', CountVectorizer(max_df=0.85, stop_words=stop_list, max_features=TFIDF_MAX_FEATURES)),
        ('tfidf', TfidfTransformer(smooth_idf=True,use_idf=True)),
        ('clf', MultinomialNB()),
    ])

    text_clf.fit(texts, labels1)

    return text_clf

In [45]:
def train_ML_models(df,
                    data_class_column="convention",
                    data_label_column="label"):
    """Fit one Naive-Bayes text pipeline per distinct value of ``data_class_column``.

    For every group, the ``text`` values and the ``data_label_column`` values
    of its rows are handed to ``train_new_text_pipelineNB``.

    Returns:
        dict: group value -> fitted pipeline, in order of first appearance.
    """
    trained_by_group = {}

    for group_value in df[data_class_column].unique():
        group_rows = df[df[data_class_column] == group_value]
        trained_by_group[group_value] = train_new_text_pipelineNB(
            group_rows['text'].values,
            group_rows[data_label_column].values,
        )

    return trained_by_group

In [44]:
## Helpers for calculating ML model probability score of class 1 for a set of sentences
def calculate_matches_DL(sentences, _models, _tokenizers):
    """Score ``sentences`` with every deep-learning model.

    For each key of ``_models`` the sentences are converted to sequences by
    the tokenizer stored under the same key, padded to MAX_SEQUENCE_LENGTH and
    fed to the model's ``predict``.

    Returns:
        dict: model key -> column 1 of that model's predictions.
    """
    scores_by_model = {}

    for key, model in _models.items():
        sequences = _tokenizers[key].texts_to_sequences(sentences)
        padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
        # Keep only the second output column (index 1) of the prediction matrix.
        scores_by_model[key] = model.predict(padded)[:, 1]

    return scores_by_model

def calculate_matches_ML(_sentences, _models):
    """Score ``_sentences`` with every classical ML model.

    Returns:
        dict: model key -> column 1 of that model's ``predict_proba`` output.
    """
    # One entry per model, in the dictionary's own key order.
    return {
        key: classifier.predict_proba(_sentences)[:, 1]
        for key, classifier in _models.items()
    }


## Calculates probability of sentence combining DL models and ML models predictions
def calculate_matches_mixture(_sentences, _modelsML, _modelsDL, _tokenizersDL):
    """Combine the ML and DL scores of ``_sentences`` by adding them.

    The ML scores are computed first, then the DL scores. For every key of
    ``_modelsML`` the two score arrays are summed element-wise, so the result
    is a sum of two scores, not a normalised probability.

    Returns:
        dict: model key -> ML score + DL score.
    """
    ml_scores = calculate_matches_ML(_sentences, _modelsML)
    dl_scores = calculate_matches_DL(_sentences, _modelsDL, _tokenizersDL)

    # Keys are driven by the ML models; the DL scores must contain the same keys.
    return {key: ml_scores[key] + dl_scores[key] for key in _modelsML.keys()}

In [41]:
def remove_line_breaks(text_str):
    """Return ``text_str`` with every literal backslash-n pair replaced by a space.

    Only the two-character escaped form is replaced; real newline characters
    are left untouched.
    """
    return text_str.replace("\\n", " ")
    
def read_training_data(path="Data/Iterative-models-building/Training data/Conventions/training_aggregated_conventions.tsv"):
    """Read the tab-separated training file into a data frame.

    Args:
        path: location of the TSV file. Defaults to the aggregated
            conventions training file, so existing no-argument calls keep
            working; pass another path to load a different training set.

    Returns:
        pandas.DataFrame: the file's content, with ``remove_line_breaks``
        applied to every value of its ``text`` column.
    """
    training_df = pd.read_csv(path, sep="\t")

    training_df['text'] = training_df['text'].apply(remove_line_breaks)
    return training_df

# Loaded once at module level; the tokenizer-building cell appends this frame
# to the gathered data so its sentences are part of the vocabulary.
training_df = read_training_data()




In [20]:
# Regular, non-hidden files of GATHERED_DATA_FOLDER whose name contains
# "gathered_". os.listdir returns names in arbitrary order, so the list is
# sorted to make the tokenizer vocabulary reproducible across machines.
gathered_data_files = sorted(
    f for f in os.listdir(GATHERED_DATA_FOLDER)
    if (os.path.isfile(os.path.join(GATHERED_DATA_FOLDER, f))
        and not f.startswith('.')
        and "gathered_" in f)
)
gathered_dfs = []

for f_name in gathered_data_files:
    tmp_df = pd.read_csv(os.path.join(GATHERED_DATA_FOLDER, f_name), sep="\t")
    # Gathered files name their sentence column "sentence"; the tokenizer
    # helper reads the column called "text".
    tmp_df = tmp_df.rename(columns={"sentence": "text"})

    gathered_dfs.append(tmp_df)


gathered_dfs.append(training_df)##Adding training data sentences

# The frames do not all share the same columns; sort=False states the column
# ordering explicitly and silences pandas' FutureWarning about the changing
# default. Only the "text" column is used by create_tokenizer.
extended_tokenizer = create_tokenizer(pd.concat(gathered_dfs, sort=False))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  app.launch_new_instance()
