In [2]:
import os
import re
import random
import glob

import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize


## Deep Learning imports for the classifiers
os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Concatenate
from keras.models import Model

##Â ML required imports (for clustering)
from sklearn import metrics
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.preprocessing import scale, StandardScaler
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer


# visualization imports
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import base64
import io
%matplotlib inline
sns.set() 

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [17]:
FLAG__AGGREGATE_CONVENTIONS_SENTENCES_FROM_GITHUB = True
FLAG__GENERATE_SENTECE_TOKENIZED_DOCS=True
FLAG__CREATE_KEYWORDS = True
FLAG__ON_BINDER = False


box_gstd = widgets.Checkbox(True, description='FLAG__GENERATE_SENTECE_TOKENIZED_DOCS')
box_acsfg = widgets.Checkbox(True, description='FLAG__AGGREGATE_CONVENTIONS_SENTENCES_FROM_GITHUB')
box_binder = widgets.Checkbox(False, description='Running on binder')

def update_box_gstd(change):
    global FLAG__GENERATE_SENTECE_TOKENIZED_DOCS
    FLAG__GENERATE_SENTECE_TOKENIZED_DOCS= change['new']
box_gstd.observe(update_box_gstd, 'value')

def update_box_acsfg(change):
    global FLAG__AGGREGATE_CONVENTIONS_SENTENCES_FROM_GITHUB
    FLAG__AGGREGATE_CONVENTIONS_SENTENCES_FROM_GITHUB = change['new']

box_acsfg.observe(update_box_acsfg, 'value')

def update_box_binder(change):
    global FLAG__ON_BINDER
    FLAG__ON_BINDER = change['new']

box_binder.observe(update_box_binder, 'value')

#display(box_acsfg, box_gstd, filePath1, filePath2, filePath3)
display(box_acsfg, box_gstd, box_binder)

Checkbox(value=True, description='FLAG__AGGREGATE_CONVENTIONS_SENTENCES_FROM_GITHUB')

Checkbox(value=True, description='FLAG__GENERATE_SENTECE_TOKENIZED_DOCS')

Checkbox(value=False, description='Running on binder')

In [18]:
if FLAG__ON_BINDER == True:
    ROOT_DIR = "/home/jovyan/cp_wssc"
else:
    ROOT_DIR = "."
    
print (ROOT_DIR)

.


In [19]:
BASE_DIR_SCI_ARTICLES = ROOT_DIR + "/Data/VALIDATED_DATA/Conventions/scientific_articles/"
BASE_DIR_GIT_HUB =  ROOT_DIR + "/Data/VALIDATED_DATA/Conventions/github"
BASE_DIR_AGGREGATED =  ROOT_DIR + "/Data/VALIDATED_DATA/Conventions/aggregated/"
KEYWORDS_DIR =  ROOT_DIR + "/Data/VALIDATED_DATA/Conventions/keywords/"


if not os.path.exists(BASE_DIR_SCI_ARTICLES):
    os.makedirs(BASE_DIR_SCI_ARTICLES)

if not os.path.exists(BASE_DIR_GIT_HUB):
    os.makedirs(BASE_DIR_GIT_HUB)

if not os.path.exists(BASE_DIR_AGGREGATED):
    os.makedirs(BASE_DIR_AGGREGATED)
    
if not os.path.exists(KEYWORDS_DIR):
    os.makedirs(KEYWORDS_DIR)
    
    
NUM_KEYWORDS_PER_CONV = 30

In [5]:
#obtain list of files in alphabetic order
justif_files = sorted([f for f in os.listdir(BASE_DIR_SCI_ARTICLES) if (os.path.isfile(os.path.join(BASE_DIR_SCI_ARTICLES, f)) and not f.startswith( '.' ) and not "random" in f and not "training_" in f and not "aggregated_" in f and not "splitted_" in f)])

In [6]:
def pre_process(text):
    
    # lowercase
    text=text.lower()
    text = text.replace("b'", "")
    
    ## Removing strings such as \\xe5 \\xe6 \\xe7 that appear a lot in the descriptions
    text = re.sub(r':?\\+x\w{2}', ' ', text, flags=re.MULTILINE)
     #text = text.replace('+',' ')
    text = re.sub('-', ' ', text, flags=re.MULTILINE) # Added by Aideen
    text = re.sub(' +', ' ', text, flags=re.MULTILINE) # Added by Aideen
    text = text.replace('+',' ')
    
    return text

In [7]:
#obtain list of files in alphabetic order
conv_files = sorted([f for f in os.listdir(BASE_DIR_GIT_HUB) if (os.path.isfile(os.path.join(BASE_DIR_GIT_HUB, f)) and not f.startswith( '.' ) and f.endswith( '.txt' ) and not "random" in f and not "training_" in f)])

In [8]:
print(BASE_DIR_SCI_ARTICLES)
display(justif_files)
print ("")
print(BASE_DIR_GIT_HUB)
display (conv_files)

./DATA/VALIDATED_DATA/Conventions/scientific_articles/


['Civic.txt',
 'Domestic.txt',
 'Green.txt',
 'Industrial.txt',
 'Inspired.txt',
 'Market.txt',
 'Project.txt',
 'Renown.txt']


./DATA/VALIDATED_DATA/Conventions/github


['Civic.txt',
 'Domestic.txt',
 'Green.txt',
 'Industrial.txt',
 'Inspired.txt',
 'Market.txt',
 'Project.txt',
 'Renown.txt']

# Agregate Scientific Article sentences with  conventions sentences

In [9]:
if FLAG__AGGREGATE_CONVENTIONS_SENTENCES_FROM_GITHUB:
    docs = []
      #iterate through each file name in BASE_DIR_SCI_ARTICLES and BASE_DIR_GITHUB 
      #create a new file aggregated_{convention name} which 
      #contains aggregated content already tokenized into sentences using sent_tokenize
    for i,x in enumerate(justif_files):
        texts = []
        source =  []
        convs = []
        
        
        print("Parsing ", x)
        with open(os.path.join(BASE_DIR_GIT_HUB, x), 'rb') as f2:
            data2 = str(f2.read()).replace("\\n", " ")
            data2 = re.sub(' +', ' ', data2)
            docs.append(pre_process(data2)) 
            tokenised_into_sentences = sent_tokenize(data2)
            print ("number of sentences from github: ", len(tokenised_into_sentences))
            for s in tokenised_into_sentences:
                texts.append(pre_process(s))
                source.append ("github")
                convs.append(x.replace(".txt", ""))
            f2.close() 
            
        
        with open(os.path.join(BASE_DIR_SCI_ARTICLES, x), 'rb') as f:
             #data = str(f.read())
            data = str(f.read()).replace("\\n", " ")
            data = re.sub(' +', ' ', data)
            #replace docs[i] with docs[i]+ the new data
            docs.append(docs.pop(i)+pre_process(data)) 
            tokenised_into_sentences = sent_tokenize(data)
            print ("number of sentences from scientific_articles ", len(tokenised_into_sentences))
            for s in tokenised_into_sentences:
                texts.append(pre_process(s))
                source.append ("justification")
                convs.append(x.replace(".txt", ""))
            f.close() 
        

        with open(os.path.join(
                                BASE_DIR_AGGREGATED, 
                                'aggregated_{}'.format(x.replace(".txt", ""))+".tsv"),
                                'w+') as f3: 
            f3.write("{}\t{}\t{}\n".format("sentence", "provenance", "convention"))
            for text, source, conv in zip (texts, source, convs):
                #f3.write(t.replace("\\n", "")+ '\n')
                sentence = text.replace("\\n", "").replace('\t', ' ')  # + '\n'
                f3.write("{}\t{}\t{}\n".format(sentence, source, conv.lower()))
            f3.close()
            
    #read newly created aggregated files into list sorted alphabetically      
    aggregated_files = sorted([f for f in os.listdir(BASE_DIR_AGGREGATED) if (os.path.isfile(os.path.join(BASE_DIR_AGGREGATED, f)) and not f.startswith( '.' ) and not "random" in f and not "training_" in f and "aggregated_" in f)])
    print("")
    print("New aggregated files created in folder ", BASE_DIR_AGGREGATED, ": " )
    display(aggregated_files)


Parsing  Civic.txt
number of sentences from github:  47
number of sentences from scientific_articles  174
Parsing  Domestic.txt
number of sentences from github:  20
number of sentences from scientific_articles  185
Parsing  Green.txt
number of sentences from github:  1
number of sentences from scientific_articles  85
Parsing  Industrial.txt
number of sentences from github:  309
number of sentences from scientific_articles  153
Parsing  Inspired.txt
number of sentences from github:  16
number of sentences from scientific_articles  113
Parsing  Market.txt
number of sentences from github:  100
number of sentences from scientific_articles  179
Parsing  Project.txt
number of sentences from github:  54
number of sentences from scientific_articles  300
Parsing  Renown.txt
number of sentences from github:  8
number of sentences from scientific_articles  145

New aggregated files created in folder  ./DATA/VALIDATED_DATA/Conventions/aggregated/ : 


['aggregated_Civic.tsv',
 'aggregated_Domestic.tsv',
 'aggregated_Green.tsv',
 'aggregated_Industrial.tsv',
 'aggregated_Inspired.tsv',
 'aggregated_Market.tsv',
 'aggregated_Project.tsv',
 'aggregated_Renown.tsv']

In [10]:
convention_sentences_df = pd.concat([pd.read_csv(f, sep='\t') for f in glob.glob(BASE_DIR_AGGREGATED+'*.tsv')], ignore_index = True)

display(convention_sentences_df.head(5))
display(convention_sentences_df.tail(5))

Unnamed: 0,sentence,provenance,convention
0,in view of the issues with the swagger codegen...,github,civic
1,this is an easter egg concerned with the com...,github,civic
2,we would like to take this opportunity to than...,github,civic
3,"and of course, we couldn\'t do this without [o...",github,civic
4,"remember that this is a community project, pe...",github,civic


Unnamed: 0,sentence,provenance,convention
1878,retreat outside dreams leads to a fall.,justification,inspired
1879,beings in the state of unworthiness are define...,justification,inspired
1880,they are also qualified by properties that exp...,justification,inspired
1881,knowledge acquired through education the routi...,justification,inspired
1882,the polity comes undone when the temptation to...,justification,inspired


In [11]:
document_lengths = np.array(list(map(len, convention_sentences_df.sentence.str.split(' '))))

print("The average number of words in a document is: {}.".format(np.mean(document_lengths)))
print("The minimum number of words in a document is: {}.".format(min(document_lengths)))
print("The maximum number of words in a document is: {}.".format(max(document_lengths)))

The average number of words in a document is: 28.66171003717472.
The minimum number of words in a document is: 1.
The maximum number of words in a document is: 269.


# USE TF_IDF to create keywords per convention
TF-IDF stands for “Term Frequency — Inverse Data Frequency”. 

Term Frequency (tf): gives us the frequency of the word in each document in the corpus. It is the ratio of number of times the word appears in a document compared to the total number of words in that document. It increases as the number of occurrences of that word within the document increases. Each document has its own tf.

Inverse Data Frequency (idf): used to calculate the weight of rare words across all documents in the corpus. The words that occur rarely in the corpus have a high IDF score. 

For equations see:
https://www.freecodecamp.org/news/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3/

In [12]:
cv=CountVectorizer(max_df=0.85,max_features=10000)
word_count_vector=cv.fit_transform(docs)
list(cv.vocabulary_.keys())[:10]

['view',
 'issues',
 'swagger',
 'codegen',
 'beta',
 'release',
 'disagreement',
 'direction',
 '40',
 'top']

In [13]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [14]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results
 

In [15]:
# you only needs to do this once, this is a mapping of index to 
feature_names=cv.get_feature_names()

keywords_for_df = []
convention_for_df = []
for i,f in enumerate(aggregated_files):
    # get the document that we want to extract keywords from
    doc = docs[i]
    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 30
    keywords=extract_topn_from_vector(feature_names,sorted_items,NUM_KEYWORDS_PER_CONV)

    # now add the result to dataframe
    for k in keywords:
        convention_for_df.append(f.replace('aggregated_', '').replace('.tsv','').lower())
        keywords_for_df.append(k)

zippedList =  list(zip(keywords_for_df, convention_for_df))
keywords_df = pd.DataFrame(zippedList, columns = ['keyword' , 'convention'])
display (keywords_df.head(20))

Unnamed: 0,keyword,convention
0,community,civic
1,collective,civic
2,civic,civic
3,solidarity,civic
4,interests,civic
5,union,civic
6,workers,civic
7,representative,civic
8,chapter,civic
9,equality,civic


# Store keywords as CSV file 

In [16]:
path = KEYWORDS_DIR+'keywords.csv'
display ("storing keywords.csv at path:", path)
keywords_df.to_csv(path,index = None, header=True)

'storing keywords.csv at path:'

'./DATA/VALIDATED_DATA/Conventions/keywords/keywords.csv'

In [None]:
keywords=extract_topn_from_vector(feature_names,sorted_items,15)

# STOP HERE (REVIEW REST OF CODE TO CREATE ONE DATA SOURCE )

## Training models on justification text

In [None]:
## Deep Learning models config
## Classificaiton NETWORKs Configuration parameters
MAX_SEQUENCE_LENGTH = 32
MAX_NB_WORDS = 10000
EMBEDDING_DIM = 100 ## 100, 200 or 300
VALIDATION_SPLIT = 0.2

NUM_EPOCHS = 20

In [None]:
## Reading GLOVE (precalculated word embeddings)

GLOVE_DIR = "/Users/aideenf/Documents/GitHub/Economy_of_Conventions/glove.6B/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(EMBEDDING_DIM)))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [None]:
def rreplace(s, old, new, occurrence):
    li = s.rsplit(old, occurrence)
    return new.join(li)

In [None]:
#Apply one hot encoding to column "convention"

display (df.head(5))
print("Applying one hot encoding to convention")
df["convention"] = pd.Categorical(df["convention"])
dfDummies = pd.get_dummies(df["convention"], prefix="convention")
display (dfDummies.head(5))
df_final = pd.concat([df, dfDummies], axis=1)
#df_final = df.drop(value, axis=1)
display (df_final.head(5))


#Generate the data frames for each model

#df_by_convention = df[df.convention == 'Green']
#display (df_by_convention.sample(n = 3))



In [None]:
## This code generates a training file for each characteristic by 
## concetanting sentences for convention X annotated with label one 
## and sentences from all the other convention that are annotated with label 0.



##It generates a training_{convention_name}.csv file for each convention
def generate_training_files(data_dir, data_files):
    for file_name in data_files:

        f = open(os.path.join(data_dir,file_name), "r")
        original_lines = f.readlines()
        
        original_lines2=[]
        for l in original_lines:
                for l2 in l.split("."):
                    original_lines2.append(l2)
        f.close()


        """
        Step 1: Generate four random numbers between 0 and 1
        Step 2: Add these four numbers
        Step 3: Divide each of the four numbers by the sum,
        Step 4: Multiply by 100, and round to the nearest integer.
        """
        num_original_lines = len(original_lines2)*2

        num_random_lines = np.random.uniform(0,1,len(data_files))
        num_random_lines = (num_random_lines / sum(num_random_lines))*num_original_lines

        tmp_all_files = [f for f in data_files if f != file_name]

        random_lines = []    
        for idx, file_name2 in enumerate(tmp_all_files):
            f2 = open(os.path.join(data_dir,file_name2), "r")
            tmp_lines = f2.readlines()
            
            tmp_lines2 = []
            for l in tmp_lines:
                for l2 in l.split("."):
                    tmp_lines2.append(l2)
            f2.close()

            for i in range(int(num_random_lines[idx])):
                random_lines.append(random.choice(tmp_lines2))

        with open(rreplace(os.path.join(data_dir,file_name), "/", "/training_",1), "w") as f3:
            f3.write("{}\t{}\n".format("category", "text"))

            for idx, l in enumerate(original_lines2):
                l = l.replace('\n', ' ').replace('\t', ' ')
                f3.write("{}\t\"{}\"\n".format(1, l))


            for idx, l2 in enumerate(random_lines):
                l2 = l2.replace('\n', '').replace('\t', ' ')
                f3.write("{}\t\"{}\"\n".format(0, l2))
            f3.close()

In [None]:
generate_training_files(BASE_DIR, justif_files)

In [None]:
if FLAG__AGGREGATE_CONVENTIONS_SENTENCES_FROM_GITHUB:
    justif_training_files = [f for f in os.listdir(BASE_DIR) if (os.path.isfile(os.path.join(BASE_DIR, f)) and not f.startswith( '.' ) and not "random" in f and "training_aggregated" in f and not "splitted_" in f)]
    print(justif_training_files)
else:
    justif_training_files = [f for f in os.listdir(BASE_DIR) if (os.path.isfile(os.path.join(BASE_DIR, f)) and not f.startswith( '.' ) and not "random" in f and not "training_aggregated" in f and "training_" in f and not "splitted_" in f)]
    print(justif_training_files)

In [None]:
def create_tokenizer(DATA_DIR, data_files, max_words=MAX_NB_WORDS):
    texts = []
    for d in data_files:
        data_train = pd.read_csv(os.path.join(DATA_DIR,d), sep='\t')
    
        for idx in range(data_train.text.shape[0]):
            text = data_train.text[idx]
            texts.append(str(text))
    
    _tokenizer = Tokenizer(num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    _tokenizer.fit_on_texts(texts)
    
    return _tokenizer 
    

In [None]:
conventions_tokenizer = create_tokenizer(BASE_DIR, justif_training_files)

In [None]:
def read_file_and_train_model(DATA_DIR, data_file, tokenizer=None, num_epochs=NUM_EPOCHS):
    ## USING licensing text from github
    data_train = pd.read_csv(os.path.join(DATA_DIR,data_file), sep='\t')
    print(data_train.shape)

    data_train = data_train

    texts = []
    labels = []

    for idx in range(data_train.text.shape[0]):
        text = data_train.text[idx]#BeautifulSoup(data_train.text[idx])
        texts.append(str(text))#clean_str(text))

        labels.append(data_train.category[idx])
        
    if tokenizer is None:
        tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
    else:
        print(" -- Tokenizer has not been retrained")


    sequences = tokenizer.texts_to_sequences(texts)
    #embeddings_index = {}

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    
    print('\nNumber of elements from each class in traing and validation set ')
    print(y_train.sum(axis=0))
    print(y_val.sum(axis=0))
    
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    """
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
    l_pool2 = MaxPooling1D(5)(l_cov2)
    #l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
    #l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
    l_flat = Flatten()(l_pool2)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(2, activation='softmax')(l_dense)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    print("model fitting - simplified convolutional neural network")
    model.summary()
    train_history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=num_epochs, batch_size=128)
    """
    # applying a more complex convolutional approach
    convs = []
    filter_sizes = [3,4,5]

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    for fsz in filter_sizes:
        l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)#Merge(mode='concat', concat_axis=1)(convs)
    l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    #l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
    #l_pool2 = MaxPooling1D(30)(l_cov2)
    #l_flat = Flatten()(l_pool2)
    l_flat = Flatten()(l_pool1)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(2, activation='softmax')(l_dense)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    print("model fitting - more complex convolutional neural network")
    model.summary()
    train_history = model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=num_epochs, batch_size=50)
    
    return model,tokenizer,x_val,y_val, train_history

In [None]:
_conventions_models = {}
_conventions_tokenizers = {}
_conventions_data_val_x = {}
_conventions_data_val_y = {}
_conventions_train_histories = {}

for f in justif_training_files:


    print("----------------------------------------------------------------")
    print("            {}                  ".format(f))
    print("----------------------------------------------------------------")

    _model, _tokenizer, _x_val, _y_val, _train_h = read_file_and_train_model(BASE_DIR, f, tokenizer=conventions_tokenizer)


    _conventions_models[f] = _model
    _conventions_tokenizers[f] = _tokenizer
    _conventions_data_val_x[f] = _x_val
    _conventions_data_val_y[f] = _y_val
    _conventions_train_histories[f] = _train_h

    print("\n\n\n")

## Analying models precisions

In [None]:
def get_model_matches(sequences, model):
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    preds = model.predict(data)

    return preds

def count_positive_preds(preds):
    count = 0
    for j in range(len(preds)):
    
        count += int(preds[j][1]>preds[j][0])
    return count

def count_positive_preds_with_threshold(preds, threshold=0.5):
    count = 0
    for j in range(len(preds)):
    
        count += int(preds[j][1]>=threshold)
    return count

def get_positive_preds_with_threshold(preds, threshold=0.5):

    preds_results = []
    for j in range(len(preds)):
        int_pred = int(preds[j][1]>=threshold)
        
        preds_results.append(int_pred)
    return preds_results

def calculate_matches(repositories_descriptions, _models, _tokenizers, model_preds_func=count_positive_preds, threshold=None):
    _repos_matches = []

    ##Â Counting number of matches per model for each repo
    for r_description in repositories_descriptions:
        r_description = r_description.split("\n")

        this_repo_matches = []
        for model_key in _models.keys():
            matches=0

            tokenized_r_description = _tokenizers[model_key].texts_to_sequences(r_description)

            preds = get_model_matches(tokenized_r_description, _models[model_key])

            if threshold is None:
                num_matches = model_preds_func(preds)
            else:
                num_matches = model_preds_func(preds, threshold)

            this_repo_matches.append(num_matches)

        _repos_matches.append(this_repo_matches)
        
    return _repos_matches
    

In [None]:
def plot_matches_histogram(_matches, _models, title=None, ):

    freqs = np.array(_matches).sum(axis=0)
    elems = [k.replace("training_", "").replace(".txt", "") for k in list(_models.keys())]

    unique_elems = list(set(elems))

    f, ax = plt.subplots()


    plt.barh(unique_elems, freqs)
    if title is None:
        plt.title("Model matches per repository")
    else:
        plt.title("{} model matches per repository".format(title))
    plt.ylabel("Classifiers")
    plt.xlabel("Frequency")
    
    return freqs

In [None]:
thresholds = np.linspace(0,1,100)

_conventions_matches_precissions = {}


for k in _conventions_models.keys():
    
    tmp_precissions = []
    
    my_model = _conventions_models[k]
    my_data_x = _conventions_data_val_x[k]
    my_data_y = [int(x[0]<x[1]) for x in _conventions_data_val_y[k]]

    preds = my_model.predict(my_data_x)
    
    for t in thresholds:
        y_pred = get_positive_preds_with_threshold(preds,t)
        
        tmp_precissions.append(metrics.precision_score(my_data_y, y_pred))
        

    _conventions_matches_precissions[k] = tmp_precissions

In [None]:
plt.figure(figsize=(18,7))

equalized_conv_precissions_threshold=0.725
for k in _conventions_matches_precissions:
    x = thresholds
    y = _conventions_matches_precissions[k]
    
    plt.plot(x,y, marker='', linewidth=2, linestyle='dashed', label="toto")
    
plt.legend(["Precission for "+x.replace("training_", "").replace(".txt", "")+" model"
            for x in list(_conventions_matches_precissions.keys())], fontsize=15)

plt.title("Comparing precission scores for all classifiers", fontdict={'fontsize':20}, pad=25)
plt.axvline(x=equalized_conv_precissions_threshold, color='black', linestyle="-.")

plt.show()

for k in _conventions_matches_precissions:
    print("--{}--".format(k))
    print(_conventions_matches_precissions[k][int(equalized_conv_precissions_threshold*100)])



## Models evaluation on Github Sentences

In [None]:
github_sentences = {}

for i,x in enumerate(justif_files):
    texts = []
    
    data_train = pd.read_csv(os.path.join(BASE_DIR_GIT_HUB,"training_"+x), sep='\t')
    conv = x.replace(".txt", "")
    github_sentences[conv] = data_train


In [None]:
github_sentences['Industrial'].head()

In [None]:
thresholds = np.linspace(0,1,100)

_conventions_matches_github_precissions = {}


for k in _conventions_models.keys():
    
    tmp_precissions = []
    
    
    my_model = _conventions_models[k]
    
    conv = k.replace("training_", "").replace(".txt", "")
    data_train = github_sentences[conv]
    
    
    sequences = conventions_tokenizer.texts_to_sequences(data_train["text"].values)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    
    
    my_data_x = data
    my_data_y = data_train["category"].values
    

    preds = my_model.predict(my_data_x)
    
    for t in thresholds:
        y_pred = get_positive_preds_with_threshold(preds,t)
        
        tmp_precissions.append(metrics.precision_score(my_data_y, y_pred))
        

    _conventions_matches_github_precissions[conv] = tmp_precissions

In [None]:
plt.figure(figsize=(18,7))

equalized_conv_precissions_github_threshold=0.738
for k in _conventions_matches_github_precissions:
    x = thresholds
    y = _conventions_matches_github_precissions[k]
    
    plt.plot(x,y, marker='', linewidth=2, linestyle='dashed', label="toto")
    
plt.legend(["Precission for "+x.replace("training_", "").replace(".txt", "")+" model"
            for x in list(_conventions_matches_precissions.keys())], fontsize=15)

plt.title("Comparing precission scores for all classifiers", fontdict={'fontsize':20}, pad=25)
plt.axvline(x=equalized_conv_precissions_github_threshold, color='black', linestyle="-.")

plt.show()


for k in _conventions_matches_github_precissions:
    print("--{}--".format(k))
    print(_conventions_matches_github_precissions[k][int(equalized_conv_precissions_github_threshold*100)])