# Iterative AIVC models building

### Implemented pipeline:

#### 1.- Data gathering
#### 2.- Classify data (probabilistic classification)
#### 3.- Sample data from each stratum of classifications (high, medium and low confidence)
#### 4.- Manually validate data
#### 5.- Append data to training files
#### 6.- Retrain new models
#### 7.- Gather new data

In [30]:
import pandas as pd
import numpy as np

import os

import pickle

## Deep Learning imports for the classifiers
os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Concatenate
from keras.models import Model

In [47]:
## Classification networks configuration parameters
# MAX_SEQUENCE_LENGTH is the `maxlen` passed to pad_sequences before prediction.
MAX_SEQUENCE_LENGTH = 32
# NOTE(review): the remaining constants are not referenced by any cell visible
# in this notebook section; presumably they are consumed by the (re)training
# step -- confirm before changing or removing them.
MAX_NB_WORDS = 10000
EMBEDDING_DIM = 100 ## 100, 200 or 300
VALIDATION_SPLIT = 0.2

NUM_EPOCHS = 20

This notebook expects the following hierarchy of files:

- BASE_DIR
     - Gathered_data
         - Conventions
         - Software_characteristics
     - Classification_results
         - Conventions
         - Software_characteristics
     - Training_data
         - Conventions
         - Software_characteristics

In [29]:
# Folder holding the pickled classifier bundles loaded in the classification step.
MODELS_DIR = "./Data/VALIDATED_DATA/"

# Root of the working tree for the iterative pipeline.
BASE_DIR = "./Data/Iterative-models-building"
GATHERED_DATA_FOLDER = os.path.join(BASE_DIR, "Gathered_data")

# Sub-folder names, reused for both the gathered data and the result files.
CONVS_DIR_NAME = "Conventions"
SOFT_CHARS_DIR_NAME = "Software_characteristics"

# Folders the gathered sentence files are read from.
GATHERED_SOFTWARE_CHARS_DIR = os.path.join(GATHERED_DATA_FOLDER, SOFT_CHARS_DIR_NAME)
GATHERED_CONVENTIONS_DIR = os.path.join(GATHERED_DATA_FOLDER, CONVS_DIR_NAME)

In [58]:
# NOTE(review): not referenced by any cell visible in this notebook section;
# presumably the percentage of classified sentences to sample for manual
# validation -- TODO confirm.
RESULTS_SAMPLING_PERCENTAJE = 30

## 1.- Data gathering

Data should be gathered from several different sources to ensure high variability. It is important to keep track of where each sentence comes from (add a provenance label).
Identified data sources:
    - Google
    - Github
    - Semantic Scholar
    
    


In [36]:
## Helper functions
def rreplace(s, old, new, occurrence):
    """Replace occurrences of `old` in `s` with `new`, starting from the right.

    `occurrence` is forwarded as the maximum number of splits to
    `str.rsplit`, so at most that many right-most matches are replaced.
    """
    return new.join(s.rsplit(old, occurrence))


def clean_file_name(name, replacements2=None):
    """Strip known file extensions (and optional extra tokens) from `name`.

    Every occurrence of ".txt", ".csv" and ".tsv" is removed from `name`,
    wherever it appears, followed by every occurrence of each token in
    `replacements2`.

    Args:
        name: file name (or model key) to clean.
        replacements2: optional iterable of additional substrings to remove.
            Defaults to None (no extra tokens); a None default avoids sharing
            one mutable list object across calls.

    Returns:
        The cleaned string.
    """
    tokens_to_remove = [".txt", ".csv", ".tsv"]
    if replacements2:
        tokens_to_remove.extend(replacements2)

    for token in tokens_to_remove:
        name = name.replace(token, "")

    return name

### 1.1 Read conventions data

In [41]:
# Maps each cleaned file name to the DataFrame read from that file.
gathered_conventions_data = {}

# Keep regular, non-hidden files whose name contains "gathered_" but not "random".
gathered_conventions_files = []
for entry in os.listdir(GATHERED_CONVENTIONS_DIR):
    if not os.path.isfile(os.path.join(GATHERED_CONVENTIONS_DIR, entry)):
        continue
    if entry.startswith( '.' ) or "random" in entry or "gathered_" not in entry:
        continue
    gathered_conventions_files.append(entry)

# The gathered files are tab-separated.
for file_name in gathered_conventions_files:
    file_path = os.path.join(GATHERED_CONVENTIONS_DIR, file_name)
    gathered_conventions_data[clean_file_name(file_name)] = pd.read_csv(file_path, sep='\t')
    

### 1.2 Read software characteristics data

In [40]:
# Maps each cleaned file name to the DataFrame read from that file.
gathered_software_characteristics_data = {}

# Keep regular, non-hidden files whose name contains "gathered_" but not "random".
gathered_characteristics_files = []
for entry in os.listdir(GATHERED_SOFTWARE_CHARS_DIR):
    is_regular_file = os.path.isfile(os.path.join(GATHERED_SOFTWARE_CHARS_DIR, entry))
    name_is_wanted = not entry.startswith( '.' ) and "random" not in entry and "gathered_" in entry
    if is_regular_file and name_is_wanted:
        gathered_characteristics_files.append(entry)

# The gathered files are tab-separated.
for file_name in gathered_characteristics_files:
    file_path = os.path.join(GATHERED_SOFTWARE_CHARS_DIR, file_name)
    gathered_software_characteristics_data[clean_file_name(file_name)] = pd.read_csv(file_path, sep='\t')
    

## 2.- Classification

Classify, using each of the classifiers, the gathered sentences

In [27]:
## Read pickled classifiers
# NOTE(review): pickle.load can execute arbitrary code -- only load bundles
# that come from a trusted source.

## Load convention models from pickle file
with open(os.path.join(MODELS_DIR, 'conv_models_items.pickle'), 'rb') as f:
    convention_convnet_items = pickle.load(f)

# Unpack the bundle; models and tokenizers are looked up per model key later on.
_conventions_models = convention_convnet_items['model']
_conventions_tokenizers = convention_convnet_items['tokenizer']
_conventions_data_val_x = convention_convnet_items['_x_val']
_conventions_data_val_y = convention_convnet_items['_y_val']
_conventions_train_histories = convention_convnet_items['train_history']

## Load software characteristics models from pickle file
with open(os.path.join(MODELS_DIR, 'charact_models_items.pickle'), 'rb') as f:
    characteristics_convnet_items = pickle.load(f)

# Same bundle layout as for the convention models.
_characteristics_models = characteristics_convnet_items['model']
_characteristics_tokenizers = characteristics_convnet_items['tokenizer']
_characteristics_data_val_x = characteristics_convnet_items['_x_val']
_characteristics_data_val_y = characteristics_convnet_items['_y_val']
_characteristics_train_histories = characteristics_convnet_items['train_history']


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])







  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [178]:
def get_model_matches_proba(sequences, model):
    """Return column 1 of `model.predict` for the given token sequences.

    The sequences are first padded/truncated to MAX_SEQUENCE_LENGTH with
    `pad_sequences`.

    NOTE(review): column 1 is presumably the probability of the positive
    class of a two-class classifier -- confirm against the training code.
    """
    padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return model.predict(padded)[:, 1]

In [179]:
def calculate_matches(sentences, _models, _tokenizers):
    """Score `sentences` with every model in `_models`.

    Args:
        sentences: texts to classify.
        _models: mapping of model key -> model exposing `predict`.
        _tokenizers: mapping of the same keys -> tokenizer exposing
            `texts_to_sequences`.

    Returns:
        Dict mapping each cleaned model key (see `clean_file_name`) to the
        per-sentence scores returned by `get_model_matches_proba`.
    """
    scores_by_model = {}

    for model_key, model in _models.items():
        # Each model is paired with the tokenizer stored under the same key.
        sequences = _tokenizers[model_key].texts_to_sequences(sentences)
        scores_by_model[clean_file_name(model_key)] = get_model_matches_proba(sequences, model)

    return scores_by_model

In [210]:
## TODO: Put here all sentences from gathered data
# Placeholder sentences; provenance labels are aligned with them by position.
data_sentences = np.array([
    "This would be talking about efficiency in industrial terms",
    "just one random example",
    "please, classify this a green world",
])
data_sentences_provenance = np.array([
    "Google",
    "Github",
    "Semantic Scholar",
])

In [211]:
# Score every sentence with each convention classifier; the result maps a
# cleaned model key to an array with one score per sentence.
conventions_classifications = calculate_matches( data_sentences, _conventions_models, _conventions_tokenizers)
conventions_classifications

{'training_Domestic': array([0.42488375, 0.16314507, 0.14291137], dtype=float32),
 'training_Civic': array([0.11745402, 0.50928205, 0.55491126], dtype=float32),
 'training_Project': array([0.21544506, 0.06054401, 0.02405908], dtype=float32),
 'training_Inspired': array([0.45563355, 0.5758446 , 0.57550967], dtype=float32),
 'training_Market': array([0.7195272 , 0.05723041, 0.0197148 ], dtype=float32),
 'training_Industrial': array([0.7223793 , 0.5822472 , 0.05261192], dtype=float32),
 'training_Renown': array([0.27047116, 0.43848282, 0.43897298], dtype=float32)}

In [212]:
# Score every sentence with each software-characteristics classifier; the
# result maps a cleaned model key to an array with one score per sentence.
characteristics_classifications = calculate_matches( data_sentences, _characteristics_models, _characteristics_tokenizers)
characteristics_classifications

{'training_advantages': array([0.46067676, 0.34974173, 0.05037144], dtype=float32),
 'training_usability': array([0.3161091 , 0.12540503, 0.27094385], dtype=float32),
 'training_contributions': array([0.00117153, 0.66798824, 0.9999989 ], dtype=float32),
 'training_efficiency': array([9.9969029e-01, 1.6380979e-01, 2.3044298e-04], dtype=float32),
 'training_licensing': array([1.6415383e-08, 1.7161448e-05, 5.8574395e-05], dtype=float32),
 'training_reliability:maintanability': array([9.5781392e-01, 8.8595971e-04, 9.3447906e-04], dtype=float32),
 'training_functionalities': array([6.1490632e-06, 3.1083138e-03, 3.4248501e-03], dtype=float32),
 'training_portability': array([3.6413821e-05, 8.5168773e-05, 1.5477547e-04], dtype=float32)}

## 3.- Sample data from each stratum of classifications (high, medium and low confidence)

In [223]:
class ClassificationResult:
    """One classified sentence with its confidence score and stratum.

    Attributes:
        text: the sentence.
        confidence_value: score the classifier assigned to the sentence.
        confidence_level: stratum label ("Low", "Medium" or "High").
        data_provenance: label of the source the sentence came from.
    """

    def __init__(self, text, value, level, provenance='Unknown'):
        # `text`, `value` and `provenance` may each be a scalar (including a
        # plain string) or an array-like; array-likes contribute their first
        # element, which keeps the older one-element-array calling style working.
        self.text = self._first_item(text)
        self.confidence_value = self._first_item(value)
        self.confidence_level = level
        self.data_provenance = self._first_item(provenance)

    @staticmethod
    def _first_item(value):
        """Return `value` itself when it is zero-dimensional, else `value[0]`."""
        # np.ndim is 0 for strings and numeric scalars, so 'Unknown' stays
        # 'Unknown' instead of being truncated to 'U'.
        if np.ndim(value) == 0:
            return value
        return value[0]

    def __str__(self):
        return "{} -- {} -- {}".format(self.text, self.confidence_value, self.confidence_level)

    def __repr__(self):
        return self.__str__()


def split_sentences_by_confidence(calculated_classifications, _sentences, _sentences_provenance):
    """Label every sentence as Low / Medium / High confidence, per model.

    For each model the 33rd and 66th percentiles of its scores are used as cut
    points: scores <= p33 are "Low", scores in (p33, p66] are "Medium" and
    scores > p66 are "High".

    Args:
        calculated_classifications: mapping of model key -> 1-D array of
            scores, one per sentence.
        _sentences: array-like of sentences, aligned with the scores.
        _sentences_provenance: array-like of provenance labels, aligned with
            the sentences.

    Returns:
        Dict mapping each cleaned model key (see `clean_file_name`) to a list
        of ClassificationResult objects, one per sentence, ordered Low, then
        Medium, then High (original order within each stratum). A model with
        no scores maps to an empty list.
    """
    sentences = np.asarray(_sentences)
    provenance = np.asarray(_sentences_provenance)

    stratified_classifications = {}

    for k, raw_scores in calculated_classifications.items():
        classifications = np.asarray(raw_scores)
        classified_sentences = []

        # np.percentile is undefined for an empty array; such a model simply
        # gets no results.
        if classifications.size:
            low_percentile = np.percentile(classifications, 33)
            medium_percentile = np.percentile(classifications, 66)

            strata = (
                ("Low", classifications <= low_percentile),
                ("Medium", (classifications > low_percentile) & (classifications <= medium_percentile)),
                ("High", classifications > medium_percentile),
            )

            for level, mask in strata:
                # Iterate over the individual indices so that EVERY sentence of
                # the stratum is recorded (not only the first one), and an
                # empty stratum contributes nothing instead of raising.
                for i in np.flatnonzero(mask):
                    classified_sentences.append(
                        ClassificationResult(sentences[i], classifications[i], level, provenance[i])
                    )

        stratified_classifications[clean_file_name(k)] = classified_sentences

    return stratified_classifications

In [224]:
## Conventions results sampling
conv_stratified_classifications = split_sentences_by_confidence(conventions_classifications, data_sentences, data_sentences_provenance)

# One tab-separated file per convention model, with a header row followed by
# one row per classified sentence.
# NOTE(review): the folder is spelled "Classification results" here, while the
# hierarchy described at the top of the notebook lists "Classification_results"
# -- confirm which one exists on disk.
for convention_key, convention_results in conv_stratified_classifications.items():
    output_path = os.path.join(BASE_DIR, "Classification results", CONVS_DIR_NAME, "{}_stratified_classifications.tsv".format(convention_key))

    # The context manager closes the file on exit.
    with open(output_path, "w") as output_file:
        output_file.write("{}\t{}\t{}\t{}\n".format("text", "confidence_value", "confidence_level", "data_provenance"))

        for result in convention_results:
            output_file.write("{}\t{}\t{}\t{}\n".format(result.text, result.confidence_value, result.confidence_level, result.data_provenance))
    

In [225]:
# Spot-check: text of the first two results stored for the 'training_Domestic' model.
print(conv_stratified_classifications['training_Domestic'][0].text)
print(conv_stratified_classifications['training_Domestic'][1].text)

please, classify this a green world
just one random example


In [226]:
## Characteristics results sampling
chars_stratified_classifications = split_sentences_by_confidence(characteristics_classifications, data_sentences, data_sentences_provenance)

# One tab-separated file per software-characteristics model, with a header row
# followed by one row per classified sentence.
# Iterate over the CHARACTERISTICS results: looping over the conventions
# results here would write convention data into the characteristics folder.
# NOTE(review): the folder is spelled "Classification results" here, while the
# hierarchy described at the top of the notebook lists "Classification_results"
# -- confirm which one exists on disk.
for k in chars_stratified_classifications.keys():
    # The context manager closes the file on exit; no explicit close() needed.
    with open(os.path.join(BASE_DIR, "Classification results", SOFT_CHARS_DIR_NAME, "{}_stratified_classifications.tsv".format(k)), "w") as f3:

        # Four placeholders so the data_provenance column gets its header.
        f3.write("{}\t{}\t{}\t{}\n".format("text", "confidence_value", "confidence_level", "data_provenance"))

        for c in chars_stratified_classifications[k]:

            f3.write("{}\t{}\t{}\t{}\n".format(c.text, c.confidence_value, c.confidence_level, c.data_provenance))


## Next steps to be done manually:

### ~~4.- Manually validate data~~
### ~~5.- Append data to training files~~



## 6.- Retrain new models