# RE19-classification: enrichment of a new dataset with the features determined in the KM reconstruction

This notebook takes as input a previously enriched dataset d1 (produced by notebook 01_reconstruction_KM) and a new dataset d2, and enriches d2 by calculating the values of all the features present in d1.

## 0. Set up (optional)

Run the following install commands if you are running Jupyter in a cloud environment such as Colaboratory, which does not let you install the libraries permanently on your local machine.

In [1]:
!pip install cython numpy
!pip install benepar[cpu]




## 1. Import libraries

In [2]:
# Basic numpy, sklearn, pandas libraries
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from IPython.display import display

# Basic NLTK tooling.
# The nltk.download() calls fetch NLTK resources when this cell runs, so they
# need network access; a failed download is only reported as a message here.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer

# Shared tokenizer used by the enrichment functions: every token is a maximal
# run of word characters (\w+), so punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')

# The benepar parser -- this is supposed to be a better parser than Stanford's parser used in the RE'17 paper
# The 'benepar_en2' model downloaded here is the one loaded by enrich_ling.
import benepar
benepar.download('benepar_en2')

# Tqdm, for progress bars -- useful to show that the parsing is working
from tqdm import tqdm

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1076)>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1076)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1076)>
[nltk_data] Error loading benepar_en2: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1076)>


## 2. [Functions] Dataset enrichment

Each additional feature is computed for the new dataset only if its column is also present in the original dataset.

In [3]:
def enrich_ling(data):
  """Compute the linguistic features of every requirement, in place.

  A feature is only computed when its column already exists in `data`, so the
  enriched frame keeps exactly the feature set it was given:

  * Length      -- number of characters of the requirement text
  * Words       -- number of tokens produced by the module-level `tokenizer`
  * Modal, Adjective, Noun, Adverb, Verb
                -- share of the tokens whose POS tag belongs to the category
                   (0.0 when the text yields no tokens)
  * TreeHeight  -- `height()` of the benepar parse tree
  * SubTrees    -- `len()` of the benepar parse tree

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a 'RequirementText' column of strings and a unique index.
      It is modified in place.

  Returns
  -------
  None. The first rows of the enriched frame are printed.
  """
  columns = set(data.columns)
  texts = data['RequirementText']

  # Text length
  if 'Length' in columns:
    data['Length'] = texts.map(len)

  # POS categories and the tags that belong to each of them.
  # Modal and Adverb match a single tag; the others match a tag family
  # (e.g. JJ, JJR and JJS are all adjectives).
  pos_matchers = {
      'Modal': lambda tag: tag == "MD",
      'Adjective': lambda tag: tag.startswith("JJ"),
      'Noun': lambda tag: tag.startswith("NN"),
      'Adverb': lambda tag: tag == "RB",
      'Verb': lambda tag: tag.startswith("VB"),
  }
  pos_features = [name for name in pos_matchers if name in columns]
  for name in pos_features:
    data[name] = 0.0

  # POS tags and tree information
  tree_features = [name for name in ('TreeHeight', 'SubTrees') if name in columns]
  for name in tree_features:
    data[name] = 0
  # Loading the parser and parsing are expensive: only do it when needed
  parser = benepar.Parser("benepar_en2") if tree_features else None

  # Iterate over the actual index labels instead of assuming they are 0..n-1;
  # with .at, a wrong label would silently append a new row.
  rows = zip(data.index, texts)
  for idx, req in tqdm(rows, total=len(data), desc='Parse trees', position=0):
      tokens = tokenizer.tokenize(req)
      n_tokens = len(tokens)
      if 'Words' in columns:
        data.at[idx, 'Words'] = n_tokens

      if pos_features:
        tag_counts = nltk.FreqDist(tag for (word, tag) in nltk.pos_tag(tokens))
        for name in pos_features:
          matches = pos_matchers[name]
          # Sum over every tag of the family: assigning the count of each tag
          # in turn would keep only the last one visited.
          # NOTE(review): a reference dataset built with the overwriting
          # behaviour should be regenerated so both use the same definition.
          count = sum(n for tag, n in tag_counts.items() if matches(tag))
          # Guard against texts without any token (division by zero)
          data.at[idx, name] = count / n_tokens if n_tokens else 0.0

      if parser is not None:
        tree = parser.parse(req)
        if 'TreeHeight' in columns:
          data.at[idx, 'TreeHeight'] = tree.height()
        if 'SubTrees' in columns:
          # NOTE(review): len(tree) presumably counts only the direct children
          # of the root, not all sub-trees -- confirm this is the intended feature
          data.at[idx, 'SubTrees'] = len(tree)

  print(data.head())

In [4]:
from nltk.util import ngrams
from collections import Counter
from sklearn.feature_extraction import stop_words
from nltk.stem import WordNetLemmatizer

def enrich_ngram(data, file, target, nrfeat):
  """Flag the n-gram feature columns of `data` and save the result as CSV.

  For every requirement, candidate column names are built from its word
  uni-/bi-/tri-grams ('_w_', '_w1_w2_', '_w1_w2_w3_') and from its POS tag
  uni-/bi-/tri-grams ('T', 'T1_T2', 'T1_T2_T3'). Each candidate that is an
  existing column of `data` is set to 1 for that row. Remaining missing values
  are replaced by 0.0 and the frame is written to
  `<file>-tagged-<nrfeat>-<target>.csv`.

  NOTE(review): tokens are matched verbatim (case-sensitive, without stop-word
  removal or lemmatisation) -- confirm this matches how the column names of
  the reference dataset were generated.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a 'RequirementText' column of strings and a unique index.
      The feature columns are set in place.
  file : str
      Path prefix of the output file (without extension).
  target : str
      Target class label, used in the output file name.
  nrfeat : int
      Number of features, used in the output file name.

  Returns
  -------
  None. The first rows of the enriched frame are printed.
  """
  # The columns do not change while populating, so collect them once;
  # a set turns every lookup into O(1) instead of a scan over all columns.
  known_columns = set(data.columns)

  def mark(row, names):
    # Set to 1 every candidate name that is an existing column. The column is
    # passed to .at as a scalar label (.at is a scalar-only accessor).
    for name in names:
      if name in known_columns:
        data.at[row, name] = 1

  # Populating the n-grams
  # Iterate over the actual index labels instead of assuming they are 0..n-1.
  rows = zip(data.index, data['RequirementText'])
  for idx, req in tqdm(rows, total=len(data), desc='n-grams population', position=0):
      token = tokenizer.tokenize(req)

      # Word n-grams
      mark(idx, ('_' + t + '_' for t in token))
      mark(idx, ('_' + '_'.join(bg) + '_' for bg in ngrams(token, 2)))
      mark(idx, ('_' + '_'.join(tg) + '_' for tg in ngrams(token, 3)))

      # POS n-grams
      tags = nltk.pos_tag(token)
      pos_tags = [tag for (word, tag) in tags]
      # Legacy lookup kept for backward compatibility: str() of the whole
      # (word, tag) pair, as used by the previous implementation.
      mark(idx, (str(t) for t in tags))
      # POS unigram columns are presumably named after the bare tag, like the
      # bigram/trigram columns below -- TODO confirm against the reference data.
      mark(idx, pos_tags)
      mark(idx, ('_'.join(bg) for bg in ngrams(pos_tags, 2)))
      mark(idx, ('_'.join(tg) for tg in ngrams(pos_tags, 3)))

  # Features that never occurred in the new dataset are still missing: zero them
  data = data.fillna(0.0)

  data.columns = data.columns.map(str)

  print (data.head())

  # The new enriched dataset is now saved
  data.to_csv(file + '-tagged-' + str(nrfeat) + '-' + target + '.csv', encoding='utf-8')

In [5]:
def enrich_dataset(file, target, nrfeat, data):
  """Enrich the dataset stored in `<file>.csv` with the features of `data`.

  An empty frame with the columns of `data` is filled with the requirement
  texts (and the labels needed for `target`) of the new dataset; the feature
  values are then computed by `enrich_ling` and `enrich_ngram`, and the latter
  saves the result to disk.

  Parameters
  ----------
  file : str
      Path of the new dataset, without the '.csv' extension. The file must
      contain 'RequirementText', 'IsFunctional' and 'IsQuality' columns.
  target : str
      Target class: 'f' or 'q' copy both labels, 'oq' derives 'OnlyQuality',
      'of' derives 'OnlyFunctional'.
  nrfeat : int
      Number of features; only used for the output file name.
  data : pandas.DataFrame
      Previously enriched reference dataset. Only its columns are used; the
      frame itself is not modified.

  Returns
  -------
  None.
  """
  # Take the new dataset
  new_data = pd.read_csv(file + '.csv', engine='python')

  # Empty copy of the reference frame with one (NaN-filled) row per new
  # requirement. Building it explicitly avoids assigning into a slice of the
  # caller's frame and relying on implicit index adoption by empty frames.
  enriched = data.iloc[0:0].reindex(index=new_data.index)

  enriched['RequirementText'] = new_data['RequirementText']
  enriched['ProjectID'] = 1
  enriched['Class'] = 'F'
  if target == 'q' or target == 'f':
    enriched['IsFunctional'] = new_data['IsFunctional']
    enriched['IsQuality'] = new_data['IsQuality']

  if target == 'oq':
    enriched['OnlyQuality'] = ~new_data['IsFunctional'] & new_data['IsQuality']

  if target == 'of':
    enriched['OnlyFunctional'] = new_data['IsFunctional'] & ~new_data['IsQuality']

  # Drop the first column of the reference file
  # (presumably the index column written when it was saved -- TODO confirm)
  enriched = enriched.drop(enriched.columns[0], axis=1)

  print (enriched.head())

  enrich_ling(enriched)
  enrich_ngram(enriched, file, target, nrfeat)

## 3. Main file

Loads the previously enriched dataset and each new dataset, then invokes the functions defined above.

In [6]:
# Define the files that you want to process here.
source_folder = './'
filename = ['dronology', 'ds2', 'ds3', 'reqview', 'wasp', 'leeds', 'esa-eucl-est'] #the datasets to enrich
target_type = ['f', 'q', 'of', 'oq'] #the target class for the classification (different classes have different top_n features)
top_n_feat = 100

# Loading the originally enriched PROMISE data sets.
# The reference file depends only on the target class, so each one is read
# once here instead of once per dataset to enrich.
reference_by_target = {
    t: pd.read_csv(source_folder+'promise-km-' + str(top_n_feat) + '-' + t + '.csv', engine='python')
    for t in target_type
}

for f in filename:
  for t in target_type:
    data = reference_by_target[t]
    # enrich the new one
    enrich_dataset(file=source_folder+f, target=t, nrfeat=top_n_feat, data=data)

FileNotFoundError: [Errno 2] No such file or directory: './promise-km-100-f.csv'