<a href="https://colab.research.google.com/github/bmnapoleao/SLR-Automated_selection_of_studies/blob/main/notebook1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary: Using predict() instead of predict_proba()

# Summary: Only running predict_proba ONCE

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.patches as mpatches
from IPython.display import display
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn import metrics
import networkx as nx
from sklearn.decomposition import PCA
import seaborn as sns

In [2]:
# Lista de artigos duplicados (encontrados a partir de um script python)
DUPLICATED_LST = ['Leveraging organizational climate theory for understanding industry-academia collaboration', 'Meeting Industry-Academia Research Collaboration Challenges with Agile Methodologies', 'Characterizing industry-academia collaborations in software engineering: evidence from 101 projects', 'Protocol and Tools for Conducting Agile Software Engineering Research in an Industrial-Academic Setting: A Preliminary Study', 'Industry-Academia research collaboration in software engineering: The Certus model', 'Fostering Industry-Academia Collaboration in Software Engineering Using Action Research: A Case Study', 'A case study of industry--academia communication in a joint software engineering research project', 'Lessons Learned on Research Co-Creation: Making Industry-Academia Collaboration Work', 'Industry-Academia Collaborations in Software Engineering: An Empirical Analysis of Challenges, Patterns and Anti-Patterns in Research Projects', 'Knowledge Management in University-Software Industry Collaboration']

In [3]:
len(DUPLICATED_LST)

10

# Number of features used in Feature Selection

In [4]:
# k_fs = 100  
# k_fs = 500  
# k_fs = 750  # DONE
k_fs = 1000 
# k_fs = 1500 
k_sufix = 'k{}'.format(k_fs)

# Choose dataset to use and where to save results

In [5]:
# To use full dataset (2hrs to run feature selection)
#   Training Paths to read from
file_path_included_training = '/content/drive/MyDrive/cslr/bibs/training-sets/training_included.bib'
file_path_excluded_training = '/content/drive/MyDrive/cslr/bibs/training-sets/training_excluded.bib'
#   Testing Paths to read from
file_path_included_testing = '/content/drive/MyDrive/cslr/bibs/testing-sets/testing_included_formated.bib'
file_path_excluded_testing = '/content/drive/MyDrive/cslr/bibs/testing-sets/testing_excluded_formated.bib'
#   Using complete dataset (result's dir on drive)
result_file_path = '/content/drive/MyDrive/cslr/results-dec/report_{}.csv'.format(k_sufix)

###########################################################################################################

# # # To use small sample from dataset
# #   Training Paths to read from
# file_path_included_training = '/content/drive/MyDrive/cslr/bibs/small-samples/training-set-included.bib'
# file_path_excluded_training = '/content/drive/MyDrive/cslr/bibs/small-samples/training-set-excluded.bib'
# #   Testing Paths to read from
# file_path_included_testing =  '/content/drive/MyDrive/cslr/bibs/small-samples/testing-set-included.bib'
# file_path_excluded_testing =  '/content/drive/MyDrive/cslr/bibs/small-samples/testing-set-excluded.bib'
# ##   Using small samples / part of dataset (result's dir on drive)
# k_fs = 100  # Use small number of features
# k_sufix = 'k{}'.format(k_fs)
# result_file_path = '/content/drive/MyDrive/cslr/results-dec/to-remove-test.csv'


# Selection of Studies

Marcelo Costalonga


## Bib Parser

######  # BibParser(write_files=False, project_folder=project_folder, only_titles=True),

In [6]:
!pip install bibtexparser

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Imports

In [7]:
import codecs, bibtexparser

import keras_preprocessing.sequence
import numpy as np

from keras.preprocessing.text import Tokenizer

### Loading bibs from drive (BibTexParser)

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
def read_bib(file_path, set_name: str, was_accepted: bool):
    texts_list = list()
    titles_list = list()
    duplicated_titles = dict()

    with codecs.open(file_path, 'r', encoding='utf-8') as bib_file:
        db = bibtexparser.load(bib_file)
        for bib_index, entry in enumerate(db.entries, start=0):
            category = 'selecionado' if was_accepted == True else 'removido'
            title = entry['title']
            abstract = entry['abstract']
            year = entry['year']
            content = u'%s\n%s' % (title, abstract)
            folder = file_path.split('/')[2].split('-')[0]

            if (title not in titles_list): 
                titles_list.append(title)
                content = content.split('\n')[0]
                texts_list.append({
                    'title': title,
                    'content': content,
                    'category': category,
                    'year': int(year)
                })
            else:
                if title in duplicated_titles:
                    duplicated_titles[title] += 1  
                else:
                     duplicated_titles[title] = 2

        bib_file.close()

        if (len(duplicated_titles) > 0):
            print("\nFound {} duplicated entries on set: {}".format(len(duplicated_titles), set_name))
            print("Each of the following entries were found Nx times (number the same title was found) and were ignored:")
            for title in duplicated_titles:
                print("\t({}x) - {}".format(duplicated_titles[title], title))

        return texts_list, titles_list

In [10]:
# Initialize lists
texts_list_included_training = []
texts_list_excluded_training = []
texts_list_included_testing = []
texts_list_excluded_testing = []

# Read bibs according to each set (training/testing - excluded/included)
texts_list_included_training, titles_included_training = read_bib(file_path_included_training, 'Training - Included', True)
texts_list_excluded_training, titles_excluded_training = read_bib(file_path_excluded_training, 'Training - Excluded', False)
texts_list_included_testing, titles_included_testing = read_bib(file_path_included_testing, 'Testing - Included', True)
texts_list_excluded_testing, titles_excluded_testing = read_bib(file_path_excluded_testing, 'Training - Excluded', False)

print("\nNumber of entries in Training Set Included:", len(texts_list_included_training))
print("Number of entries in Training Set Excluded:", len(texts_list_excluded_training))
print("Number of entries in Testing Set Included:", len(texts_list_included_testing))
print("Number of entries in Testing Set Excluded:", len(texts_list_excluded_testing))
print("Total number of entries before removing dupplicates:", len(texts_list_included_training) + len(texts_list_excluded_training) + 
      len(texts_list_included_testing) + len(texts_list_excluded_testing))



Found 1 duplicated entries on set: Testing - Included
Each of the following entries were found Nx times (number the same title was found) and were ignored:
	(2x) - Protocol and Tools for Conducting Agile Software Engineering Research in an Industrial-Academic Setting: A Preliminary Study

Found 15 duplicated entries on set: Training - Excluded
Each of the following entries were found Nx times (number the same title was found) and were ignored:
	(2x) - Applying Data Analytics towards Optimized Issue Management: An Industrial Case Study
	(2x) - Wide Band Patch Antenna Structures for Cognitive Radio Applications
	(2x) - Exploring the industry's challenges in software testing: An empirical study
	(2x) - Benefits and Drawbacks of Software Reference Architectures
	(2x) - Start-up ecosystems as a framework for the cooperation between start-up companies and knowledge-based institutions in Poland
	(2x) - How to Increase the Likelihood of Successful Transfer to Industry -- Going Beyond the Empi

# Treating duplicates entries

In [11]:
from copy import deepcopy
import traceback

def remove_duplicates(text_lst, title_lst):
    cp_text_lst = deepcopy(text_lst)
    cp_title_lst = deepcopy(title_lst)

    print("Before:", len(cp_text_lst))

    for txt_obj in text_lst:
        # if txt_obj['content'] in DUPLICATED_LST:
        if txt_obj['title'] in DUPLICATED_LST:
            # print(txt_obj['title'])
            if txt_obj in cp_text_lst:
                cp_text_lst.remove(txt_obj)
                cp_title_lst.remove(txt_obj['title'])

    print("After:", len(cp_text_lst))
    return cp_text_lst, cp_title_lst


# texts_list_included_testing = remove_duplicates(texts_list_included_testing)
texts_list_excluded_testing, titles_excluded_testing = remove_duplicates(
    texts_list_excluded_testing, titles_excluded_testing)

Before: 482
After: 472


In [12]:
print("Number of entries in Training Set Included:", len(texts_list_included_training))
print("Number of entries in Training Set Excluded:", len(texts_list_excluded_training))
print("Number of entries in Testing Set Included:", len(texts_list_included_testing))
print("Number of entries in Testing Set Excluded:", len(texts_list_excluded_testing))
print("Total number of entries after removing dupplicates:", len(texts_list_included_training) + len(texts_list_excluded_training) + 
      len(texts_list_included_testing) + len(texts_list_excluded_testing))

Number of entries in Training Set Included: 45
Number of entries in Training Set Excluded: 86
Number of entries in Testing Set Included: 35
Number of entries in Testing Set Excluded: 472
Total number of entries after removing dupplicates: 638


In [13]:
def get_difference(l1, s1):
    lst_copy = deepcopy(l1)
    for i in s1:
        if i in lst_copy:
            lst_copy.remove(i)
    return lst_copy

def validate_texts(train_in, train_ex, test_in, test_ex):
    # Assert there are no duplicates
    duplicated_entries = list()
    try:
        train_titles = train_in + train_ex
        train_tuple = set(train_in + train_ex)
        assert len(train_titles) == len(train_tuple)
    except AssertionError:
        print("\nTraining Sets have duplicated entries!!")
        print("Len l1+l2={} vs Len set={}".format(len(train_titles), 
                                                  len(train_tuple)))
        duplicated_entries = get_difference(train_titles, train_tuple)
        print("The Following Entries appear both in Included and Excluded \
        Training sets:\n", duplicated_entries)

    try:
        test_titles = test_in + test_ex
        test_tuple = set(test_in + test_ex)
        assert len(test_titles) == len(test_tuple)
    except AssertionError:
        print("\nTesting Sets have duplicates!!")
        print("Len l1+l2={} vs Len set={}".format(len(test_titles), 
                                                  len(test_tuple)))    
        duplicated_entries = get_difference(test_titles, test_tuple)
        print("The Following Entries appear both in Included and Excluded Testing sets:\n")
        for i in duplicated_entries:
            print("\t - {}".format(i))

    try:
        all_titles = train_titles + test_titles
        all_tuple = set(train_titles + test_titles)
        assert len(all_titles) == len(all_tuple)
    except AssertionError:
        print("\nTraining set contains entries that are also in Testing set!!")
        print("Len l1+l2={} vs Len set={}".format(len(all_titles), 
                                                  len(all_tuple)))    
        print("The Following Entries appear both in Testing and Training sets: \
        \n", duplicated_entries)

    assert len(duplicated_entries) == 0, "Found errors"
    return duplicated_entries

# Validate Entries
duplicated_entries = validate_texts(titles_included_training, titles_excluded_training, 
               titles_included_testing, titles_excluded_testing)



In [14]:
for i in duplicated_entries:
    print(i)

In [15]:
# Split into two lists (training / testing)
texts_list_training = texts_list_included_training + texts_list_excluded_training
texts_list_training = sorted(texts_list_training, key=lambda d: d['year'])

texts_list_testing = texts_list_included_testing + texts_list_excluded_testing
texts_list_testing = sorted(texts_list_testing, key=lambda d: d['year'])

all_texts_list = texts_list_training + texts_list_testing
all_texts_list = sorted(all_texts_list, key=lambda d: d['year'])

print(len(all_texts_list))
# for text in all_texts:
#     print(text, len(text))

638


## Text Filtering (preprocessing)

######  # TextFilterComposite([ LemmatizerFilter(), StopWordsFilter() ]),

### Imports

In [16]:
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [17]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Text Filters

In [18]:
class TextFilterComposite:
    def __init__ (self, filters):
        self._filters = filters

    def _filter (self, tokens):
        result = tokens
        for f in self._filters:
            result = f.filter(result)
        return (' ').join(result)

class LemmatizerFilter:
    def __init__ (self):
        print('===== Configure the lemmatizer =====')
        self._lemmatizer = WordNetLemmatizer()

    def filter (self, tokens):
        tags = pos_tag(tokens)
        return [ self._lemmatizer.lemmatize(token[0], pos=token[1][0].lower())
                    if token[1][0].lower() in ('a', 'n', 'v', 'r')
                    else self._lemmatizer.lemmatize(token[0])
                    for token in tags ]

class StopWordsFilter:
    def __init__ (self):
        print('===== Configuring stop words removal =====')

    def filter (self, tokens):
        return [ word for word in tokens
                 if not word.lower() in stopwords.words('english') ]


In [19]:
# Tokenization 
filters = [LemmatizerFilter(), StopWordsFilter()]
textFilterObj = TextFilterComposite(filters)

def filter_text(text_list):
    result = []
    for text in text_list:
        tokens = word_tokenize(text['content'])
        
        # Converts list to filtered str
        #   E.g.: ['Research', 'in', 'computer', 'science', ':', 'an', 'empirical', 'study'] -> 'Research computer science : empirical study'
        filtered_text = textFilterObj._filter(tokens)

        result.append({
            'content': filtered_text.lower(),
            'category': text['category'],
            'year': text['year']
        })
    return result


===== Configure the lemmatizer =====
===== Configuring stop words removal =====


In [20]:
## 2) Calling filter method two times for each list (test | train)
filtered_texts_list_testing = filter_text(texts_list_testing)
filtered_texts_list_training = filter_text(texts_list_training)

In [21]:
print(len(filtered_texts_list_testing), filtered_texts_list_testing)
print(len(filtered_texts_list_training), filtered_texts_list_training)

507 [{'content': 'operations technology organizational structure .', 'category': 'removido', 'year': 1976}, {'content': 'design science nested problem solve', 'category': 'removido', 'year': 2009}, {'content': "large-scale empirical study practitioners ' use object-oriented concepts", 'category': 'removido', 'year': 2010}, {'content': 'planning unknown : lessons learned ten months non-participant exploratory observations industry', 'category': 'selecionado', 'year': 2015}, {'content': 'exploratory study technology transfer software engineering', 'category': 'selecionado', 'year': 2015}, {'content': 'fast feedback cycles empirical software engineering research', 'category': 'selecionado', 'year': 2015}, {'content': 'integration se research industry : reflections , theories illustrative example', 'category': 'selecionado', 'year': 2015}, {'content': 'towards approach matching cmd dsr improve academia-industry software development partnership : case agile ux integration', 'category': 'rem

In [22]:
print("Size of Test set:", len(filtered_texts_list_testing))
print("Size of Trainset:", len(filtered_texts_list_training))

Size of Test set: 507
Size of Trainset: 131


## Generate Dataset

######  # GenerateDataset(TfidfVectorizer(ngram_range=(1,3), use_idf=True)),

### Imports

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
class GenerateDataset:
    def __init__ (self, vectorizer=TfidfVectorizer()):
        self._vectorizer = vectorizer

datasetGenerator = GenerateDataset(TfidfVectorizer(ngram_range=(1,3), use_idf=True))

In [25]:
def generate_dataset(filtered_text_list):
    texts = [ text_data['content'] for text_data in filtered_text_list ]

    # Category: 1 == 'selecionado' | 0 == 'removido'
    categories = [ 1 if text_data['category'] == 'selecionado' else 0 for text_data in filtered_text_list ]
    years = [ text_data['year'] for text_data in filtered_text_list ]
    features = datasetGenerator._vectorizer.fit_transform(texts)

    dataset = {
        'texts': texts,
        'features': features,
        'categories': np.array(categories),
        'years': years
    }

    print(dataset['features'].shape)
    return dataset


In [26]:
## 2) Calling method two times for each list (test | train)
dataset_training = generate_dataset(filtered_texts_list_training)
dataset_testing = generate_dataset(filtered_texts_list_testing)

(131, 1707)
(507, 7465)


In [27]:
print(dataset_testing['features'].max) 

<bound method _minmax_mixin.max of <507x7465 sparse matrix of type '<class 'numpy.float64'>'
	with 11044 stored elements in Compressed Sparse Row format>>


# Uses Feature Selection

In [28]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectKBest, chi2

In [29]:
class USESFeatureSelection:
    def __init__ (self, k=-1):
        assert k > 0
        self._k = k
        self._affinity_score = []

    def _affinity (self, word_frequency_column, categories, category):
        ncw = 0
        nc = 0
        nw = 0
        for i in range(0, len(categories)):
            if categories[i] == category: nc += 1
            if word_frequency_column[i] > 0: nw += 1
            if categories[i] == category and word_frequency_column[i] > 0: ncw += 1
        return ncw / (nc + nw - ncw)


    def _score (self, features, categories):
        n_words = features.shape[1]
        self._affinity_score = [
            self._affinity(features[:,i], categories, 1) -
            self._affinity(features[:,i], categories, 0)
            for i in range(0, n_words) ]
        return (self._affinity_score, [])

    def execute(self, dt):
        print('===== Feature selection - USES =====')
        X = dt['features']
        y = dt['categories']
        fs = SelectKBest(self._score, k=self._k)
        dt['features'] = fs.fit_transform(X, y)
        print(dt['features'].shape)
        return dt


# Set number of K features at the begining of the file

In [30]:
# Initialize Feature Selection Class
featureSelection = USESFeatureSelection(k=k_fs)

In [31]:
# Feature Selection for training
dataset_training_kN_fs = featureSelection.execute(dataset_training)

===== Feature selection - USES =====
(131, 1000)


In [32]:
# Feature Selection for testing
dataset_testing_kN_fs = featureSelection.execute(dataset_testing)

===== Feature selection - USES =====
(507, 1000)


### K-Fold Split

In [33]:
import random
from keras import layers
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn import tree, metrics, svm, naive_bayes, ensemble, linear_model, neural_network
from sklearn.model_selection import cross_validate, TimeSeriesSplit, GridSearchCV, train_test_split

#### Define K-Fold split method

In [34]:
class YearsSplit:
    def __init__ (self, n_splits=4, years=[]):
        self._n_splits = n_splits
        self._years = years
        self._test_indexes = []
        current = max(years)
        for i in range(n_splits):
            test_index = years.index(current)
            if len(years[test_index:]) < 5:
                current = max(years[:test_index])
                test_index = years.index(current)

            self._test_indexes.append(test_index)
            current = max(years[:test_index])

    def split (self, X, y, groups=None):
        previous = len(self._years)
        for test_index in self._test_indexes:
            train = [ i for i in range(test_index) ]
            test = [ i for i in range(test_index, previous) ]
            previous = test_index
            yield train, test


### Define simple classifier (SVM and DT)

In [35]:
class SimpleClassifier:
    def __init__ (self, seed, n_splits=5):
        self._seed = seed
        self._n_splits = n_splits

    def execute (self, dataset_train, dataset_test):
        print("Executing...")
        X = dataset_train['features']
        y = dataset_train['categories']   
        
        groups = dataset_train['years']
        random.seed(self._seed)

        kfold = YearsSplit(n_splits=self._n_splits, years=groups)                 
        model = self.get_classifier(X, y)   # Pega modelo de classifier 

        scores = cross_validate(model, X, y, cv=kfold, scoring=['f1_macro', 'precision_macro', 'recall_macro'])
        print("OUR APPROACH F-measure: %s on average and %s SD" %
                (scores['test_f1_macro'].mean(), scores['test_f1_macro'].std()))
        print("OUR APPROACH Precision: %s on average and %s SD" %
                (scores['test_precision_macro'].mean(), scores['test_precision_macro'].std()))
        print("OUR APPROACH Recall: %s on average and %s SD" %
                (scores['test_recall_macro'].mean(), scores['test_recall_macro'].std()))

        X_train = dataset_train['features']
        y_train = dataset_train['categories']
        X_test = dataset_test['features']
        y_test = dataset_test['categories']

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test) 

        # TODO: Remove print
        print("Y_PRED #1:", len(y_pred), "\n", y_pred, "\n")

        scores['probabilities'] = y_pred
        scores['y_test'] = y_test

        correct_exclusion_rate = []
        threasholds = []
        missed = []
        fscore_threashold = []
        exclusion_baseline = []
        missed_baseline = []
        for train_index, test_index in kfold.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)

            y_score = model.predict_proba(X_train)[:, 1] 
            
            # TODO: Remove print
            print("Y_PRED #2:", len(y_score), "\n", y_score, "\n")
            
            precision, recall, threasholds2 = metrics.precision_recall_curve(
                    y_train, y_score)
            y_score = model.predict_proba(X_test)[:, 1]  
        
            # TODO: Remove print
            print("Y_PRED #3:", len(y_score), "\n", y_score, "\n")
        
            if (threasholds2[0] > 0.5):
                threasholds2 = [0.5]
            matrix = metrics.confusion_matrix(
                    y_test, [ 0 if i < threasholds2[0] else 1 for i in y_score ])
            correct_exclusion_rate.append(
                    matrix[0, 0] /
                    (matrix[0, 0] + matrix[1, 1] + matrix[0, 1] + matrix[1, 0]))
            missed.append(matrix[1, 0] / (matrix[1, 1] + matrix[1, 0]))
            threasholds.append(threasholds2[0])
            fscore_threashold.append(metrics.f1_score(
                y_test, [ 0 if i < threasholds2[0] else 1 for i in y_score ]))

            matrix = metrics.confusion_matrix(
                    y_test, [ 0 if i <  0.5 else 1 for i in y_score ])
            exclusion_baseline.append(
                    matrix[0, 0] /
                    (matrix[0, 0] + matrix[1, 1] + matrix[0, 1] + matrix[1, 0]))
            missed_baseline.append(matrix[1, 0] / (matrix[1, 1] + matrix[1, 0]))

        scores['exclusion_rate'] = correct_exclusion_rate
        scores['threasholds'] = threasholds
        scores['missed'] = missed
        scores['fscore_threashold'] = fscore_threashold
        scores['exclusion_baseline'] = exclusion_baseline
        scores['missed_baseline'] = missed_baseline
        
        dataset = dict()
        dataset['%s_scores' % self.classifier_name] = scores
        
        return dataset, y_pred

#### Extend Classifiers 

In [36]:
class DecisionTreeClassifier (SimpleClassifier):
    def __init__ (self, seed=42, criterion='entropy', n_splits=5):
        SimpleClassifier.__init__(self, seed, n_splits)
        self.classifier_name = 'decision_tree'
        self._criterion = criterion

    def get_classifier (self, X, y):
        print('===== Decision Tree Classifier =====')
        print('===== Hyperparameter tunning  =====')
        model = tree.DecisionTreeClassifier()
        params = {
            'criterion': ["gini", "entropy"],
            'max_depth': [10, 50, 100, None],
            'min_samples_split': [2, 10, 100],
            'class_weight': [None, 'balanced']
        }
        cfl = GridSearchCV(model, params, cv=5, scoring='recall')
        cfl.fit(X, y)
        for param, value in cfl.best_params_.items():
            print("%s : %s" % (param, value))

        model = tree.DecisionTreeClassifier(random_state=self._seed)
        model.set_params(**cfl.best_params_)
        return model


class SVMClassifier (SimpleClassifier):
    def __init__ (self, seed, n_splits=5):
        SimpleClassifier.__init__(self, seed, n_splits=n_splits)
        self.classifier_name = 'svm'

    def get_classifier (self, X, y):
        print('===== SVM Classifier =====')
        print('===== Hyperparameter tunning  =====')
        params = {
            #'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'kernel': ['linear', 'rbf'],
            'C': [1, 10, 100],
            #'degree': [1, 2, 3],
            #'coef0': [0, 10, 100],
            'tol': [0.001, 0.1, 1],
            'class_weight': ['balanced', None]
        }
        model = svm.SVC(random_state=self._seed, probability=True)
        cfl = GridSearchCV(model, params, cv=5, scoring='accuracy')
        cfl.fit(X, y)
        for param, value in cfl.best_params_.items():
            print("%s : %s" % (param, value))

        model = svm.SVC(random_state=self._seed, probability=True)
        model.set_params(**cfl.best_params_)
        return model


### Define fixed seed (42) and number of splits to initialize Classifiers

In [37]:
n_splits = 3

dt_classifier = DecisionTreeClassifier(seed=42, criterion='gini', n_splits=n_splits)
svm_classifier = SVMClassifier(42, n_splits=n_splits)

## Execute classifier for specific kN features


## Execute and store results in 'dataset_scores' dict

In [38]:
dataset_scores = dict()

#### DT classifier

In [39]:
# Execute DT classifier
dt_scores_dataset_kN, dt_y_pred_kN = dt_classifier.execute(dataset_training_kN_fs, dataset_testing_kN_fs)

# Add to dt scores
dataset_scores['decision_tree_scores'] = dt_scores_dataset_kN['decision_tree_scores']

Executing...
===== Decision Tree Classifier =====
===== Hyperparameter tunning  =====
class_weight : balanced
criterion : gini
max_depth : 50
min_samples_split : 10
OUR APPROACH F-measure: 0.7140883923117807 on average and 0.15136834067283308 SD
OUR APPROACH Precision: 0.7972222222222222 on average and 0.13949866143877465 SD
OUR APPROACH Recall: 0.7287749287749287 on average and 0.122585941649078 SD
Y_PRED #1: 507 
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [40]:
dt = dt_scores_dataset_kN['decision_tree_scores']
print('exclusion_rate:', dt['exclusion_rate'])
print('threasholds:', dt['threasholds'])
print('missed:', dt['missed'])
print('fscore_threashold:', dt['fscore_threashold'])
print('exclusion_baseline:', dt['exclusion_baseline'])
print('missed_baseline:', dt['missed_baseline'])

exclusion_rate: [0.3181818181818182, 0.7368421052631579, 0.5]
threasholds: [0.5, 0.5, 0.5]
missed: [0.5384615384615384, 0.2, 0.6666666666666666]
fscore_threashold: [0.5714285714285714, 0.888888888888889, 0.5]
exclusion_baseline: [0.3181818181818182, 0.7368421052631579, 0.5]
missed_baseline: [0.5384615384615384, 0.2, 0.6666666666666666]


#### SVM classifier

In [41]:
# Execute SVM classifier
svm_scores_dataset_kN, svm_y_pred_kN = svm_classifier.execute(dataset_training_kN_fs, dataset_testing_kN_fs)

# Add to dt scores
dataset_scores['svm_scores'] = svm_scores_dataset_kN['svm_scores']

Executing...
===== SVM Classifier =====
===== Hyperparameter tunning  =====
C : 10
class_weight : balanced
kernel : rbf
tol : 1
OUR APPROACH F-measure: 1.0 on average and 0.0 SD
OUR APPROACH Precision: 1.0 on average and 0.0 SD
OUR APPROACH Recall: 1.0 on average and 0.0 SD
Y_PRED #1: 507 
 [0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [42]:
dt = svm_scores_dataset_kN['svm_scores']
print('exclusion_rate:', dt['exclusion_rate'])
print('threasholds:', dt['threasholds'])
print('missed:', dt['missed'])
print('fscore_threashold:', dt['fscore_threashold'])
print('exclusion_baseline:', dt['exclusion_baseline'])
print('missed_baseline:', dt['missed_baseline'])

exclusion_rate: [0.4090909090909091, 0.7368421052631579, 0.5]
threasholds: [0.5, 0.5, 0.5]
missed: [0.0, 0.0, 0.0]
fscore_threashold: [1.0, 1.0, 1.0]
exclusion_baseline: [0.4090909090909091, 0.7368421052631579, 0.5]
missed_baseline: [0.0, 0.0, 0.0]


### Compare results:

In [43]:
# DataFrame antes do FeatureSelection
df_training1 = pd.DataFrame.from_dict(filtered_texts_list_training)
df_testing1 = pd.DataFrame.from_dict(filtered_texts_list_testing)

df_training1.head(10) # Exemplo do estado do dataframe

Unnamed: 0,content,category,year
0,action research—a scientific approach ?,removido,1955
1,popper 's definition verisimilitude,removido,1974
2,assessment scientific merits action research .,removido,1978
3,evolution information technology : implication...,removido,1989
4,framework experience learning : essential natu...,removido,1991
5,assessment system software engineering scholar...,removido,1994
6,experimental software engineering : report sta...,removido,1995
7,design natural science research information te...,removido,1995
8,4+1 view model architecture,removido,1995
9,role experimentation software engineering : pa...,removido,1996


In [44]:
# DataFrame após resultado do FeatureSelection
df_training2 = pd.DataFrame.from_dict(dataset_training_kN_fs)
df_testing2 = pd.DataFrame.from_dict(dataset_testing_kN_fs)

df_training2.head(10) # Exemplo do estado do dataframe

Unnamed: 0,texts,features,categories,years
0,action research—a scientific approach ?,"(0, 522)\t0.3901063114456045\n (0, 523)\t0....",0,1955
1,popper 's definition verisimilitude,"(0, 341)\t0.408248290463863\n (0, 342)\t0.4...",0,1974
2,assessment scientific merits action research .,"(0, 592)\t0.32498264224349727\n (0, 593)\t0...",0,1978
3,evolution information technology : implication...,"(0, 145)\t0.28490072687831425\n (0, 812)\t0...",0,1989
4,framework experience learning : essential natu...,"(0, 151)\t0.19492098473333994",0,1991
5,assessment system software engineering scholar...,"(0, 135)\t0.1491318954486745\n (0, 581)\t0....",0,1994
6,experimental software engineering : report sta...,"(0, 135)\t0.14624651720866944\n (0, 464)\t0...",0,1995
7,design natural science research information te...,"(0, 503)\t0.2941056240844393\n (0, 504)\t0....",0,1995
8,4+1 view model architecture,"(0, 28)\t0.32342397373223813\n (0, 295)\t0....",0,1995
9,role experimentation software engineering : pa...,"(0, 135)\t0.12998867713662832\n (0, 330)\t0...",0,1996


In [45]:
# DataFrames para analisar DT e SVM após execução 
from copy import deepcopy

# DataFrame DT
# dt_df_training = pd.DataFrame.from_dict(dataset_training_kN_fs)
dt_df_testing = deepcopy(df_testing2)
dt_df_testing['predicted'] = dt_y_pred_kN

# DataFrame SVM
# svm_df_training = pd.DataFrame.from_dict(dataset_training_kN_fs)
svm_df_testing = deepcopy(df_testing2)
svm_df_testing['predicted'] = svm_y_pred_kN


In [46]:
df_testing = deepcopy(df_testing2)
df_testing['DT_pred'] = dt_y_pred_kN
df_testing['SVM_pred'] = svm_y_pred_kN


#### Decision Tree vs SVM

In [47]:
df_testing.head(20)

Unnamed: 0,texts,features,categories,years,DT_pred,SVM_pred
0,operations technology organizational structure .,"(0, 324)\t0.2944777476376821",0,1976,0,0
1,design science nested problem solve,"(0, 124)\t0.23131665679577518",0,2009,0,0
2,large-scale empirical study practitioners ' us...,"(0, 134)\t0.13016741554862618\n (0, 139)\t0...",0,2010,0,0
3,planning unknown : lessons learned ten months ...,"(0, 183)\t0.15822664177697937\n (0, 186)\t0...",1,2015,0,1
4,exploratory study technology transfer software...,"(0, 146)\t0.14072179760709141\n (0, 183)\t0...",1,2015,0,1
5,fast feedback cycles empirical software engine...,"(0, 121)\t0.2898421162832981\n (0, 122)\t0....",1,2015,0,1
6,integration se research industry : reflections...,"(0, 176)\t0.230029738282692\n (0, 226)\t0.2...",1,2015,0,1
7,towards approach matching cmd dsr improve acad...,"(0, 5)\t0.09142433180595995\n (0, 16)\t0.13...",0,2015,0,0
8,proposal using design science educational tech...,"(0, 124)\t0.17914005455415025\n (0, 353)\t0...",0,2015,0,0
9,university-industry collaboration open source ...,"(0, 77)\t0.09858311922841677\n (0, 243)\t0....",0,2015,0,0


In [48]:
# 'SVM_pred' or 'DT_pred'
def get_real_negatives(classifier_type: str):
    return df_testing.loc[(df_testing['categories'] == 0) & (df_testing['categories'] == df_testing[classifier_type]), ['texts', 'categories', classifier_type]]

def get_real_positives(classifier_type: str):
    return df_testing.loc[(df_testing['categories'] == 1) & (df_testing['categories'] == df_testing[classifier_type]), ['texts', 'categories', classifier_type]]

def get_false_negative(classifier_type: str):
    return df_testing.loc[(df_testing['categories'] == 1) & (df_testing[classifier_type] == 0), ['texts', 'categories', classifier_type]]

def get_false_positive(classifier_type: str):
    return df_testing.loc[(df_testing['categories'] == 0) & (df_testing[classifier_type] == 1), ['texts', 'categories', classifier_type]] 

#### DT analysis

In [49]:
cls_type = 'DT_pred'
df_dt_real_negatives = get_real_negatives(cls_type)
df_dt_real_positives = get_real_positives(cls_type)
df_dt_false_negative = get_false_negative(cls_type)
df_dt_false_positive = get_false_positive(cls_type)

In [50]:
print("RN",len(df_dt_real_negatives))
print("RP", len(df_dt_real_positives))
print("FN", len(df_dt_false_negative))
print("FP", len(df_dt_false_positive))

RN 466
RP 6
FN 29
FP 6


In [51]:
df_dt_real_positives

Unnamed: 0,texts,categories,DT_pred
78,researcher ’ experiences supporting industrial...,1,1
80,research framework build spi proposal small or...,1,1
159,meeting industry-academia research collaborati...,1,1
163,model-based testing digital tvs : industry-as-...,1,1
356,together stronger : evidence-based reflection ...,1,1
359,lessons learned research co-creation : making ...,1,1


#### SVM analysis

In [52]:
cls_type = 'SVM_pred'
df_svm_real_negatives = get_real_negatives(cls_type)
df_svm_real_positives = get_real_positives(cls_type)
df_svm_false_negative = get_false_negative(cls_type)
df_svm_false_positive = get_false_positive(cls_type)

In [53]:
print("RN",len(df_svm_real_negatives))
print("RP",len(df_svm_real_positives))
print("FN",len(df_svm_false_negative))
print("FP",len(df_svm_false_positive))

print("\n",df_svm_false_positive)

RN 470
RP 35
FN 0
FP 2

                                                  texts  categories  SVM_pred
433  guiding selection research methodology industr...           0         1
446  case study industry–academia communication joi...           0         1


In [54]:
df_svm_real_positives

Unnamed: 0,texts,categories,SVM_pred
3,planning unknown : lessons learned ten months ...,1,1
4,exploratory study technology transfer software...,1,1
5,fast feedback cycles empirical software engine...,1,1
6,integration se research industry : reflections...,1,1
78,researcher ’ experiences supporting industrial...,1,1
79,softcoder approach : promoting software engine...,1,1
80,research framework build spi proposal small or...,1,1
159,meeting industry-academia research collaborati...,1,1
160,industry-academia collaborations software engi...,1,1
161,industry -- academia collaboration software te...,1,1


In [55]:
print(df_svm_real_positives)


                                                 texts  categories  SVM_pred
3    planning unknown : lessons learned ten months ...           1         1
4    exploratory study technology transfer software...           1         1
5    fast feedback cycles empirical software engine...           1         1
6    integration se research industry : reflections...           1         1
78   researcher ’ experiences supporting industrial...           1         1
79   softcoder approach : promoting software engine...           1         1
80   research framework build spi proposal small or...           1         1
159  meeting industry-academia research collaborati...           1         1
160  industry-academia collaborations software engi...           1         1
161  industry -- academia collaboration software te...           1         1
162  serp-test : taxonomy support industry -- acade...           1         1
163  model-based testing digital tvs : industry-as-...           1         1

## Convert dataframe to CSV and send to drive

In [56]:
df_report = df_testing

unused_columns = ['features', 'years']
df_report.drop(unused_columns, inplace=True, axis=1)
df_report.rename(columns = {'categories':'Was Selected?'}, inplace = True)

with codecs.open(result_file_path, 'w', encoding='utf-8') as report_file:
    df_report.to_csv(report_file, index=False)

# END