# Snowballing

## Imports and Constants

In [None]:
import pandas as pd
import os
from functools import reduce
import numpy as np

# Column names / dictionary keys shared by the cells of this notebook.
IDENTIFIER = 'Identifier'
DOI = 'DOI'
TITLE = 'Title'
ABSTRACT = 'Abstract'
AUTHORS = 'Authors'
PUBLISHED = 'Published'
PUBLISHED_IN = 'Published_In'
# Column recording where a study came from ('backward', 'forward' or 'original').
SOURCE = 'Source'
RESOLVED_DOI = 'Resolved DOI'
# NOTE(review): "Abbrevation" is misspelled, but the value is a runtime column
# label, so it is intentionally left unchanged.
ABBREVATION = 'Abbrevation'
RANK = 'Rank'

## Read DOIs

In [None]:
def hasDOI(entry):
    """Tell whether an input row carries a DOI, reporting rows that do not.

    Parameters
    ----------
    entry : mapping
        One row of the input table; read through the ``DOI`` and
        ``IDENTIFIER`` keys.

    Returns
    -------
    bool
        ``True`` when the DOI value is present, ``False`` when it is missing
        (NaN or None).
    """
    doi_exists = bool(pd.notna(entry[DOI]))
    if not doi_exists:
        # str() keeps the report from raising TypeError when the identifier
        # itself is missing (NaN) or otherwise not a string.
        print(str(entry[IDENTIFIER]) + ' has no DOI')
    return doi_exists

# Load the input table and keep, as one dict per row, only rows with a DOI.
input_dois_pd = pd.read_csv('./input_dois.csv')
input_dois = [row for row in input_dois_pd.T.to_dict().values() if hasDOI(row)]

print()
print('loaded ' + str(len(input_dois)) + ' DOIs')

## Filter Input DOIs

In [None]:
from lib.forward import get_forward_references

# Default cap on how many identifiers filter_most_used returns per call.
MAX_PAPERS_PER_METRIC = 9999

def get_selected_identifiers(row):
    """Collect the identifiers whose cell in ``row`` is marked 'x' or 'X'.

    Rows named after the IDENTIFIER, TITLE or DOI constants yield ``None``.
    For every other row, the positions of the marked cells are looked up in
    the module-level ``identifiers`` sequence and returned as a list.
    """
    if row.name in (IDENTIFIER, TITLE, DOI):
        return None
    return [
        identifiers[position]
        for position, cell in enumerate(row)
        if cell == 'x' or cell == 'X'
    ]

# Transpose so that every column of the input table becomes a row that is
# handed to get_selected_identifiers.
df = input_dois_pd.T
# First row of the transposed frame, i.e. the first column of the input table.
identifiers = df.iloc[0].values
papers_per_metric_pd = df.apply(get_selected_identifiers, axis=1).to_frame()
# Drop the rows for which get_selected_identifiers returned None.
papers_per_metric_pd = papers_per_metric_pd[papers_per_metric_pd[0].notnull()]

def get_doi(identifier):
    """Return the DOI stored in ``input_dois_pd`` for ``identifier``.

    Returns the DOI cell of the first row whose IDENTIFIER equals
    ``identifier`` (the cell may be NaN when that row has no DOI).
    Raises ``IndexError`` when no row carries that identifier.
    """
    # Exact comparison rather than Series.str.match: match() interprets the
    # identifier as a regular expression anchored only at the start, so regex
    # metacharacters or an identifier that is a prefix of another one could
    # select the wrong row.
    matching_rows = input_dois_pd[input_dois_pd[IDENTIFIER] == identifier]
    return matching_rows[DOI].values[0]

def filter_most_used(papers, amount=MAX_PAPERS_PER_METRIC):
    """Rank ``papers`` by the number of forward references and keep the top ones.

    Each identifier in ``papers`` is translated to a DOI with get_doi;
    identifiers whose DOI stringifies to 'nan' are skipped. The remaining
    ones are ordered by the length of the get_forward_references result,
    largest first, and at most ``amount`` identifiers are returned.
    """
    citation_counts = []
    for identifier in papers:
        doi_text = str(get_doi(identifier))
        if doi_text != 'nan':
            citing = get_forward_references({IDENTIFIER: identifier, DOI: doi_text})
            citation_counts.append((identifier, len(citing)))
    # sorted() is stable, so ties keep their input order just like list.sort().
    ranked = sorted(citation_counts, key=lambda pair: pair[1], reverse=True)
    return [identifier for identifier, _ in ranked][:amount]

# Reduce the selection of every row to its most cited papers.
most_used_papers_per_metric_pd = papers_per_metric_pd.apply(lambda r: r.apply(filter_most_used))
# The [] initializer keeps reduce from raising TypeError when there is no row
# with a selection at all.
most_used_papers = list(reduce(list.__add__, most_used_papers_per_metric_pd[0].values, []))

number_before_filtering = len(input_dois)
# Keep only the input rows whose identifier survived the ranking above.
input_dois = [d for d in input_dois if d[IDENTIFIER] in most_used_papers]
number_after_filtering = len(input_dois)

print()
print(str(number_after_filtering) + ' papers remain after filtering (' + str(number_before_filtering) + ')')
# Proceedings of the 28th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering

# Backwards Snowballing

In [None]:
from lib.backward import get_backward_references

# One lookup per input paper; a None result is counted as "no references".
backward_references = [get_backward_references(entry) for entry in input_dois]

number_of_backward_references = len(backward_references)
number_with_no_references = len([refs for refs in backward_references if refs is None])

# From here on only the successful lookups are kept.
backward_references = [refs for refs in backward_references if refs is not None]

print()
print('loaded ' + str(number_of_backward_references) + ' backward reference entries, ' + str(number_with_no_references) + ' of them had no references')

## Resolve DOIs

In [None]:
from lib.backward import get_by_doi

# Extract the DOIs of all backward references. A nested comprehension is used
# instead of reduce(list.__add__, ...) so that an empty backward_references
# list yields [] rather than raising TypeError.
backward_dois = [doi for entry in backward_references for doi in entry['dois']]
backward_dois.sort()

# Look every DOI up and drop the lookups that returned None.
backward_references_from_dois = [get_by_doi(doi) for doi in backward_dois]
backward_references_from_dois = [b for b in backward_references_from_dois if b is not None]

print()
print('loaded '+ str(len(backward_references_from_dois)) + ' entries')


## Load references from bib files

In [None]:
from lib.backward import read_bib_file
from lib.general import get_published_in

# Gather the entries of every .bib file found below ../papers.
bib_entries = []
for root, dirs, files in os.walk("../papers"):
    bib_names = [name for name in files if name.endswith(".bib")]
    for file in bib_names:
        bib_entries += read_bib_file(root, file)

# Keep the five fields of interest and rename them to the notebook's column names.
backward_references_bib_df = pd.DataFrame(bib_entries)[['title', 'author', 'date', 'doi', 'referenced_by']]
backward_references_bib_df.columns = [TITLE, AUTHORS, PUBLISHED, DOI, 'Referenced_By']
# Entries lacking a title or an author are discarded.
backward_references_bib_df = backward_references_bib_df[
    backward_references_bib_df[TITLE].notnull() & backward_references_bib_df[AUTHORS].notnull()
]
backward_references_bib_df[PUBLISHED_IN] = backward_references_bib_df[TITLE].map(get_published_in)

print()
print('loaded ' + str(len(backward_references_bib_df)) + ' bib-entries')

## List references without DOI

In [None]:
import json

# Flatten the 'non_dois' lists of all (non-None) backward reference entries.
# A comprehension is used instead of reduce(list.__add__, ...) so that an
# empty backward_references list yields [] rather than raising TypeError.
references_without_dois = [
    reference
    for entry in backward_references
    if entry is not None
    for reference in entry['non_dois']
]

with open('intermediate/backward_references_without_dois.json', 'w') as outfile:
    json.dump(references_without_dois, outfile, indent=4)

print()
print('Have found '+ str(len(references_without_dois)) + ' references without dois')

## Finish backward snowballing

In [None]:
TITLE = 'Title'

# Combine the references resolved via DOI with those read from the bib files.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the same row order.
backward_references_df = pd.concat([pd.DataFrame(backward_references_from_dois), backward_references_bib_df])

number_before_filtering = len(backward_references_df.index)

# Keep titles mentioning one of the keywords (case-insensitive); na=False
# drops rows whose title is missing.
backward_references_df = backward_references_df[
    backward_references_df[TITLE].str.match('.*(test|mutant|mutation|coverage).*', case=False, na=False)
]
# Reassign instead of inplace=True so the filtered slice is never modified in place.
backward_references_df = backward_references_df.drop_duplicates(subset=[TITLE])

backward_references_df = backward_references_df.sort_values(TITLE)
backward_references_df.to_csv('intermediate/backward.csv', index = False, header=True)

number_after_filtering = len(backward_references_df.index)

print()
print(str(number_after_filtering) + ' studies remain after filtering the backward references (' + str(number_before_filtering) + ')')

# Forwards Snowballing

In [None]:
import os
from functools import reduce
from lib.forward import set_scholarly_logging
from lib.forward import get_forward_references
from lib.general import get_published_in

set_scholarly_logging(False)
# One get_forward_references result per input paper.
cited_by_group = [get_forward_references(entry) for entry in input_dois]


def flat_map(xs):
    """Concatenate the items of every iterable in ``xs`` into one list.

    Returns ``[]`` for an empty ``xs`` (a bare reduce without initializer
    would raise TypeError in that case).
    """
    return [item for group in xs for item in group]


forward_references = [citation['bib'] for citation in flat_map(cited_by_group)]

# reindex, unlike .loc with a list of labels, does not raise KeyError when a
# column is absent (e.g. when no forward reference was found at all); missing
# columns are filled with NaN instead.
forward_references_df = pd.DataFrame(forward_references).reindex(columns=['title', 'abstract', 'author', 'pub_year'])
forward_references_df.columns = [TITLE, ABSTRACT, AUTHORS, PUBLISHED]
forward_references_df[PUBLISHED_IN] = forward_references_df[TITLE].map(get_published_in)

print()
print('loaded ' + str(len(forward_references)) + ' forward reference entries')

## Finish forward snowballing

In [None]:
number_before_filtering = len(forward_references_df.index)

# Keep titles mentioning one of the keywords. case=False matches the filter
# applied to the backward references; without it a title such as
# "Test suite reduction" (capital T) would be dropped. na=False discards rows
# whose title is missing.
forward_references_df = forward_references_df[
    forward_references_df[TITLE].str.match('.*(test|mutant|mutation|coverage).*', case=False, na=False)
]
# Reassign instead of inplace=True so the filtered slice is never modified in place.
forward_references_df = forward_references_df.drop_duplicates(subset=[TITLE])

forward_references_df = forward_references_df.sort_values(TITLE)
forward_references_df.to_csv('intermediate/forward.csv', index = False, header=True)

number_after_filtering = len(forward_references_df.index)

print()
print(str(number_after_filtering) + ' studies remain after filtering the forward references (' + str(number_before_filtering) + ')')

# Merge results

In [None]:
from lib.general import drop_duplicates_normalized

# Reload the intermediate results and tag every row with its origin.
backward_pd = pd.read_csv('intermediate/backward.csv')
backward_pd[SOURCE] = 'backward'

forward_pd = pd.read_csv('intermediate/forward.csv')
forward_pd = forward_pd.loc[:, [TITLE, AUTHORS, PUBLISHED, PUBLISHED_IN]]
forward_pd[SOURCE] = 'forward'

all_stuides_pd = pd.read_csv('../all_studies.csv')
all_stuides_pd = all_stuides_pd.loc[:, ['Title', 'Author', 'Year']]
all_stuides_pd.columns = [TITLE, AUTHORS, PUBLISHED]
all_stuides_pd[SOURCE] = 'original'

# DataFrame.append was removed in pandas 2.0; a single pd.concat call stacks
# the three tables in the same order (backward, forward, original).
result_pd = pd.concat([backward_pd, forward_pd, all_stuides_pd])

# NOTE(review): the return value is discarded, so this presumably removes
# duplicates in place - confirm against lib.general.drop_duplicates_normalized.
drop_duplicates_normalized(result_pd, TITLE)
# Only rows tagged 'backward' or 'forward' are kept; rows that came from
# all_studies.csv are removed again here.
result_pd = result_pd[result_pd[SOURCE].str.match('.*(backward|forward).*')==True]

number_result = len(result_pd.index)

print()
print(str(number_result) + ' studies have been found after merging forward and backward snowballing')

# Filter Abstract

In [None]:
from lib.general import resolve_doi

# NOTE(review): absolute, machine-specific path - the file must exist at
# exactly this location for the cell to run.
abstracts_pd = pd.read_csv('/home/stefan/Desktop/jabref/abstracts.csv')
# Both dicts are keyed by the row index of abstracts_pd.
abstracts_dois = abstracts_pd['DOI'].to_dict()
abstracts_texts = abstracts_pd['Abstract'].to_dict()

# DOI -> abstract text; when a DOI occurs in several rows the last row wins.
abstracts = {doi: abstracts_texts[row] for row, doi in abstracts_dois.items()}

def get_abstract(doi):
    """Return the abstract stored for ``doi`` in ``abstracts``, or '' if there is none."""
    return abstracts.get(doi, '')

# resolve_doi is applied to every title; the abstract is then looked up under
# the value it returned.
result_pd[RESOLVED_DOI] = result_pd[TITLE].map(resolve_doi)
result_pd[ABSTRACT] = result_pd[RESOLVED_DOI].map(get_abstract)

# Manually collected abstracts, kept as a list of {TITLE, ABSTRACT} records.
manuel_abstracts_pd = pd.read_csv('intermediate/manuel_abstract.csv')[[TITLE, ABSTRACT]]
manuel_abstracts = manuel_abstracts_pd.to_dict('records')
def find_abstract(title):
    """Return the manually collected abstract for ``title``.

    Only the first record in ``manuel_abstracts`` whose TITLE equals
    ``title`` is considered. Returns '' when ``title`` is None, when no
    record matches, or when the matching record has no abstract (NaN/None).
    """
    if title is None:
        return ''
    for record in manuel_abstracts:
        if record[TITLE] == title:
            abstract = record[ABSTRACT]
            # pd.isna instead of `is np.NaN`: the identity test only succeeds
            # for the one np.nan object (not for ordinary float NaN values),
            # and the np.NaN alias no longer exists in NumPy 2.0.
            return '' if pd.isna(abstract) else abstract
    return ''
def take_abstract(entry):
    """Return the abstract of ``entry``, falling back to the manual abstracts.

    ``entry`` is read through the ABSTRACT and TITLE keys. When its abstract
    is empty or missing (''/None/NaN), find_abstract(entry[TITLE]) is
    returned instead.
    """
    abstract = entry[ABSTRACT]
    # pd.isna covers both None and NaN; a NaN abstract would otherwise bypass
    # the fallback and later be neither counted as "without abstract" nor
    # matched by the keyword filter.
    if pd.isna(abstract) or abstract == '':
        return find_abstract(entry[TITLE])
    return abstract
result_pd[ABSTRACT] = result_pd.apply(take_abstract, axis=1)

# Export the studies that still have no abstract so they can be completed by
# hand. .copy() makes the selection an independent frame, so adding the
# 'Skip' column does not assign into a slice of result_pd.
without_abstract_pd = result_pd[result_pd[ABSTRACT] == ''].copy()
without_abstract_pd['Skip'] = ''
without_abstract_pd = without_abstract_pd.loc[:, [TITLE, AUTHORS, 'Skip', ABSTRACT]]
number_without_abstract = len(without_abstract_pd)

without_abstract_pd.to_csv('intermediate/without_abstract.csv', index = False, header=True)

number_before_abstract_check = len(result_pd)
# Plain substring search instead of str.match('.*(test suite).*'): match() is
# anchored at the start and '.' does not cross newlines, so for a multi-line
# abstract only its first line would be inspected. na=False drops rows whose
# abstract is missing.
result_pd = result_pd[result_pd[ABSTRACT].str.contains('test suite', regex=False, na=False)]
number_after_abstract_check = len(result_pd)

print()
print(str(number_after_abstract_check) + ' studies remain after abstract check (' + str(number_before_abstract_check) + ')')
print('for ' + str(number_without_abstract) + ' studies no abstract has been found')

# Filter For Credible Publications

In [None]:
import pandas as pd

# Ranks that are accepted by the credibility filter.
RANKS = ['A', 'A*']

# Conferences: the file has no header row; columns 1, 2 and 4 are used as
# title, abbreviation and rank.
core_pd = pd.read_csv('core-sources/CORE.csv', header=None, usecols=[0,1,2,3,4,5,6,7])
core_pd = core_pd[[1, 2, 4]]
core_pd.columns = [TITLE, ABBREVATION, RANK]
core_conferences = [c for c in core_pd.to_dict(orient='records') if c[RANK] in RANKS]
core_conferences_abbrevations = [c[ABBREVATION] for c in core_conferences]

# Journals: only the 'title' and 'rank' columns are needed.
core_journals_pd = pd.read_csv('core-sources/CORE_journals.csv')[['title', 'rank']]
core_journals_pd.columns = [TITLE, RANK]
core_journals = [j for j in core_journals_pd.to_dict(orient='records') if j[RANK] in RANKS]
core_journals_title = [j[TITLE] for j in core_journals]

def is_in_core(published_in):
    """Tell whether ``published_in`` names an accepted conference or journal.

    Returns ``True`` when any entry of ``core_conferences_abbrevations``
    occurs as a substring of ``published_in``, or when ``published_in``
    equals an entry of ``core_journals_title``. Non-string input (e.g. NaN)
    yields ``False``.
    """
    if not isinstance(published_in, str):
        return False
    # Abbreviations that are not strings (NaN for an empty CSV cell) are
    # skipped: `nan in "text"` would raise TypeError.
    if any(isinstance(a, str) and a in published_in for a in core_conferences_abbrevations):
        return True
    return published_in in core_journals_title

number_before_cred_check = len(result_pd)
# Boolean mask: True for rows whose venue passes is_in_core.
credible_rows = result_pd[PUBLISHED_IN].map(is_in_core)
result_pd = result_pd[credible_rows]
number_after_cred_check = len(result_pd)

print()
print(str(number_after_cred_check) + ' studies remain after credibility check (' + str(number_before_cred_check) + ')')

# Store Results

In [None]:
# Order by title and keep only the columns that are written to result.csv.
result_pd = result_pd.sort_values(TITLE)[[TITLE, PUBLISHED_IN, ABSTRACT]]
result_pd.to_csv('result.csv', header=True, index=False)

number_result = len(result_pd)

print()
print(str(number_result) + ' studies have been found through snowballing')

# Read Results

In [None]:
import pandas as pd

INDEX = 'Index'

# result_checked.csv is expected to carry a 'USE' column marking the studies to keep.
result_selected_pd = pd.read_csv('result_checked.csv')

number_of_studies = len(result_selected_pd.index)
# .copy() makes the selection an independent frame, so adding the INDEX
# column below does not assign into a slice of the frame that was read.
result_selected_pd = result_selected_pd[result_selected_pd['USE'] == True].copy()
number_of_selected_studies = len(result_selected_pd.index)

# Label every kept study 'snow<row index>', using its row index in result_checked.csv.
result_selected_pd[INDEX] = result_selected_pd.index.map(lambda i: 'snow' + str(i))
result_selected_pd = result_selected_pd.loc[:, [INDEX, TITLE, PUBLISHED_IN, ABSTRACT]]

result_selected_pd.to_csv('result_final.csv', index=False, header=True)

print(str(number_of_selected_studies) + ' studies remain after snowballing manual check (' + str(number_of_studies) + ')')