# Initial Setup

In [None]:
#### Imports
import csv
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from matplotlib.ticker import FuncFormatter
%config InlineBackend.figure_format = 'png'
%matplotlib inline

#### Show (almost) every column when a table is displayed
pd.set_option('display.max_columns', 100)

#### Plot style
sns.set_style("white")

#### Custom parameters
tablePath = './SLR_NFR_CLASSIFICATION.xlsx'

#### Load the workbook sheets used by the plotting functions below
studyDataSum = pd.read_excel(tablePath, sheet_name='selected_papers_summary')
studyDataAlg = pd.read_excel(tablePath, sheet_name='selected_papers_algorithms')

## The evaluation sheet is currently not loaded; kept here for reference
#studyDataEval = pd.read_excel(tablePath, sheet_name='selected_papers_evaluation')

### Renaming some blank columns
## The third column (position 2) of the algorithms sheet is given an explicit name
#studyDataEval = studyDataEval.rename(columns={studyDataEval.columns[2]: 'Type'})
studyDataAlg = studyDataAlg.rename(columns={studyDataAlg.columns[2]: 'AlgorithmType'})

# Functions for Data Visualization

In [None]:
def showMostUsedNFR():
  """Plot a horizontal bar chart of how many times each NFR is mentioned.

  Reads the comma-separated 'NFR(s)' column of the module-level
  `studyDataSum` frame, counts every NFR over all rows and draws the totals
  in descending order.

  Returns:
    None. The chart is rendered with `plt.show()`.
  """
  ## One entry per individual NFR mention. Tokens are stripped so that
  ## 'A, B' and 'A,B' count towards the same NFR; rows without a value and
  ## empty tokens (e.g. from a trailing comma) are skipped.
  nfrMentions = (
      studyDataSum['NFR(s)']
      .dropna()
      .astype(str)
      .str.split(',')
      .explode()
      .str.strip()
  )
  nfrMentions = nfrMentions[nfrMentions != '']

  ## value_counts() already returns the totals sorted from most to least mentioned
  nfrCounts = nfrMentions.value_counts()
  nfrFreq_df = pd.DataFrame({
      'NFR': nfrCounts.index,
      'Times Mentioned': nfrCounts.to_numpy(),
  })

  ### Plotting
  plt.figure(figsize=(7, 5))
  sns.set_theme(style='white',font_scale=0.95)
  ax = sns.barplot(data=nfrFreq_df,x='Times Mentioned',y='NFR')
  ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: int(x)))  ## integer tick labels
  ax.bar_label(ax.containers[0])  ## write the count at the end of each bar
  ax.tick_params(axis='x',pad=-3)
  ax.tick_params(axis='y',pad=-3)
  sns.despine()
  plt.show()

def correlationHeatMapNFR():
  """Plot a heatmap of the pairwise correlation between NFRs across studies.

  Builds a study x NFR table from the comma-separated 'NFR(s)' column of the
  module-level `studyDataSum` frame (cell value = number of times the NFR is
  listed for that study, 0 when it is not listed), computes
  `DataFrame.corr()` over the NFR columns and draws the part of the matrix
  below the diagonal.

  Returns:
    None. The chart is rendered with `plt.show()`.
  """
  ## Rows with no NFR value are left out instead of crashing Counter()
  nfrLists = studyDataSum['NFR(s)'].dropna().astype(str).str.split(',')

  ## Count the NFRs of each study. Tokens are stripped so that 'A, B' and
  ## 'A,B' end up in the same column; empty tokens are ignored.
  perStudyCounts = {
      study: Counter(token.strip() for token in tokens if token.strip())
      for study, tokens in nfrLists.items()
  }

  ## One row per study, one column per NFR; NaN means "not listed" -> 0
  nfrFreq_df = pd.DataFrame.from_dict(perStudyCounts, orient='index').fillna(0)
  matrix_corr = nfrFreq_df.corr()

  ## Hide the diagonal and the upper triangle (the matrix is symmetric)
  mask = np.triu(np.ones_like(matrix_corr, dtype=bool))

  ##Plotting
  plt.figure(figsize=(15, 8))
  sns.set_theme(style='white',font_scale=0.9)
  heatmap = sns.heatmap(matrix_corr,mask=mask, vmin=-1, vmax=1, annot=True,annot_kws={"fontsize":9}, cmap='BrBG')
  heatmap.set_title("Correlation between all studied NFR's", fontdict={'fontsize':12}, pad=8)
  heatmap.tick_params(axis='x',pad=-3)
  heatmap.tick_params(axis='y',pad=-3)
  plt.show()

def showMostUsedDatasets(colorText):
  """Plot a pie chart of how often each dataset is used by the studies.

  Reads the comma-separated 'Dataset: Name/Reference' column of the
  module-level `studyDataSum` frame, counts every dataset over all rows,
  drops the 'Not Specified' entries and draws the totals. Each wedge is
  labelled with its absolute count and the most frequent dataset(s) are
  slightly pulled out of the pie.

  Args:
    colorText: palette name (or any other value accepted by
      `sns.color_palette`) used to color the wedges.

  Returns:
    None. The chart is rendered with `plt.show()`.
  """
  ## One entry per individual dataset mention. Tokens are stripped so that
  ## 'A, B' and 'A,B' count towards the same dataset and so that
  ## ' Not Specified' is recognised by the filter below.
  datasetMentions = (
      studyDataSum['Dataset: Name/Reference']
      .dropna()
      .astype(str)
      .str.split(',')
      .explode()
      .str.strip()
  )
  datasetMentions = datasetMentions[
      (datasetMentions != '') & (datasetMentions != 'Not Specified')
  ]

  ## value_counts() already returns the totals sorted from most to least used
  datasetCounts = datasetMentions.value_counts()
  frequencies = datasetCounts.to_numpy()

  color = sns.color_palette(colorText)

  ## Offset of 0.05 for the wedge(s) with the highest count, 0 for the rest
  explode = (frequencies == datasetCounts.max()) * 0.05

  ### Plotting
  plt.pie(frequencies,colors=color,autopct=autopct_format(frequencies), explode=explode)
  plt.legend(title ="Datasets",
          loc ="center left",
          bbox_to_anchor =(1, 0, 0.5, 1),labels=list(datasetCounts.index))
  plt.show()


def autopct_format(values):
    """Build a formatter that turns a percentage into an absolute count.

    The returned callable is meant to be passed as `autopct` to `plt.pie`,
    so the wedges show the underlying count instead of the percentage.

    Args:
        values: iterable of numbers whose sum is the 100% reference.

    Returns:
        A function taking a percentage (0-100) and returning the matching
        share of `sum(values)`, rounded to the nearest integer, as a string.
    """
    def to_absolute(pct):
        # The sum is taken on every call, exactly when the label is rendered
        share = pct * sum(values) / 100.0
        return '{v:d}'.format(v=int(round(share)))

    return to_absolute

def showMostUsedAlgorithms():
  """Plot a horizontal bar chart of how many rows use each technique.

  Reads the 'AlgorithmType' column of the module-level `studyDataAlg` frame.
  Blank cells are filled with the last value seen above them, then the rows
  are counted per technique and drawn in descending order.

  NOTE(review): the axis label calls these counts "papers", which assumes
  one row of the sheet per paper -- confirm against the spreadsheet.

  Returns:
    None. The chart is rendered with `plt.show()`.
  """
  countLabel = 'Nº of papers implementing a technique'

  ## Propagate each technique name down over the blank cells that follow it
  algorithmTypes = studyDataAlg['AlgorithmType'].ffill()

  ## Rows per technique. Unlike a manual run-length loop this also counts the
  ## last group of rows and adds up a technique that appears in more than one
  ## group. Rows above the first named technique (still NaN) are ignored.
  algorithmCounts = algorithmTypes.value_counts()

  studyAlg_df = pd.DataFrame({
      'Technique': algorithmCounts.index,
      countLabel: algorithmCounts.to_numpy(),
  })

  ### Plotting
  sns.set_theme(style='white',font_scale=0.95)
  ax = sns.barplot(data=studyAlg_df,x=countLabel,y='Technique')
  ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: int(x)))  ## integer tick labels
  ax.bar_label(ax.containers[0])  ## write the count at the end of each bar
  ax.tick_params(axis='x',pad=-3)
  ax.tick_params(axis='y',pad=2)
  sns.despine()
  plt.show()

# Applying Functions

In [None]:
# Bar chart: number of times each NFR is mentioned across the studies
showMostUsedNFR()

In [None]:
# Heatmap: pairwise correlation between the NFRs listed per study
correlationHeatMapNFR()

In [None]:
# Pie chart: dataset usage counts, colored with the "tab20" palette
showMostUsedDatasets("tab20")

In [None]:
# Bar chart: number of rows of the algorithms sheet per technique
showMostUsedAlgorithms()