## Required imports

In [331]:
# For plots.
from tkinter.messagebox import NO
from pip import main

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import plotly.io as pio
pio.kaleido.scope.mathjax = None

# For csv reading.
import pandas as pd

# For maths.
import numpy as np

# For fuzzy markers
import random

## Functions to retrieve journal and conference rankings

In [332]:
def retrieve_quartile(row, dic_quartiles_dfs):
    if row['Publication Source'] != 'Journal':
        return None
    title = row['Publication Title']
    sourceID = row['SourceID']
    year = row['Publication Year']
    if (year == 2022):
        return 'Not assigned'

    print(f'Title: {title}\nSourceID: {sourceID}')
    df_quartiles = dic_quartiles_dfs[f'{year}']

    df_title_contains = df_quartiles[df_quartiles['Sourceid'] == sourceID]

    if df_title_contains.empty:
        print('Empty!')
        return 'Not assigned'

    print(f'Article from year {year}')
    print(df_title_contains.iloc[0][f'SJR Best Quartile'])

    return df_title_contains.iloc[0]['SJR Best Quartile']

def retrieve_conference(row, dic_conferences_dfs):
    if row['Publication Source'] != 'Conference':
        return None

    title = row['Publication Title']
    conferenceID = row['ConferenceID']
    year = row['Publication Year']
    if (year == 2022):
        return 'Not assigned'

    print(f'Title: {title}\ConferenceID: {conferenceID}')
    df_conferences = dic_conferences_dfs[f'{year}']

    df_title_contains = df_conferences[df_conferences[0] == conferenceID]

    if df_title_contains.empty:
        print('Empty!')
        return 'Not assigned'

    print(f'Article from year {year}')
    print(df_title_contains.iloc[0][4])

    return df_title_contains.iloc[0][4]

# Retrieve jornal rankings from scimagojr databases

In [333]:
df_quality = pd.read_csv('QualityChecklists.csv', sep=',', decimal='.')

dic_scimagojr_dfs = {}
dic_scimagojr_dfs['2021'] = pd.read_csv(
    'scimagojr 2021.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2020'] = pd.read_csv(
    'scimagojr 2020.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2019'] = pd.read_csv(
    'scimagojr 2019.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2018'] = pd.read_csv(
    'scimagojr 2018.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2017'] = pd.read_csv(
    'scimagojr 2017.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2016'] = pd.read_csv(
    'scimagojr 2016.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2015'] = pd.read_csv(
    'scimagojr 2015.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2014'] = pd.read_csv(
    'scimagojr 2014.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2013'] = pd.read_csv(
    'scimagojr 2013.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2012'] = pd.read_csv(
    'scimagojr 2012.csv', sep=';', decimal=',', quotechar="\"")
dic_scimagojr_dfs['2011'] = pd.read_csv(
    'scimagojr 2011.csv', sep=';', decimal=',', quotechar="\"")

df_quality['SJCR Quartile'] = df_quality.apply(
    lambda row: retrieve_quartile(row, dic_scimagojr_dfs), axis=1).replace('-', np.nan)

Title: IEEE Transactions on Quantum Engineering
SourceID: nan
Empty!
Title: Neural Computing and Applications
SourceID: 24800.0
Article from year 2019
Q1
Title: Energies
SourceID: 62932.0
Article from year 2016
Q1
Title: International Journal of Intelligent Systems and Applications
SourceID: 21101021990.0
Empty!
Title: International Journal of Systems Assurance Engineering and Management
SourceID: 19700177002.0
Article from year 2020
Q3
Title: Applied Thermal Engineering
SourceID: 13688.0
Article from year 2020
Q1
Title: International Transactions on Electrical Energy Systems
SourceID: 21100241220.0
Article from year 2021
Q2
Title: Journal of Cloud Computing
SourceID: 21100383744.0
Article from year 2020
Q2
Title: Energy Conversion and Management
SourceID: 29372.0
Article from year 2020
Q1
Title: IEEE Transactions on Systems, Man, and Cybernetics: Systems
SourceID: 21100262320.0
Article from year 2013
Q1
Title: Frontiers in ICT
SourceID: 21100893766.0
Article from year 2017
-
Title: Fr

## Retrieve CORE rankings from CORE databases

In [334]:
dic_CORE_dfs = {}
dic_CORE_dfs['2021'] = pd.read_csv(
    'CORE 2021.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2020'] = pd.read_csv(
    'CORE 2020.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2019'] = pd.read_csv(
    'CORE 2020.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2018'] = pd.read_csv(
    'CORE 2018.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2017'] = pd.read_csv(
    'CORE 2017.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2016'] = pd.read_csv(
    'CORE 2017.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2015'] = pd.read_csv(
    'CORE 2017.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2014'] = pd.read_csv(
    'CORE 2014.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2013'] = pd.read_csv(
    'CORE 2013.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2012'] = pd.read_csv(
    'CORE 2013.csv', sep=',', quotechar="\"", header=None)
dic_CORE_dfs['2011'] = pd.read_csv(
    'CORE 2013.csv', sep=',', quotechar="\"", header=None)

df_quality['CORE Ranking'] = df_quality.apply(
    lambda row: retrieve_conference(row, dic_CORE_dfs), axis=1).replace('-', np.nan)

Title: Proceedings of the Genetic and Evolutionary Computation Conference Companion\ConferenceID: 556.0
Article from year 2018
A
Title: 15th ACM Conference on Recommender Systems, RecSys 2021, September 27, 2021  -  October 1, 2021\ConferenceID: 28.0
Article from year 2021
A
Title: 2020 International Conference for Emerging Technology, INCET 2020, June 5, 2020  -  June 7, 2020\ConferenceID: nan
Empty!
Title: nan\ConferenceID: nan
Empty!
Title: nan\ConferenceID: nan
Empty!
Title: nan\ConferenceID: nan
Empty!
Title: nan\ConferenceID: nan
Empty!
Title: 2019 IEEE Congress on Evolutionary Computation (CEC)\ConferenceID: 2061.0
Article from year 2019
B
Title: IEEE-International Conference On Advances In Engineering, Science And Management (ICAESM -2012)\ConferenceID: nan
Empty!
Title: 2020 Chinese Automation Congress (CAC)\ConferenceID: nan
Empty!
Title: 2019 2nd World Conference on Mechanical Engineering and Intelligent Manufacturing (WCMEIM)\ConferenceID: nan
Empty!
Title: 2019 Chinese Con

In [335]:
df = df_quality.groupby(by=['Publication Source']).size().reset_index(name='# of studies')

fig = px.bar(data_frame = df, x='Publication Source', y='# of studies', text_auto=True)
# fig.write_image('validation_journal.png')
fig.show()

## Validate study selection with distributions of studies according to their rankings

In [336]:
df_journals = df_quality[df_quality['Publication Source'] == 'Journal'] 

df = df_journals.groupby(by=['SJCR Quartile']).size().reset_index(name='# of studies')

fig = px.bar(data_frame = df, x='SJCR Quartile', y='# of studies', text_auto=True)
# fig.write_image('validation_journal.png')
fig.show()

In [337]:
df_conferences = df_quality[df_quality['Publication Source'] == 'Conference'] 

df = df_conferences.groupby(by=['CORE Ranking']).size().reset_index(name='# of studies')

fig = px.bar(data_frame = df, x='CORE Ranking', y='# of studies', text_auto=True, color_discrete_sequence=px.colors.qualitative.Set1)
# fig.write_image('validation_conference.png')
fig.show()

## Quality Assessment

In [338]:
fig = px.histogram(df_quality, x='SSTOTAL', range_x=[0, 1], range_y=[0, 80], nbins=10, text_auto=True, color_discrete_sequence=px.colors.qualitative.Set2)
fig.show()

mean = np.mean(df_quality['SSTOTAL'])
print(f'mean: {mean}')

std = np.std(df_quality['SSTOTAL'])
print(f'std: {std}')

variance = np.var(df_quality['SSTOTAL'])
print(f'variance: {variance}')

min = np.min(df_quality['SSTOTAL'])
print(f'min: {min}')

max = np.max(df_quality['SSTOTAL'])
print(f'max: {max}')

median = np.median(df_quality['SSTOTAL'])
print(f'median: {median}')


mean: 0.6139894180333335
std: 0.1480587069162181
variance: 0.021921380693702572
min: 0.142857143
max: 0.875
median: 0.625


In [339]:
fig = px.histogram(df_quality, x='SRTOTAL', range_x=[0, 1], range_y=[0, 80], nbins=10, text_auto=True, color_discrete_sequence=px.colors.qualitative.Set3)
fig.show()

mean = np.mean(df_quality['SRTOTAL'])
print(f'mean: {mean}')

std = np.std(df_quality['SRTOTAL'])
print(f'std: {std}')

variance = np.var(df_quality['SRTOTAL'])
print(f'variance: {variance}')

min = np.min(df_quality['SRTOTAL'])
print(f'min: {min}')

max = np.max(df_quality['SRTOTAL'])
print(f'max: {max}')

median = np.median(df_quality['SRTOTAL'])
print(f'median: {median}')

mean: 0.6657557997800002
std: 0.10142554408566723
variance: 0.010287140993073627
min: 0.230769231
max: 0.866666667
median: 0.666666667


In [340]:
df_fuzzed = df_quality.copy()

df_fuzzed['SSTOTAL'] = df_fuzzed.apply(
    lambda row: row['SSTOTAL'] + random.uniform(-0.02, 0.02), axis=1)

df_fuzzed['SRTOTAL'] = df_fuzzed.apply(
    lambda row: row['SRTOTAL'] + random.uniform(-0.02, 0.02), axis=1)

fig = px.scatter(x=df_fuzzed['SSTOTAL'], y=df_fuzzed['SRTOTAL'])

fig.update_xaxes(range=[0, 1])
fig.update_yaxes(range=[0, 1])

fig.add_shape(type='line',
              x0=0,
              y0=0,
              x1=1,
              y1=1,
              line=dict(color='Red',),
              xref='x',
              yref='y'
              )
fig.show()


correlate = np.correlate(df_quality['SSTOTAL'], df_quality['SRTOTAL'])
print(f'correlate: {correlate}')

correlate: [62.03031064]


In [341]:
from scipy.stats import spearmanr

cor, pval = spearmanr(a=df_quality['SSTOTAL'], b=df_quality['SRTOTAL'], alternative='greater')

print(f'cor: {cor}\npval: {pval}')

cor: 0.3030565071230454
pval: 8.173382781403534e-05


In [342]:
from scipy.stats import spearmanr

cor, pval = spearmanr(a=df_quality['SSTOTAL'], b=df_quality['SRTOTAL'], alternative='greater')

print(f'cor: {cor}\npval: {pval}')

cor: 0.3030565071230454
pval: 8.173382781403534e-05


In [343]:
def quartile_to_int(row):
    if row['SJCR Quartile'] == 'Q1':
        return 1
    if row['SJCR Quartile'] == 'Q2':
        return 2
    if row['SJCR Quartile'] == 'Q3':
        return 3
    if row['SJCR Quartile'] == 'Q4':
        return 4
    return 5

df_journals['int_quartile'] = df_journals.apply (lambda row: quartile_to_int(row), axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [344]:
def core_ranking_to_int(row):
    if row['CORE Ranking'] == 'A':
        return 1
    if row['CORE Ranking'] == 'B':
        return 2
    if row['CORE Ranking'] == 'C':
        return 3
    return 4

df_conferences['int_core_ranking'] = df_conferences.apply (lambda row: core_ranking_to_int(row), axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [345]:
from scipy.stats import spearmanr

cor, pval = spearmanr(a=df_journals['SSTOTAL'], b=df_journals['int_quartile'], alternative='less')

print(f'cor: {cor}\npval: {pval}')

cor: -0.15410591049532635
pval: 0.05316730394643939


In [346]:
from scipy.stats import spearmanr

cor, pval = spearmanr(a=df_journals['SRTOTAL'], b=df_journals['int_quartile'], alternative='less')

print(f'cor: {cor}\npval: {pval}')

cor: -0.177406007878982
pval: 0.031249812214979918


In [347]:
from scipy.stats import spearmanr

cor, pval = spearmanr(a=df_conferences['SSTOTAL'], b=df_conferences['int_core_ranking'], alternative='less')

print(f'cor: {cor}\npval: {pval}')

cor: -0.3770284813643129
pval: 0.011703861911782952


In [348]:
from scipy.stats import spearmanr

cor, pval = spearmanr(a=df_conferences['SRTOTAL'], b=df_conferences['int_core_ranking'], alternative='less')

print(f'cor: {cor}\npval: {pval}')

cor: -0.4044414647345252
pval: 0.007211456781739799


In [349]:
print(df_quality['Publication Source'])

0        Workshop
1      Conference
2         Journal
3      Conference
4      Conference
          ...    
145       Journal
146       Journal
147    Conference
148       Journal
149       Journal
Name: Publication Source, Length: 150, dtype: object


## Feature Analysis

### Publication year

In [350]:
df = df_quality.groupby(by=['Publication Year']).size().reset_index(name='# of studies')

fig = px.bar(data_frame = df, x='Publication Year', y='# of studies', text_auto=True)
# fig.write_image('validation_journal.png')
fig.show()

### ISIC Section

In [351]:
def isic_section_to_name(row):
    if row['ISIC Section'] == 'A':
        return 'Agriculture, forestry and fishing'
    if row['ISIC Section'] == 'B':
        return 'Mining and quarrying'
    if row['ISIC Section'] == 'C':
        return 'Manufacturing'
    if row['ISIC Section'] == 'D':
        return 'Electricity, gas, steam and air conditioning supply'
    if row['ISIC Section'] == 'E':
        return 'Water supply; sewerage, waste management and remediation activities'
    if row['ISIC Section'] == 'F':
        return 'Construction'
    if row['ISIC Section'] == 'G':
        return 'Wholesale and retail trade; repair of motor vehicles and motorcycles'
    if row['ISIC Section'] == 'H':
        return 'Transportation and storage'
    if row['ISIC Section'] == 'I':
        return 'Accommodation and food service activities'
    if row['ISIC Section'] == 'J':
        return 'Information and communication'
    if row['ISIC Section'] == 'K':
        return 'Financial and insurance activities'
    if row['ISIC Section'] == 'L':
        return 'Real estate activities'
    if row['ISIC Section'] == 'M':
        return 'Professional, scientific and technical activities'
    if row['ISIC Section'] == 'N':
        return 'Administrative and support service activities'
    if row['ISIC Section'] == 'O':
        return 'Public administration and defence; compulsory social security'
    if row['ISIC Section'] == 'P':
        return 'Education'
    if row['ISIC Section'] == 'Q':
        return 'Human health and social work activities'
    if row['ISIC Section'] == 'R':
        return 'Arts, entertainment and recreation'
    if row['ISIC Section'] == 'S':
        return 'Other service activities'
    if row['ISIC Section'] == 'T':
        return 'Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use'
    if row['ISIC Section'] == 'U':
        return 'Activities of extraterritorial organizations and bodies'
    return 'Not Assigned'

df_quality['ISIC Section Name'] = df_quality.apply (lambda row: isic_section_to_name(row), axis=1)

In [352]:
series = df_quality['ISIC Section Name'].value_counts()

fig = px.pie(series, values=series.values, names=series.index)
# fig.write_image('validation_journal.png')
fig.show()

print(df)

    Publication Year  # of studies
0               2011             3
1               2012             6
2               2013             3
3               2014             8
4               2015             6
5               2016            11
6               2017            11
7               2018             9
8               2019            20
9               2020            39
10              2021            29
11              2022             5


In [353]:
df_manufacturing = df_quality[df_quality['ISIC Section Name'] == 'Manufacturing']

def isic_division_to_name(row):
    if row['ISIC Division'] == 10:
        return 'Manufacture of food products'
    if row['ISIC Division'] == 11:
        return 'Manufacture of beverages'
    if row['ISIC Division'] == 12:
        return 'Manufacture of tobacco products'
    if row['ISIC Division'] == 13:
        return 'Manufacture of textiles'
    if row['ISIC Division'] == 14:
        return 'Manufacture of wearing apparel'
    if row['ISIC Division'] == 15:
        return 'Manufacture of leather and related products'
    if row['ISIC Division'] == 16:
        return 'Manufacture of wood and of products of wood and cork, except furniture; manufacture of articles of straw and plaiting materials'
    if row['ISIC Division'] == 17:
        return 'Manufacture of paper and paper products'
    if row['ISIC Division'] == 18:
        return 'Printing and reproduction of recorded media'
    if row['ISIC Division'] == 19:
        return 'Manufacture of coke and refined petroleum products'
    if row['ISIC Division'] == 20:
        return 'Manufacture of chemicals and chemical products'
    if row['ISIC Division'] == 21:
        return 'Manufacture of pharmaceuticals, medicinal chemical and botanical products'
    if row['ISIC Division'] == 22:
        return 'Manufacture of rubber and plastics products'
    if row['ISIC Division'] == 23:
        return 'Manufacture of other non-metallic mineral products'
    if row['ISIC Division'] == 24:
        return 'Manufacture of basic metals'
    if row['ISIC Division'] == 25:
        return 'Manufacture of fabricated metal products, except machinery and equipment'
    if row['ISIC Division'] == 26:
        return 'Manufacture of computer, electronic and optical products'
    if row['ISIC Division'] == 27:
        return 'Manufacture of electrical equipment'
    if row['ISIC Division'] == 28:
        return 'Manufacture of machinery and equipment n.e.c.'
    if row['ISIC Division'] == 29:
        return 'Manufacture of motor vehicles, trailers and semi-trailers'
    if row['ISIC Division'] == 30:
        return 'Manufacture of other transport equipment'
    if row['ISIC Division'] == 31:
        return 'Manufacture of furniture'
    if row['ISIC Division'] == 32:
        return 'Other manufacturing'
    if row['ISIC Division'] == 33:
        return 'Repair and installation of machinery and equipment'
    return 'Not Assigned'

df_manufacturing['ISIC Division Name'] = df_manufacturing.apply (lambda row: isic_division_to_name(row), axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [354]:
series = df_manufacturing['ISIC Division Name'].value_counts()

fig = px.pie(series, values=series.values, names=series.index)
# fig.write_image('validation_journal.png')
fig.show()

print(df)

    Publication Year  # of studies
0               2011             3
1               2012             6
2               2013             3
3               2014             8
4               2015             6
5               2016            11
6               2017            11
7               2018             9
8               2019            20
9               2020            39
10              2021            29
11              2022             5


In [355]:
df_repair = df_manufacturing[df_manufacturing['ISIC Division Name'] == 'Repair and installation of machinery and equipment']

def isic_group_to_name(row):
    if row['ISIC Group'] == 331:
        return 'Repair of fabricated metal products, machinery and equipment'
    if row['ISIC Group'] == 332:
        return 'Installation of industrial machinery and equipment'
    return 'Not Assigned'

df_repair['ISIC Group Name'] = df_repair.apply (lambda row: isic_group_to_name(row), axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [356]:
series = df_repair['ISIC Group Name'].value_counts()

fig = px.pie(series, values=series.values, names=series.index)
# fig.write_image('validation_journal.png')
fig.show()

print(df)

    Publication Year  # of studies
0               2011             3
1               2012             6
2               2013             3
3               2014             8
4               2015             6
5               2016            11
6               2017            11
7               2018             9
8               2019            20
9               2020            39
10              2021            29
11              2022             5


In [357]:
df_repairfabricated = df_repair[df_repair['ISIC Group Name'] == 'Repair of fabricated metal products, machinery and equipment']

def isic_class_to_name(row):
    if row['ISIC Class'] == 3311:
        return 'Repair of fabricated metal products'
    if row['ISIC Class'] == 3312:
        return 'Repair of machinery'
    if row['ISIC Class'] == 3313:
        return 'Repair of electronic and optical equipment'
    if row['ISIC Class'] == 3314:
        return 'Repair of electrical equipment'
    if row['ISIC Class'] == 3315:
        return 'Repair of transport equipment, except motor vehicles'
    if row['ISIC Class'] == 3319:
        return 'Repair of other equipment'
    return 'Not Assigned'

df_repairfabricated['ISIC Class Name'] = df_repairfabricated.apply (lambda row: isic_class_to_name(row), axis=1)

In [358]:
series = df_repairfabricated['ISIC Class Name'].value_counts()

fig = px.pie(series, values=series.values, names=series.index)
# fig.write_image('validation_journal.png')
fig.show()

print(df)

    Publication Year  # of studies
0               2011             3
1               2012             6
2               2013             3
3               2014             8
4               2015             6
5               2016            11
6               2017            11
7               2018             9
8               2019            20
9               2020            39
10              2021            29
11              2022             5
