In [None]:
# Optional path setup, left disabled: uncomment to add the current directory to
# sys.path and change into it.
# import sys
# sys.path.append(".")
# %cd "."
# (Re)load the autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to local modules are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

## Parsing CORE data & FoR codes

In [None]:
import pandas as pd
import concurrent.futures
from conference_data_bot import parse_core_conf_data

# Highest page number handed to parse_core_conf_data (pages 1..total_pages).
total_pages = 20

# Run parse_core_conf_data once per page number on a thread pool.
# executor.map returns the results in the same order as the page numbers.
page_numbers = range(1, total_pages + 1)
with concurrent.futures.ThreadPoolExecutor() as executor:
    core_conf_data_list = list(executor.map(parse_core_conf_data, page_numbers))

# Stack the per-page results, order the rows by title, then renumber from 0.
core_conf_data = pd.concat(core_conf_data_list)
core_conf_data = core_conf_data.sort_values(by='Title')
core_conf_data = core_conf_data.reset_index(drop=True)

In [50]:
# Upper-case both text columns in place; 'Acronym' is used as a join key later on.
for text_column in ('Acronym', 'Title'):
    core_conf_data[text_column] = core_conf_data[text_column].str.upper()

In [6]:
# Sanity check: (rows, columns) of the combined CORE table.
print(core_conf_data.shape)

(956, 9)


In [8]:
from conference_data_bot import parse_core_for_data

# parse_core_for_data returns two objects, unpacked here as the level-1 and
# level-2 FoR code tables (presumably code/name pairs -- confirm in conference_data_bot).
for_code_level1_data, for_code_level2_data = parse_core_for_data()

In [51]:
# Build one lookup per FoR table. dict() over the records needs exactly two
# columns per table (presumably FoR code -> field name -- confirm upstream).
level1_lookup = dict(for_code_level1_data.to_records(index=False))
level2_lookup = dict(for_code_level2_data.to_records(index=False))

# Level-2 entries are merged last, so they win if a key appears in both tables.
for_code_lookup = {**level1_lookup, **level2_lookup}

# Keys missing from the lookup become NaN in "Field Name".
core_conf_data["Field Name"] = core_conf_data["Primary FoR"].map(for_code_lookup)


In [26]:
# Shape check after the "Field Name" column has been added.
print(core_conf_data.shape)

(956, 10)


In [122]:
# Whole-value renames applied to the CORE 'Acronym' column (old -> new).
# Presumably chosen so CORE acronyms line up with the GGS spelling -- confirm.
core_acronym_change = {
    "HCI": "BCSHCI",
    "FST&TCS": "FSTTCS",
    "IEEE CCNC": "CCNC",
    "IEEE HPCS": "HPCS",
    "IEEE ICIA": "ICIA",
    "IEEE ADPRL": "ADPRL",
    "IEEE ALIFE": "ALIFE",
    "SP": "S&P",
    "IFIP SEC": "SEC",
    "USENIX-SECURITY": "SECURITY",
    "CT-RSA": "RSA",
    "EVOCOP": "ECCO",
}

# Series.replace with a dict swaps only cells equal to a key; others are untouched.
core_conf_data["Acronym"] = core_conf_data["Acronym"].replace(core_acronym_change)

In [124]:
# Persist the CORE table in two formats; both paths are relative to the
# current working directory and the row index is left out of the Excel file.
core_conf_data.to_excel("core_conf_data.xlsx", index=False)
core_conf_data.to_pickle("core_conf_data.pickle")

## Parsing GGS data

In [108]:
from conference_data_bot import parse_ggs_data

# Load the GGS table via the project helper (no arguments are passed).
ggs_data = parse_ggs_data()

# Sanity check: (rows, columns) of the GGS table.
print(ggs_data.shape)

(2396, 42)


In [117]:
import numpy as np

# Whole-value renames applied to the GGS 'Acronym' column (old -> new).
# Note that two keys ('DIMEA' and 'ACE-AUS') both map to 'ACE'.
ggs_acronym_change = {
    "DIMEA": "ACE",
    "ASP-DAC": "ASPDAC",
    "ACE-AUS": "ACE",
    "BCS-HCI": "BCSHCI",
    "BIG DATA": "BIGDATA",
    "ICCI*CC": "ICCI",
    "CHPSN": "ICCHP",
    "SEQAPP": "SETA",
    "CADE": "IJCAR",
    "COIN": "COINE",
    "I3D": "I3DG",
    "CSEE&T": "CSEET",
}

# Hand-assign the acronym for this one title first, then apply the bulk renames.
is_prague_stringology = ggs_data["Title"] == "PRAGUE STRINGOLOGY CONFERENCE"
ggs_data.loc[is_prague_stringology, "Acronym"] = "PSC"
ggs_data["Acronym"] = ggs_data["Acronym"].replace(ggs_acronym_change)

In [118]:
# Shape check, then persist the GGS table in two formats (paths are relative to
# the current working directory; the row index is left out of the Excel file).
print(ggs_data.shape)
ggs_data.to_excel("ggs_data.xlsx", index=False)
ggs_data.to_pickle("ggs_data.pickle")

(2396, 42)


In [125]:
# CORE acronyms that have no identical acronym in the GGS table.
core_acronyms = set(core_conf_data["Acronym"].values)
ggs_acronyms = set(ggs_data["Acronym"].values)
core_acronyms.difference(ggs_acronyms)

{'CADE',
 'CSEET',
 'CT-RSA',
 'ECML PKDD',
 'EUROPAR',
 'EVOCOP',
 'FSE',
 'HOTCHIPS (HCS)',
 'I3DG',
 'ICSME',
 'IH&MMSEC',
 'ITNAC',
 'KES AMSTA',
 'MODSIM',
 'PETRI NETS',
 'SEMANTICS',
 'VISSOFT',
 'WI'}

In [135]:
# NOTE(review): a column width of 1 looks intended to show titles untruncated
# (the rendered output has them in full) -- confirm for the installed pandas.
pd.set_option('max_colwidth', 1)

# Acronyms copied by hand from the CORE-minus-GGS set difference.
core_only_acronyms = [
    'CADE',
    'CSEET',
    'CT-RSA',
    'ECML PKDD',
    'EUROPAR',
    'EVOCOP',
    'FSE',
    'HOTCHIPS (HCS)',
    'I3DG',
    'ICSME',
    'IH&MMSEC',
    'ITNAC',
    'KES AMSTA',
    'MODSIM',
    'PETRI NETS',
    'SEMANTICS',
    'VISSOFT',
    'WI',
]

# Disabled follow-up kept for reference:
#   .merge(ggs_data[['Title', 'Acronym']], how='inner', on='Title')[['Title', 'Acronym']]
#   .to_dict(orient='records')

# Show title and acronym for the CORE rows whose acronym is in the list.
in_core_only = core_conf_data.Acronym.isin(core_only_acronyms)
core_conf_data.loc[in_core_only, ['Title', 'Acronym']]


Unnamed: 0,Title,Acronym
49,ACM-SIGRAPH INTERACTIVE 3D GRAPHICS AND GAMES,I3DG
151,"CONFERENCE ON SOFTWARE ENGINEERING EDUCATION AND TRAINING (PREVIOUSLY CONFERENCE IS SOFTWARE ENGINEERING EDUCATION, CSEE, CHANGED IN 1997)",CSEET
156,CRYPTOGRAPHERS TRACK AT RSA CONFERENCE,CT-RSA
193,EUROPEAN CONFERENCE ON EVOLUTIONARY COMPUTATION IN COMBINATORIAL OPTIMISATION,EVOCOP
196,EUROPEAN CONFERENCE ON MACHINE LEARNING AND PRINCIPLES AND PRACTICE OF KNOWLEDGE DISCOVERY IN DATABASE (PKDD AND ECML COMBINED FROM 2008),ECML PKDD
301,"IEEE INTERNATIONAL CONFERENCE ON SOFTWARE MAINTENANCE AND EVOLUTION (PRIOR TO 2014 WAS ICSM, IEEE INTERNATIONAL CONFERENCE ON SOFTWARE MAINTENANCE)",ICSME
329,IEEE INTERNATIONAL WORKING CONFERENCE ON SOFTWARE VISUALISATION,VISSOFT
367,IEEE/WIC/ACM INTERNATIONAL CONFERENCE ON WEB INTELLIGENCE (PREVIOUSLY JOINT WITH INTELLIGENT AGENT TECHNOLOGY WI-IAT),WI
390,INFORMATION HIDING AND MULTIMEDIA SECURITY WORKSHOP,IH&MMSEC
455,INTERNATIONAL CONFERENCE ON AUTOMATED DEDUCTION,CADE


In [57]:
# CORE columns to attach to the GGS table; 'Acronym' is the join key.
core_columns = ['Acronym', 'Source', 'Rank', 'DBLP', 'Primary FoR', 'Average Rating', 'Field Name']

# Left join: every GGS row is kept, and rows without a CORE match get NaN in the
# CORE columns. An acronym that occurs several times in CORE repeats the GGS row.
pd.merge(ggs_data, core_conf_data[core_columns], how='left', on='Acronym')

Unnamed: 0,Title,Acronym,GGS Class,GGS Rating,Qualified Classes,Collected Classes,All Qualified Classes,MA-Max. Field Rating,MA-Best Rank Field Rating,MA-Best Class Field Rating,...,LiveSHINE-All Citations,CORE-Best Class,CORE-All Classes,Num. of Input Records,Source,Rank,DBLP,Primary FoR,Average Rating,Field Name
0,"INTERNATIONAL CONFERENCE ON 3D IMAGING, MODELI...",3DIMPVT,Work in Progress,Work in Progress,MA:B-,B-,MA:[C|A],27.0,1230.0,C,...,,,,1,,,,,,
1,"INTERNATIONAL SYMPOSIUM ON 3D DATA PROCESSING,...",3DPVT,Work in Progress,Work in Progress,MA:B-,B-,MA:[C|A],41.0,654.0,C,...,,,,1,,,,,,
2,"3DTV-CONFERENCE: THE TRUE VISION - CAPTURE, TR...",3DTV-CON,Work in Progress,Work in Progress,MA:C,C,MA:[C|C],29.0,1098.0,C,...,,,,1,,,,,,
3,IEEE SYMPOSIUM ON 3D USER INTERFACES,3DUI,Work in Progress,Work in Progress,"LiveSHINE:B, MA:C","B, C","LiveSHINE:[B|C], MA:[C|B]",39.0,706.0,C,...,3808,,,2,,,,,,
4,INTERNATIONAL CONFERENCE ON 3D VISION,3DV,Work in Progress,Work in Progress,MA:A-,A-,MA:[B|A],46.0,540.0,B,...,,,,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2403,SYMPOSIUM ON CLOUD COMPUTING,,Work in Progress,Work in Progress,MA:B,B,MA:[B|B-],71.0,256.0,B,...,,,,1,,,,,,
2404,TRANS. COMPUTATIONAL SCIENCE,,Work in Progress,Work in Progress,MA:C,C,MA:[C|B-],33.0,918.0,C,...,,,,1,,,,,,
2405,VISUALIZATION IN SCIENCE AND EDUCATION,,Work in Progress,Work in Progress,CORE:NC,NC,CORE:[NC],,,,...,,NC,NC,1,,,,,,
2406,WEB INFORMATION SYSTEMS AND APPLICATIONS CONFE...,,Work in Progress,Work in Progress,LiveSHINE:B-,B-,LiveSHINE:[B-|C],,,,...,2768,,,1,,,,,,


## Parsing WikiCFP data

In [None]:
import string
from itertools import chain
from conference_data_bot import parse_wikicfp_programs_by_index

# Call parse_wikicfp_programs_by_index once per letter A-Z on a thread pool;
# executor.map keeps the results in alphabetical order.
with concurrent.futures.ThreadPoolExecutor() as executor:
    wikicfp_program_list = list(
        executor.map(parse_wikicfp_programs_by_index, string.ascii_uppercase)
    )

# Flatten the per-letter results into a single column of raw link strings.
raw_program_links = pd.DataFrame(
    {"program_link": list(chain.from_iterable(wikicfp_program_list))}
).program_link

# Drop every "s=" / "f=" occurrence (regex replace, so matches anywhere in the string).
stripped_links = raw_program_links.replace({"s=": "", "f=": ""}, regex=True)

# Split on "&" at most three times (up to four pieces). Only the first three
# pieces are renamed; a fourth piece, if present, stays in a column labelled 3.
link_parts = stripped_links.str.split("&", n=3, expand=True)
wikicfp_program_data = link_parts.rename(
    columns={0: "program_link", 1: "acronym", 2: "program_name"}
)

In [None]:
# Inner join: keep only WikiCFP programs whose 'acronym' exactly equals a GGS
# 'Acronym' (the comparison is case-sensitive).
wikicfp_ggs = pd.merge(
    wikicfp_program_data,
    ggs_data,
    how="inner",
    left_on="acronym",
    right_on="Acronym",
)

In [None]:
# Shape check, then persist the WikiCFP/GGS join in two formats (paths are
# relative to the current working directory; the row index is left out of the Excel file).
print(wikicfp_ggs.shape)
wikicfp_ggs.to_excel("wikicfp_ggs.xlsx", index=False)
wikicfp_ggs.to_pickle("wikicfp_ggs.pickle")

In [None]:
# Independent copy of the three identifying columns, so columns added to it
# later do not touch wikicfp_ggs.
event_detail_columns = ['program_link', 'Title', 'Acronym']
wikicfp_ggs_event_detail = wikicfp_ggs[event_detail_columns].copy()

In [None]:
from tqdm import tqdm
from conference_data_bot import parse_wikicfp_program_events, parse_wikicfp_event_detail

# For each program: list its events, parse every event on a thread pool, and
# store the parsed details as a JSON string in the 'all_event_detail' column.
row_count = len(wikicfp_ggs_event_detail)
progress = tqdm(wikicfp_ggs_event_detail.iterrows(),
                total=row_count,
                desc='Parsing event detail: ')

for idx, row in progress:
    try:
        program_event_list = parse_wikicfp_program_events(row.program_link)

        # A fresh pool per program; list() drains the map so all results are
        # available once the pool has shut down.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            wikicfp_event_detail_list = list(
                executor.map(parse_wikicfp_event_detail, program_event_list)
            )

        all_events_json = pd.DataFrame(wikicfp_event_detail_list).to_json(orient='records')
        wikicfp_ggs_event_detail.loc[idx, 'all_event_detail'] = all_events_json
    except Exception as e:
        # Best effort: report the failing row and carry on. Its
        # 'all_event_detail' cell is left unset.
        print(idx, row.program_link, e)

In [None]:
# Shape check, then persist the per-program event details in two formats (paths
# are relative to the current working directory; the row index is left out of the Excel file).
print(wikicfp_ggs_event_detail.shape)
wikicfp_ggs_event_detail.to_excel("wikicfp_ggs_event_detail.xlsx", index=False)
wikicfp_ggs_event_detail.to_pickle("wikicfp_ggs_event_detail.pickle")

In [None]:
from io import StringIO

# Spot check: decode the stored JSON for the first row whose Acronym is 'ISDA'.
# .values[0] raises IndexError if no such row exists.
isda_events_json = (
    wikicfp_ggs_event_detail
    .query("Acronym=='ISDA'")
    .all_event_detail
    .values[0]
)

# Passing literal JSON text straight to read_json is deprecated (pandas 2.1+);
# wrapping it in a file-like StringIO works on old and new versions alike.
pd.read_json(StringIO(isda_events_json))

In [None]:
# Attach the event-detail column back onto the full WikiCFP/GGS table.
# Left join on the three identifying columns keeps every wikicfp_ggs row.
join_keys = ['Title', 'Acronym', 'program_link']
wikicfp_ggs_event_detail_final = pd.merge(
    wikicfp_ggs,
    wikicfp_ggs_event_detail,
    how='left',
    on=join_keys,
)

In [None]:
# CORE acronyms (upper-cased) that never appear as a WikiCFP acronym in the final table.
core_acronyms_upper = set(core_conf_data.Acronym.str.upper())
matched_wikicfp_acronyms = set(wikicfp_ggs_event_detail_final.acronym.values)
print(core_acronyms_upper - matched_wikicfp_acronyms)

# Disabled reverse check (count of WikiCFP acronyms absent from CORE):
# print(len(set(wikicfp_ggs_event_detail_final.acronym.values).difference(set(core_conf_data.Acronym.values))))

In [None]:
# Display the CORE table as the cell's output.
core_conf_data