In [None]:
# Optional path/working-directory setup, currently disabled.
# import sys
# sys.path.append(".")
# %cd "."
# (Re)load the autoreload extension and use mode 2, which re-imports modules
# before each cell runs so edits to local modules are picked up without a
# kernel restart.
%reload_ext autoreload
%autoreload 2

## Parsing CORE data & FoR codes

In [None]:
import pandas as pd
import concurrent.futures
from conference_data_bot import parse_core_conf_data

# Upper bound of the 1-based range handed to parse_core_conf_data
# (presumably CORE portal page numbers — confirm against the parser).
total_pages = 20

# Run parse_core_conf_data for every value concurrently; executor.map
# returns the results in input order.
with concurrent.futures.ThreadPoolExecutor() as executor:
    page_results = executor.map(parse_core_conf_data, range(1, total_pages + 1))
    core_conf_data_list = list(page_results)

# Stack the per-call frames into one table ordered by title with a fresh index.
core_conf_data = pd.concat(core_conf_data_list)
core_conf_data = core_conf_data.sort_values(by='Title').reset_index(drop=True)

In [None]:
# Upper-case the two text columns in place (the acronym is later used as a
# join key against the GGS table).
for text_column in ('Acronym', 'Title'):
    core_conf_data[text_column] = core_conf_data[text_column].str.upper()

In [None]:
# Sanity check: (rows, columns) of the combined CORE table.
print(core_conf_data.shape)

In [None]:
from conference_data_bot import parse_core_for_data

# parse_core_for_data() returns two tables, unpacked here as the level-1 and
# level-2 code lookups (names suggest Field-of-Research codes — TODO confirm).
for_code_level1_data, for_code_level2_data = parse_core_for_data()

In [None]:
# Build one code -> name lookup from both FoR tables. dict() over the records
# requires each table to have exactly two columns (key, value); level-2
# entries are applied last, so they win on duplicate codes.
for_code_to_field_name = {}
for for_code_table in (for_code_level1_data, for_code_level2_data):
    for_code_to_field_name.update(dict(for_code_table.to_records(index=False)))

# Codes missing from the lookup become NaN in "Field Name".
core_conf_data["Field Name"] = core_conf_data["Primary FoR"].map(for_code_to_field_name)


In [None]:
# Sanity check after adding "Field Name": row count should be unchanged,
# column count one higher.
print(core_conf_data.shape)

In [None]:
# Manual acronym overrides for the CORE table (current value -> replacement),
# presumably to line up with the spellings used by the other sources — verify.
core_acronym_change = {
    "HCI": "BCSHCI",
    "FST&TCS": "FSTTCS",
    "IEEE CCNC": "CCNC",
    "IEEE HPCS": "HPCS",
    "IEEE ICIA": "ICIA",
    "IEEE ADPRL": "ADPRL",
    "IEEE ALIFE": "ALIFE",
    "SP": "S&P",
    "IFIP SEC": "SEC",
    "USENIX-SECURITY": "SECURITY",
    "CT-RSA": "RSA",
    "EVOCOP": "ECCO",
}

# Whole-value replacement: only acronyms exactly equal to a key are changed.
core_conf_data["Acronym"] = core_conf_data["Acronym"].replace(to_replace=core_acronym_change)

In [None]:
# Persist the cleaned CORE table to the working directory: an Excel copy for
# manual inspection and a pickle that preserves dtypes for later reloads.
core_conf_data.to_excel("core_conf_data.xlsx", index=False)
core_conf_data.to_pickle("core_conf_data.pickle")

## Parsing GGS data

In [None]:
from conference_data_bot import parse_ggs_data

# Load the GGS table via the project parser; later cells rely on it having
# at least 'Title' and 'Acronym' columns.
ggs_data = parse_ggs_data()

# Sanity check: (rows, columns) of the GGS table.
print(ggs_data.shape)

In [None]:
import numpy as np

# Manual acronym overrides for the GGS table (current value -> replacement).
ggs_acronym_change = {
    "DIMEA": "ACE",
    "ASP-DAC": "ASPDAC",
    "ACE-AUS": "ACE",
    "BCS-HCI": "BCSHCI",
    "BIG DATA": "BIGDATA",
    "ICCI*CC": "ICCI",
    "CHPSN": "ICCHP",
    "SEQAPP": "SETA",
    "CADE": "IJCAR",
    "COIN": "COINE",
    "I3D": "I3DG",
    "CSEE&T": "CSEET",
}

# One conference is fixed by title rather than by acronym: set its acronym
# explicitly before the dictionary-based replacement runs.
is_prague_stringology = ggs_data["Title"] == "PRAGUE STRINGOLOGY CONFERENCE"
ggs_data.loc[is_prague_stringology, "Acronym"] = "PSC"

# Whole-value replacement: only acronyms exactly equal to a key are changed.
ggs_data["Acronym"] = ggs_data["Acronym"].replace(to_replace=ggs_acronym_change)

In [None]:
# Report the table size, then persist the cleaned GGS table to the working
# directory as both Excel (for inspection) and pickle (for reloads).
print(ggs_data.shape)
ggs_data.to_excel("ggs_data.xlsx", index=False)
ggs_data.to_pickle("ggs_data.pickle")

In [None]:
# Acronyms that appear in the CORE table but have no counterpart in GGS.
set(core_conf_data.Acronym.values).difference(ggs_data.Acronym.values)

In [None]:
# Show full cell text in rendered tables from here on. `None` is the
# documented "no limit" value; the previous value of 1 only avoided
# truncation as a side effect of falling below pandas' truncation threshold,
# and the abbreviated option key relied on pattern matching.
pd.set_option('display.max_colwidth', None)

# Acronyms to look up in the CORE table. This looks like a pasted snapshot of
# an earlier CORE-vs-GGS acronym difference — regenerate it if the inputs
# change.
acronyms_to_inspect = [
    'CADE',
    'CSEET',
    'CT-RSA',
    'ECML PKDD',
    'EUROPAR',
    'EVOCOP',
    'FSE',
    'HOTCHIPS (HCS)',
    'I3DG',
    'ICSME',
    'IH&MMSEC',
    'ITNAC',
    'KES AMSTA',
    'MODSIM',
    'PETRI NETS',
    'SEMANTICS',
    'VISSOFT',
    'WI',
]

# Title/acronym pairs of the CORE rows whose acronym is in the list above.
# To compare against GGS spellings, inner-merge this result with
# ggs_data[['Title', 'Acronym']] on 'Title'.
core_conf_data.loc[core_conf_data.Acronym.isin(acronyms_to_inspect), ['Title', 'Acronym']]


In [None]:
# Preview only (result is displayed, not stored): every GGS row, enriched
# with the CORE columns below wherever the acronym matches.
core_columns_to_attach = ['Acronym', 'Source', 'Rank', 'DBLP',
                          'Primary FoR', 'Average Rating', 'Field Name']
ggs_data.merge(core_conf_data[core_columns_to_attach], how='left', on='Acronym')

## Parsing WikiCFP data

In [None]:
import string
from itertools import chain
from conference_data_bot import parse_wikicfp_programs_by_index

# Call parse_wikicfp_programs_by_index once per letter A-Z, concurrently;
# executor.map keeps the results in alphabetical order.
with concurrent.futures.ThreadPoolExecutor() as executor:
    wikicfp_program_list = list(
        executor.map(parse_wikicfp_programs_by_index, string.ascii_uppercase)
    )

# Flatten the per-letter lists into a single column of raw links.
all_program_links = pd.DataFrame(
    {"program_link": list(chain.from_iterable(wikicfp_program_list))}
)

# Drop every "s=" / "f=" occurrence (regex replace, so anywhere in the string),
# then split on "&" into at most four pieces. Only the first three pieces get
# names; a fourth, if present, keeps its integer column label 3.
# NOTE(review): assumes links are shaped like "<link>&s=<acronym>&f=<name>" —
# confirm against the parser output.
stripped_links = all_program_links["program_link"].replace(
    {"s=": "", "f=": ""}, regex=True
)
link_parts = stripped_links.str.split("&", n=3, expand=True)
wikicfp_program_data = link_parts.rename(
    columns={0: "program_link", 1: "acronym", 2: "program_name"}
)

In [None]:
# Keep only WikiCFP programs whose acronym also exists in GGS. The key columns
# differ in case ("acronym" vs "Acronym"), so both survive in the result.
wikicfp_ggs = pd.merge(
    wikicfp_program_data,
    ggs_data,
    how='inner',
    left_on="acronym",
    right_on="Acronym",
)

In [None]:
# Report the size of the WikiCFP/GGS intersection, then persist it to the
# working directory as both Excel (for inspection) and pickle (for reloads).
print(wikicfp_ggs.shape)
wikicfp_ggs.to_excel("wikicfp_ggs.xlsx", index=False)
wikicfp_ggs.to_pickle("wikicfp_ggs.pickle")

In [None]:
# Independent copy of the identifying columns; the scraped event details are
# written into this frame so wikicfp_ggs itself stays untouched.
wikicfp_ggs_event_detail = wikicfp_ggs.loc[:, ['program_link', 'Title', 'Acronym']].copy()

In [None]:
from tqdm import tqdm
from conference_data_bot import parse_wikicfp_program_events, parse_wikicfp_event_detail

# For each program: list its events, fetch every event's detail concurrently,
# and store the details as a JSON string (list of records) in the
# 'all_event_detail' column. Failures are best-effort: the error is printed
# and that row's 'all_event_detail' is left unset.
for idx, row in tqdm(wikicfp_ggs_event_detail.iterrows(),
                     total=len(wikicfp_ggs_event_detail),
                     desc='Parsing event detail: '):
    try:
        program_event_list = parse_wikicfp_program_events(row['program_link'])

        with concurrent.futures.ThreadPoolExecutor() as executor:
            wikicfp_event_detail_list = list(
                executor.map(parse_wikicfp_event_detail, program_event_list)
            )

        events_json = pd.DataFrame(wikicfp_event_detail_list).to_json(orient='records')
        wikicfp_ggs_event_detail.loc[idx, 'all_event_detail'] = events_json
    except Exception as e:
        print(idx, row.program_link, e)

In [None]:
# Display the frame to eyeball the scraped 'all_event_detail' JSON per program.
wikicfp_ggs_event_detail

In [None]:
# Report the table size, then persist the scraped event details to the
# working directory as both Excel (for inspection) and pickle (for reloads).
print(wikicfp_ggs_event_detail.shape)
wikicfp_ggs_event_detail.to_excel("wikicfp_ggs_event_detail.xlsx", index=False)
wikicfp_ggs_event_detail.to_pickle("wikicfp_ggs_event_detail.pickle")

In [None]:
from io import StringIO

# Expand each program row into one row per event: the program's identifying
# columns are repeated next to every event decoded from its JSON payload.
# The per-program frames are collected in `all_dfs` for a single concat later.
all_dfs = []

for idx, row in wikicfp_ggs_event_detail.iterrows():
    program_info = pd.DataFrame([row[['program_link', 'Title', 'Acronym']]])

    # Rows whose scraping failed have no JSON string (NaN, or the column is
    # absent altogether); treat them like programs with zero events instead
    # of letting read_json raise on a non-string value.
    events_json = row.get('all_event_detail')
    if isinstance(events_json, str):
        # Wrap in StringIO: passing literal JSON text to read_json is
        # deprecated in recent pandas releases.
        events = pd.read_json(StringIO(events_json))
    else:
        events = pd.DataFrame()

    if len(events) == 0:
        # Keep the program in the output with a single all-empty event row.
        events = pd.DataFrame({'When': None,
                               'Where': None,
                               'Abstract Registration Due': None,
                               'Submission Deadline': None,
                               'Notification Due': None,
                               'Final Version Due': None,
                               'Link': None}, index=[0])

    # Repeat the program columns once per event so the two frames align
    # row-for-row on a fresh 0..n-1 index.
    program_info_repeated = pd.concat([program_info] * len(events), ignore_index=True)
    all_dfs.append(pd.concat([program_info_repeated, events], axis=1))

In [None]:
# Stack the per-program frames row-wise into one table with a fresh index.
all_events_df = pd.concat(all_dfs, ignore_index=True)

In [179]:
from datetime import datetime

# Attach the GGS/CORE rating columns to every event row and write the result
# to a timestamped Excel file (dump_<ddmmYYYY-HHMMSS>.xlsx) in the working
# directory.
event_key_columns = ['program_link', 'Title', 'Acronym']
rating_columns = ['GGS Class', 'GGS Rating',
                  'CORE-Best Class', 'CORE-All Classes']

events_with_ratings = all_events_df.merge(
    wikicfp_ggs[event_key_columns + rating_columns],
    on=event_key_columns,
    how='left',
)

# The timestamp is taken after the merge, right before writing.
dump_path = f"dump_{datetime.now().strftime('%d%m%Y-%H%M%S')}.xlsx"
events_with_ratings.to_excel(dump_path, index=False)

In [None]:
from io import StringIO

# Spot check: decode the stored event JSON of the first program whose acronym
# is 'ISDA' (raises IndexError if no such program exists).
isda_events_json = (wikicfp_ggs_event_detail
                    .query("Acronym=='ISDA'")
                    .all_event_detail
                    .values[0])

# Wrap in StringIO: passing literal JSON text to read_json is deprecated in
# recent pandas releases.
pd.read_json(StringIO(isda_events_json))

In [None]:
# Re-attach the scraped event JSON to the full WikiCFP/GGS table; all rows of
# wikicfp_ggs are kept, matched on the three identifying columns.
event_detail_keys = ['Title', 'Acronym', 'program_link']
wikicfp_ggs_event_detail_final = wikicfp_ggs.merge(
    wikicfp_ggs_event_detail,
    how='left',
    on=event_detail_keys,
)

In [None]:
# CORE acronyms (upper-cased) that have no matching WikiCFP program acronym
# in the final table.
core_acronyms = set(core_conf_data.Acronym.str.upper())
matched_wikicfp_acronyms = set(wikicfp_ggs_event_detail_final.acronym.values)
print(core_acronyms - matched_wikicfp_acronyms)

# print(len(set(wikicfp_ggs_event_detail_final.acronym.values).difference(set(core_conf_data.Acronym.values))))

In [None]:
# Display the final CORE table for a visual check.
core_conf_data