In [2]:
import pandas as pd

In [3]:
train_df_raw = pd.read_json('data/train.json')
train_df_raw.head(5)

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue
0,0b341b6938308a6d5f47edf490f6e46eae3835fa,Detecting linguistic idiosyncratic interests i...,3188285,Masoud Rouhizadeh,Children with autism spectrum disorder often e...,2014,CLPsych@ACL
1,c682727ee058aadbe9dbf838dcb036322818f588,Bigrams and BiLSTMs Two Neural Networks for Se...,2782720,Yuri Bizzoni,We present and compare two alternative deep ne...,2018,Fig-Lang@NAACL-HLT
2,0f9b5b32229a7245e43754430c0c88f8e7f0d8af,In Factuality: Efficient Integration of Releva...,144748442,Peter Vickers,Visual Question Answering (VQA) methods aim at...,2021,ACL
3,7e8b4cfdc03b59ece2d6b33a217f0abd47f708d9,Variational Graph Autoencoding as Cheap Superv...,46331602,Irene Li,Coreference resolution over semantic graphs li...,2022,ACL
4,07588dd5d0252c7abc99b3834a81bf23741ead4b,LIMIT-BERT : Linguistics Informed Multi-Task BERT,30887404,Junru Zhou,"In this paper, we present Linguistics Informed...",2019,FINDINGS


### A. Venue - year relation

code sources:
- https://stackoverflow.com/questions/35268817/unique-combinations-of-values-in-selected-columns-in-pandas-data-frame-and-count

In [4]:
df_year_venue = train_df_raw.groupby(['venue', 'year']).size().reset_index().rename(columns={0:'count'})
df_year_venue.head(15) 
print("A number of papers in different years of the 21st century have no venue assigned")

A number of papers in different years of the 21st century have no venue assigned


In [5]:
df_year_venue[15:20]

Unnamed: 0,venue,year,count
15,*SEMEVAL,2017,32
16,*SEMEVAL,2018,87
17,*SEMEVAL,2019,95
18,2015 7th International Conference on Informati...,2015,1
19,ACL,1979,7


In [6]:
len(train_df_raw['venue'].value_counts())
print("362 different unique venues")

362 different unique venues


In [7]:
# idea: create dictionary of years and venues which took place in such years (or the other way around) 

In [8]:
venue_list = list(train_df_raw['venue'].unique())


venue_id_dict = {}
venue_name_dict = {}
venue_year_dict = {}

for i in venue_list:
    df_moment = train_df_raw[train_df_raw['venue'] == i]
    venue_id_dict[f'{i}'] = [e for e in df_moment['authorId'].unique()]
    venue_name_dict[f'{i}'] = [e for e in df_moment['authorName'].unique()]
    venue_year_dict[f'{i}'] = [e for e in df_moment['year'].unique()]   

In [9]:
df_moment = train_df_raw[train_df_raw['venue'] == 'ACL']

In [10]:
venue_name_dict
venue_id_dict
venue_year_dict

{'CLPsych@ACL': [2014, 2017],
 'Fig-Lang@NAACL-HLT': [2018],
 'ACL': [2021,
  2022,
  1995,
  2020,
  1993,
  2019,
  1994,
  2017,
  2016,
  2018,
  2005,
  1981,
  2002,
  1999,
  1984,
  2015,
  1983,
  2006,
  2014,
  1985,
  2003,
  1990,
  1998,
  2004,
  2001,
  1997,
  1992,
  1987,
  1980,
  1996,
  1982,
  1979,
  1986,
  2000,
  1988,
  1989,
  1991,
  2010],
 'FINDINGS': [2019, 2021, 2020, 2022],
 'EMNLP': [2019,
  2021,
  2020,
  2018,
  2016,
  2014,
  2015,
  2017,
  2002,
  2000,
  2013,
  2022],
 'WMT@ACL': [2014],
 'Rep4NLP@ACL': [2017, 2018, 2016],
 'NLP4IF': [2021],
 'NAACL': [2018, 2020, 2016, 2021, 2022, 2019, 2015, 2017, 2014],
 'Computational Linguistics': [2008,
  2019,
  2018,
  2005,
  2014,
  2007,
  2015,
  2006,
  2021,
  2022,
  2020,
  2001,
  2002,
  2010,
  2012,
  2013,
  2009,
  2016,
  2011,
  2000,
  2004,
  2017,
  2003],
 'Proceedings of the Second Workshop on Storytelling': [2019],
 'CL': [2021,
  2010,
  2012,
  2015,
  2011,
  2017,
  2007,
  

In [11]:
venue_name_dict

{'CLPsych@ACL': ['Masoud Rouhizadeh',
  'H. A. Schwartz',
  'Yangfeng Ji',
  'E. Morley',
  'Kathleen C. Fraser',
  'Micah Iserman',
  'Melissa Roemmele',
  'Glen A. Coppersmith'],
 'Fig-Lang@NAACL-HLT': ['Yuri Bizzoni',
  'Chuhan Wu',
  'Malay Pramanick',
  'Kevin Stowe',
  'Michael Flor',
  'A. Mykowiecka',
  'Omnia Zayed',
  'Egon W. Stemle'],
 'ACL': ['Peter Vickers',
  'Irene Li',
  'Dekai Wu',
  'Valentin Hofmann',
  'D. Stallard',
  'Minghao Hu',
  'C. Samuelsson',
  'Sanja Štajner',
  'Sewon Min',
  'Adithya Renduchintala',
  'Hao Zheng',
  'Giovanni Campagna',
  'E. Muszyńska',
  'Dominik Schlechtweg',
  'Catherine Finegan-Dollak',
  'Zhiquan Ye',
  'Alexis Conneau',
  'Feng Nan',
  'Omid Bakhshandeh',
  'Ying Li',
  'Taku Kudo',
  'R. Bobrow',
  'Mikel Artetxe',
  'Masaru Isonuma',
  'Yuanhe Tian',
  'Tianyu Liu',
  'Danish Pruthi',
  'Paola Merlo',
  'Vikas Yadav',
  'Jierui Li',
  'S. Kurohashi',
  'V. Hatzivassiloglou',
  'Xinyu Wang',
  'A. Obamuyide',
  'Junru Zhou',
  '

In [12]:
venue_id_dict

{'CLPsych@ACL': [3188285,
  145035129,
  40608686,
  145272998,
  2022276,
  46198012,
  3316824,
  4366352],
 'Fig-Lang@NAACL-HLT': [2782720,
  15161448,
  40342933,
  31600242,
  144199707,
  1716891,
  3203390,
  2991702],
 'ACL': [144748442,
  46331602,
  2390150,
  1667898858,
  145482266,
  8367832,
  2403128,
  1952894,
  48872685,
  3286437,
  2115239329,
  1382048113,
  3449419,
  3449121,
  1403432982,
  2114132405,
  2480903,
  144647318,
  1814319,
  2154293431,
  1765329,
  2189985,
  2347956,
  24905917,
  151472012,
  1701889,
  7880098,
  143939590,
  143618944,
  9073069,
  1795664,
  1799688,
  47120498,
  22313325,
  30887404,
  28130078,
  1744669,
  3261470,
  32301760,
  1693525,
  2182290,
  1714374,
  2621022,
  9215251,
  2874038,
  2676559,
  3285152,
  11178191,
  2118801449,
  1390037280,
  1818919,
  41019330,
  35660331,
  2626599,
  48162772,
  2715551,
  2110406322,
  34837371,
  49506999,
  144481186,
  2616463,
  144653901,
  14487640,
  143643017,
  2

In [13]:
#See in which years the authors published papers
train_df_raw['year'] = train_df_raw['year'].astype(str)
groupby_authorId = train_df_raw.groupby('authorId').agg(',' .join).reset_index()
authorId_year = groupby_authorId[['authorId', 'year']]
authorId_year

Unnamed: 0,authorId,year
0,1678591,2004200419932019
1,1678747,201620002013
2,1678833,20172019
3,1679133,201520061998
4,1680075,2016
...,...,...
5620,2165302640,2022
5621,2166306973,2022
5622,2168628520,2022
5623,2169453878,2019


In [14]:
#See which authors published at which venues
authorId_venue = groupby_authorId[['authorId', 'venue']]
authorId_venue

Unnamed: 0,authorId,venue
0,1678591,"ACL,Computational Linguistics,ACL,NAACL"
1,1678747,"ACL,INLG,TACL"
2,1678833,"BSNLP@EACL,BSNLP@ACL"
3,1679133,"ACL,ACL,COLING-ACL"
4,1680075,SocialNLP@EMNLP
...,...,...
5620,2165302640,ACL
5621,2166306973,NAACL-HLT
5622,2168628520,SEMEVAL
5623,2169453878,ACL


In [15]:
#See in which years papers were published at which venues
groupby_venue = train_df_raw.groupby('venue').agg(',' .join).reset_index()
venue_year = groupby_venue[['venue', 'year']]
venue_year

  groupby_venue = train_df_raw.groupby('venue').agg(',' .join).reset_index()


Unnamed: 0,venue,year
0,,"2015,2015,2014,2015,2017,2018,2019,2017,2018,2..."
1,*SEM,201720172017201720172017201720172017
2,*SEM@NAACL-HLT,201520152015
3,*SEMEVAL,"2015,2016,2018,2015,2014,2015,2015,2014,2015,2..."
4,2015 7th International Conference on Informati...,2015
...,...,...
357,WNUT,"2020,2021,2021,2021,2021,2021,2020,2020,2021,2..."
358,WOAH,2021202120212021202120212021202020212021
359,WSSANLP@COLING,2014201420142014
360,WaC@EACL,20142014201420142014


In [16]:
#Show the years when the venue has published papers
#With removal of duplicates
venueyear = train_df_raw[['venue', 'year']]
unique_venue_year = venueyear.drop_duplicates()
unique_venue_year

groupby_venue = unique_venue_year.groupby('venue').agg(',' .join).reset_index()
unique_venueyear = groupby_venue[['venue', 'year']]
unique_venueyear

Unnamed: 0,venue,year
0,,2015201420172018201920162000200320112010
1,*SEM,2017
2,*SEM@NAACL-HLT,2015
3,*SEMEVAL,201520162018201420192017
4,2015 7th International Conference on Informati...,2015
...,...,...
357,WNUT,20202021
358,WOAH,20212020
359,WSSANLP@COLING,2014
360,WaC@EACL,2014
