# Post-processing and Analysis of First Annotated Pass

In [2]:
import pandas as pd
import numpy as np

In [6]:
# Load the first-pass master table. Judging by the columns listed below, it holds
# bibliographic metadata followed by one `acm_*` hit-flag column per search query.
df = pd.read_csv('./query-modified-masters/master_imwut_modified_20240610-153437.csv')
# Show the column names so the available query columns can be referenced later.
df.columns

Index(['Unnamed: 0', 'id', 'type', 'authors', 'title', 'year', 'issue_date',
       'publisher', 'address', 'volume', 'number', 'url', 'doi', 'abstract',
       'journal', 'month', 'articleno', 'numpages', 'keywords',
       'acm_classification+model8_Fulltext', 'acm_training+set_Fulltext',
       'acm_pretrain8_Fulltext', 'acm_existing+dataset8_Fulltext',
       'acm_LLM_AbstractOnly', 'acm_neural+network_AbstractOnly',
       'acm_review_AbstractOnly', 'acm_gradient+boosting_Fulltext',
       'acm_dataset_TitleOnly', 'acm_external+dataset8_Fulltext',
       'acm_pre-train8_Fulltext', 'acm_we+train8_Fulltext',
       'acm_we+collect8_AbstractOnly', 'acm_hyperparameter+tuning_Fulltext',
       'acm_regression+model8_Fulltext', 'acm_machine+learning_AbstractOnly',
       'acm_survey_TitleOnly', 'acm_datasets+used_Fulltext',
       'acm_LSTM_Fulltext', 'acm_decision+tree_Fulltext',
       'acm_publicly+available+data8_Fulltext',
       'acm_benchmark+dataset8_Fulltext', 'acm_SVM_Fulltext

In [18]:
# Preliminary stats: how many rows each query column hit, and what share of
# all rows that is. Columns 0-18 are bibliographic metadata ('Unnamed: 0'
# through 'keywords'), so the query hit flags start at position 19.
summary_df = pd.DataFrame(
    {
        'Count': df.loc[:, df.columns[19:]].sum(axis=0),
        '%': df.loc[:, df.columns[19:]]
               .mean(axis=0)
               .map(lambda share: "{:.2f}%".format(share * 100)),
    }
)
summary_df

Unnamed: 0,Count,%
acm_classification+model8_Fulltext,133,9.47%
acm_training+set_Fulltext,374,26.64%
acm_pretrain8_Fulltext,10,0.71%
acm_existing+dataset8_Fulltext,17,1.21%
acm_LLM_AbstractOnly,5,0.36%
acm_neural+network_AbstractOnly,101,7.19%
acm_review_AbstractOnly,15,1.07%
acm_gradient+boosting_Fulltext,61,4.34%
acm_dataset_TitleOnly,11,0.78%
acm_external+dataset8_Fulltext,3,0.21%


In [19]:
# Roll the per-query hit flags up into five paper-level category flags.
# A paper is in a category when ANY of that category's query columns is True.
# Every category follows the same pattern: a `<category>_cols` list plus
# `.any(axis=1)`, so adding or removing a query only means editing one list.

# Datasets made for models
# NOTE: "we annotat*" returned 0 results when searching in abstract.
dataset_made_for_model_cols = ['acm_dataset_TitleOnly', 'acm_we+collect8_AbstractOnly']
df['dataset_made_for_model'] = df[dataset_made_for_model_cols].any(axis=1)

# ML model in the paper
ml_model_in_paper_cols = ['acm_classification+model8_Fulltext', 'acm_training+set_Fulltext',
    'acm_we+train8_Fulltext', 'acm_LLM_AbstractOnly',
    'acm_neural+network_AbstractOnly', 'acm_gradient+boosting_Fulltext',
    'acm_hyperparameter+tuning_Fulltext', 'acm_SVM_Fulltext',
    'acm_decision+tree_Fulltext', 'acm_LSTM_Fulltext',
    'acm_validation+set_Fulltext', 'acm_regression+model8_Fulltext',
    'acm_machine+learning_AbstractOnly']
df['ml_model_in_paper'] = df[ml_model_in_paper_cols].any(axis=1)

# Contains pretrained model
contains_pretrained_cols = ['acm_pretrain8_Fulltext', 'acm_pre-train8_Fulltext']
df['contains_pretrained'] = df[contains_pretrained_cols].any(axis=1)

# Trained on other data
trained_on_other_data_cols = ['acm_existing+dataset8_Fulltext', 'acm_external+dataset8_Fulltext',
    'acm_benchmark+dataset8_Fulltext', 'acm_publicly+available+data8_Fulltext',
    'acm_datasets+used_Fulltext']
df['trained_on_other_data'] = df[trained_on_other_data_cols].any(axis=1)

# Is a review
is_review_cols = ['acm_review_AbstractOnly', 'acm_survey_TitleOnly']
df['is_review'] = df[is_review_cols].any(axis=1)

In [27]:
# Summarise the hits in each column for preliminary stats
def print_summary_df(cols, df):
    """Display the hit count and hit share for each column in `cols`.

    Parameters
    ----------
    cols : list
        Labels of the flag columns of `df` to summarise.
    df : pandas.DataFrame
        Frame containing those columns.

    Shows (via `display`) a table indexed by column label, with the column
    sum as 'Count' and the column mean formatted as a two-decimal percentage
    string as '%'. Returns None.
    """
    flags = df[cols]
    counts = flags.sum(axis=0)
    shares = flags.mean(axis=0).map(lambda frac: "{:.2f}%".format(frac * 100))
    display(pd.DataFrame({'Count': counts, '%': shares}))


# The five derived category flags, summarised over every row of `df`.
cols = [
    'dataset_made_for_model',
    'ml_model_in_paper',
    'contains_pretrained',
    'trained_on_other_data',
    'is_review',
]
print_summary_df(cols, df)

Unnamed: 0,Count,%
dataset_made_for_model,39,2.78%
ml_model_in_paper,803,57.19%
contains_pretrained,56,3.99%
trained_on_other_data,113,8.05%
is_review,17,1.21%


In [22]:
# Rows flagged by at least one non-review category and not flagged as a review.
# The categories overlap, so this is their union: about 800 rows.
df.loc[
    df[[
        'dataset_made_for_model',
        'ml_model_in_paper',
        'contains_pretrained',
        'trained_on_other_data',
    ]].any(axis=1)
    & ~df['is_review']
]

Unnamed: 0.1,Unnamed: 0,id,type,authors,title,year,issue_date,publisher,address,volume,...,acm_decision+tree_Fulltext,acm_publicly+available+data8_Fulltext,acm_benchmark+dataset8_Fulltext,acm_SVM_Fulltext,acm_validation+set_Fulltext,dataset_made_for_model,ml_model_in_paper,contains_pretrained,trained_on_other_data,is_review
1,1,10.1145/3610897,article,"Abul Al Arabi, Xue Wang, Yang Zhang, Jeeeun Kim",E3D: Harvesting Energy from Everyday Kinetic I...,2023,September 2023,Association for Computing Machinery,"New York, NY, USA",7,...,False,False,False,False,True,False,True,False,False,False
2,2,10.1145/3610891,article,"Riku Arakawa, Bing Zhou, Gurunandan Krishnan, ...",MI-Poser: Human Body Pose Tracking Using Magne...,2023,September 2023,Association for Computing Machinery,"New York, NY, USA",7,...,False,False,False,False,True,False,True,False,False,False
3,3,10.1145/3610889,article,"Sejal Bhalla, Salaar Liaqat, Robert Wu, Andrea...",PulmoListener: Continuous Acoustic Monitoring ...,2023,September 2023,Association for Computing Machinery,"New York, NY, USA",7,...,False,False,False,True,False,False,True,False,False,False
4,4,10.1145/3610896,article,"Sudershan Boovaraghavan, Prasoon Patidar, Yuvr...",TAO: Context Detection from Daily Activity Pat...,2023,September 2023,Association for Computing Machinery,"New York, NY, USA",7,...,False,False,False,True,False,False,True,False,False,False
5,5,10.1145/3610909,article,"Ling Chen, Rong Hu, Menghan Wu, Xin Zhou",HMGAN: A Hierarchical Multi-Modal Generative A...,2023,September 2023,Association for Computing Machinery,"New York, NY, USA",7,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1399,1399,10.1145/3659616,article,"Xiyuxing Zhang, Yuntao Wang, Yuxuan Han, Chen ...",The EarSAVAS Dataset: Enabling Subject-Aware V...,2024,May 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,False,True,True,True,False,True,False
1400,1400,10.1145/3659614,article,"Qian Zhang, Yubin Lan, Kaiyi Guo, Dong Wang",Lipwatch: Enabling Silent Speech Recognition o...,2024,May 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,False,False,False,True,False,False,False
1401,1401,10.1145/3659621,article,"Ran Zhu, Mingkun Yang, Qing Wang",ShuffleFL: Addressing Heterogeneity in Multi-D...,2024,May 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,False,False,False,True,False,True,False
1402,1402,10.1145/3659612,article,"Ziyu Wu, Fangting Xie, Yiran Fang, Zhen Liang,...",Seeing through the Tactile: 3D Human Shape Est...,2024,May 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,False,True,False,True,True,False,False


In [28]:
# Comparing to Volume 8, Issue 1 known stats
# (the filter below selects volume == 8 and number == 1)
v8i1_df = df.loc[df['volume'].eq(8) & df['number'].eq(1)]
print('Number of Articles:', v8i1_df.shape[0])
print_summary_df(cols, v8i1_df)


Number of Articles: 41


Unnamed: 0,Count,%
dataset_made_for_model,2,4.88%
ml_model_in_paper,28,68.29%
contains_pretrained,3,7.32%
trained_on_other_data,8,19.51%
is_review,2,4.88%


In [38]:
# Same summary restricted to the first 20 rows of the issue
# NOTE(review): presumably these are the 20 manually annotated papers - confirm.
print_summary_df(cols, v8i1_df.head(20))

Unnamed: 0,Count,%
dataset_made_for_model,1,5.00%
ml_model_in_paper,16,80.00%
contains_pretrained,2,10.00%
trained_on_other_data,5,25.00%
is_review,1,5.00%


In [78]:
def print_summary_withindices_df(cols, df):
    """Display hit count, hit share and the matching row labels per column.

    Parameters
    ----------
    cols : list
        Labels of the boolean flag columns of `df` to summarise.
    df : pandas.DataFrame
        Frame containing those columns.

    Shows (via `display`) a table indexed by column label with:
    'Count' (column sum), '%' (column mean as a two-decimal percentage
    string) and 'Indices' (list of `df.index` labels where the column is
    True). Returns None.
    """
    flags = df[cols]
    # Build the per-column label lists explicitly instead of using
    # `flags.apply(lambda x: x.index[x].to_list())`: when every list has the
    # same length (e.g. each column has exactly one hit, or none at all),
    # `DataFrame.apply` expands the lists into a DataFrame rather than
    # returning a Series of lists, which cannot serve as one 'Indices' column.
    hit_labels = pd.Series(
        [flags.index[flags[col]].to_list() for col in flags.columns],
        index=flags.columns,
        dtype=object,
    )
    summary_df = pd.DataFrame({
        'Count': flags.sum(axis=0),
        '%': flags.mean(axis=0).map(lambda x: f"{x * 100:.2f}%"),
        'Indices': hit_labels,
    })
    display(summary_df)

print_summary_withindices_df(cols, v8i1_df[:20])

# Actual paper numbers: the 1-based position of each hit within the issue.
# Computed from row position rather than by subtracting a hard-coded row
# offset (previously 1316), so it stays correct if the master CSV is re-sorted
# or regenerated. A list comprehension is used instead of `DataFrame.apply`,
# which returns a DataFrame (no `.to_list()`) when all hit lists have equal length.
[(np.flatnonzero(v8i1_df[col].to_numpy()[:20]) + 1).tolist() for col in cols]

Unnamed: 0,Count,%,Indices
dataset_made_for_model,1,5.00%,[1335]
ml_model_in_paper,16,80.00%,"[1317, 1318, 1320, 1321, 1322, 1323, 1324, 132..."
contains_pretrained,2,10.00%,"[1326, 1333]"
trained_on_other_data,5,25.00%,"[1317, 1318, 1326, 1327, 1335]"
is_review,1,5.00%,[1320]


[[19],
 [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19],
 [10, 17],
 [1, 2, 10, 11, 19],
 [4]]

In [55]:
# Sanity check: expect one list of matching row labels per category column,
# i.e. a result of shape (len(cols),). The closing parenthesis was missing,
# which made this cell a syntax error.
df[cols].apply(lambda x: x.index[x].to_list()).shape

(5,)

In [62]:
# Inspect the rows selected for Volume 8, Issue 1, including the derived category flags.
v8i1_df

Unnamed: 0.1,Unnamed: 0,id,type,authors,title,year,issue_date,publisher,address,volume,...,acm_decision+tree_Fulltext,acm_publicly+available+data8_Fulltext,acm_benchmark+dataset8_Fulltext,acm_SVM_Fulltext,acm_validation+set_Fulltext,dataset_made_for_model,ml_model_in_paper,contains_pretrained,trained_on_other_data,is_review
1317,1317,10.1145/3643511,article,"Nafees Ahmad, Ho-fung Leung",HyperHAR: Inter-sensing Device Bilateral Corre...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,True,False,False,False,True,False,True,False
1318,1318,10.1145/3643541,article,"Leonardo Alchieri, Nouran Abdalazim, Lidia Ale...",Lateralization Effects in Electrodermal Activi...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,True,False,False,True,False,False,True,False,True,False
1319,1319,10.1145/3643513,article,"Shaikh Shawon Arefin Shimon, Ali Neshati, Junw...",Exploring Uni-manual Around Ear Off-Device Ges...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,False,False,False,False,False,False,False
1320,1320,10.1145/3643555,article,"Sizhen Bian, Mengxi Liu, Bo Zhou, Paul Lukowic...",Body-Area Capacitive or Electric Field Sensing...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,True,False,False,True,False,False,True,False,False,True
1321,1321,10.1145/3643547,article,"Wenqiang Chen, Shupei Lin, Zhencan Peng, Farsh...",ViObject: Harness Passive Vibrations for Daily...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,True,True,False,True,False,False,False
1322,1322,10.1145/3643503,article,"Kaijie Gong, Yi Gao, Wei Dong",Privacy-Preserving and Cross-Domain Human Sens...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,True,False,False,True,False,False,False
1323,1323,10.1145/3643516,article,"Zhizhang Hu, Amirmohammad Radmehr, Yue Zhang, ...",IOTeeth: Intra-Oral Teeth Sensing System for D...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,True,False,False,True,False,False,False
1324,1324,10.1145/3643559,article,"Zeyuan Huang, Cangjun Gao, Haiyan Wang, Xiaomi...",SpeciFingers: Finger Identification and Error ...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,False,True,False,True,False,False,False
1325,1325,10.1145/3643506,article,"Sungjin Hwang, Jiwoong Heo, Youngwug Cho, Juch...",Transportation Mode Detection Technology to Pr...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,True,False,False,False,False,False,True,False,False,False
1326,1326,10.1145/3643514,article,"Yonchanok Khaokaew, Hao Xue, Flora D. Salim",MAPLE: Mobile App Prediction Leveraging Large ...,2024,March 2024,Association for Computing Machinery,"New York, NY, USA",8,...,False,False,False,False,False,False,True,True,True,False


# Second Pass of Analysis, with new keywords and no asterisks

In [83]:
# Load the second-pass master table; this rebinds `df`, replacing the first-pass frame.
df = pd.read_csv('./query-modified-masters/master_imwut_modified_20240611-164920.csv')

# Only 1 bib entry, manually rename.
# This copies the column under the query-style name used by the category lists;
# the original 'acm_3643542' column is left in place.
# NOTE(review): presumably the export named the column after the single matching article - confirm.
df['acm_generative+ai_AbstractOnly'] = df['acm_3643542']
df.columns

Index(['Unnamed: 0', 'id', 'type', 'authors', 'title', 'year', 'issue_date',
       'publisher', 'address', 'volume', 'number', 'url', 'doi', 'abstract',
       'journal', 'month', 'articleno', 'numpages', 'keywords',
       'acm_we+collect_Fulltext', 'acm_training+set_Fulltext',
       'acm_pretrain_Fulltext', 'acm_we+trained_Fulltext',
       'acm_we+annotated_Fulltext', 'acm_LLM_AbstractOnly',
       'acm_pre-trained_Fulltext', 'acm_datasets+[_Fulltext',
       'acm_neural+network_AbstractOnly', 'acm_benchmark+datasets_Fulltext',
       'acm_review_AbstractOnly', 'acm_publicly+available+data_Fulltext',
       'acm_regression+model_Fulltext', 'acm_gradient+boosting_Fulltext',
       'acm_dataset_TitleOnly', 'acm_benchmark+dataset_Fulltext',
       'acm_data+collection_Fulltext', 'acm_existing+datasets_Fulltext',
       'acm_external+dataset_Fulltext', 'acm_hyperparameter+tuning_Fulltext',
       'acm_pre-train_Fulltext', 'acm_widely+used+datasets_Fulltext',
       'acm_we+collected_F

In [84]:
# Second-pass category definitions: each list names the query columns whose
# hits count towards that category.

# Datasets made for models
dataset_made_for_model_cols = [
    'acm_dataset_TitleOnly',
    'acm_we+collect_Fulltext', 'acm_we+collected_Fulltext',
    'acm_we+annotate_Fulltext', 'acm_we+annotated_Fulltext',
    'acm_dataset+collected_Fulltext',
    'acm_data+collection_Fulltext',
]

# ML model in the paper
ml_model_in_paper_cols = [
    'acm_LLM_AbstractOnly',
    'acm_generative+ai_AbstractOnly',
    'acm_machine+learning_AbstractOnly',
    'acm_neural+network_AbstractOnly',
    'acm_LSTM_Fulltext',
    'acm_hyperparameter+tuning_Fulltext',
    'acm_validation+set_Fulltext',
    'acm_training+set_Fulltext',
    'acm_regression+model_Fulltext',
    'acm_classification+model_Fulltext',
    'acm_decision+tree_Fulltext',
    'acm_gradient+boosting_Fulltext',
    'acm_SVM_Fulltext',
    'acm_we+train_Fulltext', 'acm_we+trained_Fulltext',
]

# Contains pretrained model
contains_pretrained_cols = [
    'acm_pretrain_Fulltext', 'acm_pretrained_Fulltext',
    'acm_pre-train_Fulltext', 'acm_pre-trained_Fulltext',
]

# Trained on other data
trained_on_other_data_cols = [
    'acm_benchmark+dataset_Fulltext', 'acm_benchmark+datasets_Fulltext',
    'acm_publicly+available+data_Fulltext', 'acm_publicly+available+dataset_Fulltext',
    'acm_external+dataset_Fulltext', 'acm_external+datasets_Fulltext',
    'acm_existing+dataset_Fulltext', 'acm_existing+datasets_Fulltext',
    'acm_widely+used+datasets_Fulltext',
    'acm_dataset+[_Fulltext', 'acm_datasets+[_Fulltext',
    'acm_datasets+used_Fulltext',
]

# Is a review
is_review_cols = ['acm_survey_TitleOnly', 'acm_review_AbstractOnly']

# A paper is in a category when any of the category's query columns is truthy.
df['dataset_made_for_model'] = df.loc[:, dataset_made_for_model_cols].any(axis=1)
df['ml_model_in_paper'] = df.loc[:, ml_model_in_paper_cols].any(axis=1)
df['contains_pretrained'] = df.loc[:, contains_pretrained_cols].any(axis=1)
df['trained_on_other_data'] = df.loc[:, trained_on_other_data_cols].any(axis=1)
df['is_review'] = df.loc[:, is_review_cols].any(axis=1)

In [86]:
# Checking with the 20: re-select Volume 8, Issue 1 from the second-pass frame
# and summarise its first 20 rows.
v8i1_df = df[(df['volume'] == 8) & (df['number'] == 1)]
print_summary_withindices_df(cols, v8i1_df[:20])

# Actual paper numbers: the 1-based position of each hit within the issue.
# Computed from row position rather than by subtracting a hard-coded row
# offset (previously 1316), which silently breaks if this second-pass CSV
# orders its rows differently from the first. A list comprehension is used
# instead of `DataFrame.apply`, which returns a DataFrame (no `.to_list()`)
# when all hit lists have equal length.
[(np.flatnonzero(v8i1_df[col].to_numpy()[:20]) + 1).tolist() for col in cols]

Unnamed: 0,Count,%,Indices
dataset_made_for_model,17,85.00%,"[1317, 1318, 1319, 1320, 1321, 1322, 1323, 132..."
ml_model_in_paper,17,85.00%,"[1317, 1318, 1320, 1321, 1322, 1323, 1324, 132..."
contains_pretrained,6,30.00%,"[1317, 1321, 1324, 1326, 1333, 1336]"
trained_on_other_data,16,80.00%,"[1317, 1318, 1320, 1321, 1322, 1324, 1325, 132..."
is_review,1,5.00%,[1320]


[[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 19, 20],
 [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20],
 [1, 5, 8, 10, 17, 20],
 [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20],
 [4]]

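The second pass now significantly overestimates: of the 20 checked papers, 17 are flagged as `dataset_made_for_model` and 16 as `trained_on_other_data`, compared with 1 and 5 in the first pass. Next steps: tinker with the keywords to pare the matches down (are there combinations of the current columns that reproduce the exact selection?).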