In [1]:
import pandas as pd
import ast

import glob
import os

In [2]:
csv_files = glob.glob('outputs/processed_outputs/*_processed.csv')  # replace with your actual directory path
csv_files

['outputs/processed_outputs/qiita_processed.csv',
 'outputs/processed_outputs/clinepidb_processed.csv',
 'outputs/processed_outputs/accessclinicaldata_processed.csv',
 'outputs/processed_outputs/dryad_processed.csv',
 'outputs/processed_outputs/immport_processed.csv',
 'outputs/processed_outputs/harvard_processed.csv',
 'outputs/processed_outputs/vdjserver_processed.csv',
 'outputs/processed_outputs/hubmap_processed.csv',
 'outputs/processed_outputs/datadiscoveryengine_processed.csv',
 'outputs/processed_outputs/lincs_processed.csv',
 'outputs/processed_outputs/mendeley_processed.csv',
 'outputs/processed_outputs/ncbi_geo_processed.csv',
 'outputs/processed_outputs/ncbi_sra_processed.csv',
 'outputs/processed_outputs/omicsdi_processed.csv',
 'outputs/processed_outputs/reframedb_processed.csv',
 'outputs/processed_outputs/veupathdb_processed.csv',
 'outputs/processed_outputs/vivli_processed.csv',
 'outputs/processed_outputs/zenodo_processed.csv']

In [3]:
repositories = {
    'accessclinicaldata': 'AccessClinicalData@NIAID',
    'clinepidb': 'ClinEpiDB',
    'datadiscoveryengine': 'Data Discovery Engine',
    'dryad': 'Dryad Digital Repository',
    'harvard': 'Harvard Dataverse',
    'hubmap': 'HuBMAP',
    'immport': 'ImmPort',
    'lincs': 'LINCS',
    'mendeley': 'Mendeley',
    'ncbi_geo':'NCBI GEO',
    'ncbi_sra': 'NCBI SRA',
    'omicsdi': 'Omics Discovery Index (OmicsDI)',
    'qiita': 'Qiita',
    'reframedb': 'ReframeDB',
    'vdjserver': 'VDJServer',
    'veupathdb': 'VEuPathDB',
    'vivli': 'Vivli',
    'zenodo': 'Zenodo'
}

In [4]:
# Read each CSV file, add a 'filename' column, and store them in a list
dfs = [pd.read_csv(f,lineterminator='\n').assign(Repository=repositories[os.path.basename(f).replace('_processed.csv', '')]) for f in csv_files]

# Concatenate all dataframes in the list into one dataframe
combined_df = pd.concat(dfs, ignore_index=True)

In [5]:
combined_df['Predictions'] = combined_df['Predictions'].apply(ast.literal_eval)

In [6]:
combined_df

Unnamed: 0,Name,Description,Model,Predictions,Hallucinations,filepath,_id,Repository
0,Understanding Cultivar-Specificity and Soil De...,This is a preliminary study to examine the mic...,gpt-3.5-turbo,"{Plant biology, Microbiology, Agricultural sci...",{'Soil science'},outputs/qiita_processed.csv,qiita_1001,Qiita
1,The microbiome of uncontacted Amerindians,Most studies of the human microbiome have focu...,gpt-3.5-turbo,"{Antimicrobial Resistance, Microbiology, Metag...",set(),outputs/qiita_processed.csv,qiita_10052,Qiita
2,Changes in Microbial Ecology after Fecal Micro...,Gut microbiota play a key role in maintaining ...,gpt-3.5-turbo,"{Infectious disease, Gastroenterology, Microbi...",set(),outputs/qiita_processed.csv,qiita_10057,Qiita
3,Fleshner tMM45,We hypothesize the wheel running may change th...,gpt-3.5-turbo,"{Biology, Microbiology, Animal study}",set(),outputs/qiita_processed.csv,qiita_10064,Qiita
4,The Fecal Microbial Community of Breast-fed In...,Oligosaccharides from breast milk are believed...,gpt-3.5-turbo,"{Microbiology, Genetics, Environmental sciences}",set(),outputs/qiita_processed.csv,qiita_10080,Qiita
...,...,...,...,...,...,...,...,...
2764275,IN00034 Kahaum Pillar Inscription of the Time ...,"Bhandarkar, Devadatta Ramakrishna, Bahadur Cha...",gpt-3.5-turbo,{},"{'History', 'Inscriptions', 'Archaeology'}",outputs/zenodo_processed.csv,ZENODO_998956,Zenodo
2764276,Infant Sibling Project: Sample Files,These electroencephalography (EEG) data files ...,gpt-3.5-turbo,"{Data integration and warehousing, Electroence...",set(),outputs/zenodo_processed.csv,ZENODO_998964,Zenodo
2764277,IN00037 Supia Pillar Inscription of the Time o...,"Bhandarkar, Devadatta Ramakrishna, Bahadur Cha...",gpt-3.5-turbo,{},"{'Geography', 'History', 'Archaeology'}",outputs/zenodo_processed.csv,ZENODO_998987,Zenodo
2764278,KC0P 40 m SEQP Reverse Beacon Network Log of S...,Log of the Spots generated by the Reverse Beac...,gpt-3.5-turbo,"{Computational biology, Data mining, Machine l...",set(),outputs/zenodo_processed.csv,ZENODO_999020,Zenodo


In [18]:
# empty_sets = combined_df.groupby('Repository')['Predictions'].apply(lambda x: (x == set()).sum())
# empty_sets

counts = combined_df.groupby('Repository')['Predictions'].apply(lambda x: pd.Series({
    'empty_sets': (x == set()).sum(),
    'sets_of_1': (x.apply(len) == 1).sum(),
    'sets_of_2': (x.apply(len) == 2).sum(),
    'sets_of_3': (x.apply(len) == 3).sum(),
    'total': x.size
}))

In [19]:
print(counts.to_string())

Repository                                 
AccessClinicalData@NIAID         empty_sets          0
                                 sets_of_1           0
                                 sets_of_2           0
                                 sets_of_3           7
                                 total               7
ClinEpiDB                        empty_sets          0
                                 sets_of_1           1
                                 sets_of_2           6
                                 sets_of_3          43
                                 total              50
Data Discovery Engine            empty_sets          0
                                 sets_of_1           2
                                 sets_of_2          17
                                 sets_of_3         349
                                 total             368
Dryad Digital Repository         empty_sets        180
                                 sets_of_1         897
                     

### Get Name and Descriptions with Word Threshold

In [21]:
# Calculate the total number of words in the 'Name' and 'Description' columns
word_counts = combined_df.apply(lambda x: len(str(x['Name']).split()) + len(str(x['Description']).split()), axis=1)

# Get rows where the total number of words is in the range 11-20
sample_rows_10 = combined_df[word_counts <= 11].sample(n=25)

# Get rows where the total number of words is in the range 11-20
sample_rows_11_20 = combined_df[(word_counts >= 11) & (word_counts <= 20)].sample(n=25)

# Get rows where the total number of words is in the range 21-30
sample_rows_21_30 = combined_df[(word_counts >= 21) & (word_counts <= 30)].sample(n=25)

# Get rows where the total number of words is in the range 31-40
sample_rows_31_40 = combined_df[(word_counts >= 31) & (word_counts <= 40)].sample(n=25)

sample_rows_10, sample_rows_11_20, sample_rows_21_30, sample_rows_31_40

(                                                      Name  \
 81751                                       hou02665c01004   
 2408106          Lactiplantibacillus plantarum strain:IRG1   
 1507968                      Listeria monocytogenes SHL015   
 985408                                                 NaN   
 433746   Sequencing INFLUENZA A VIRUS (A/MEMPHIS/3/1973...   
 1021938                                   Escherichia coli   
 1738566                                                NaN   
 1578369                   Cryptococcus neoformans A1-LIM_1   
 88709                                        A07_16248.JPG   
 1653718                                 biofilm metagenome   
 446627                                HMP reference genome   
 1581473           Sphingomonas parapaucimobilis NBRC 15100   
 2695495           Mass spectrometry imaging (MSI) datasets   
 1529806                Arabidopsis thaliana cultivar:Col-0   
 1657875                                Populus trichoc

In [22]:
sample_rows_10 = sample_rows_10.assign(identifier='10 or less')
sample_rows_11_20 = sample_rows_11_20.assign(identifier='11-20')
sample_rows_21_30 = sample_rows_21_30.assign(identifier='21-30')
sample_rows_31_40 = sample_rows_31_40.assign(identifier='31-40')

sample_rows = pd.concat([sample_rows_10, sample_rows_11_20, sample_rows_21_30, sample_rows_31_40], ignore_index=True)

In [24]:
sample_rows.rename(columns={'identifier': 'Number of Words'}, inplace=True)

In [26]:
sample_rows.drop(columns=['Predictions', 'Hallucinations'], inplace=True)

In [29]:
sample_rows['Name'] = sample_rows['Name'].fillna('')
sample_rows['Description'] = sample_rows['Description'].fillna('')

In [30]:
sample_rows

Unnamed: 0,Name,Description,Model,filepath,_id,Repository,Number of Words
0,hou02665c01004,L,gpt-3.5-turbo,outputs/harvard_processed.csv,Dataverse_10.7910_DVN_BA39BT,Harvard Dataverse,10 or less
1,Lactiplantibacillus plantarum strain:IRG1,Lactobacillus plantarum IRG1 Genome sequencing...,gpt-3.5-turbo,outputs/omicsdi_processed.csv,PRJNA428424,Omics Discovery Index (OmicsDI),10 or less
2,Listeria monocytogenes SHL015,Listeria monocytogenes SHL015 Genome sequencin...,gpt-3.5-turbo,outputs/omicsdi_processed.csv,PRJNA218932,Omics Discovery Index (OmicsDI),10 or less
3,,Identifying SNPs in the D1RatMgh8 microsatteli...,gpt-3.5-turbo,outputs/omicsdi_processed.csv,PRJEB27381,Omics Discovery Index (OmicsDI),10 or less
4,Sequencing INFLUENZA A VIRUS (A/MEMPHIS/3/1973...,,gpt-3.5-turbo,outputs/ncbi_sra_processed.csv,NCBI_SRA_SRP206023,NCBI SRA,10 or less
...,...,...,...,...,...,...,...
95,Законы о бюджете города Москвы на 2014-2018 го...,Ведомственная структура расходов бюджета город...,gpt-3.5-turbo,outputs/zenodo_processed.csv,ZENODO_4539989,Zenodo,31-40
96,Effects of water temperature on gut microbiota...,To describe the impact of temperature on the g...,gpt-3.5-turbo,outputs/ncbi_sra_processed.csv,NCBI_SRA_SRP404127,NCBI SRA,31-40
97,Microbial community characteristics of the int...,Microbial community characteristics of the int...,gpt-3.5-turbo,outputs/ncbi_sra_processed.csv,NCBI_SRA_SRP367441,NCBI SRA,31-40
98,Data for Triple Bottom Line towards a holistic...,Artigos utilizados para revisão sistemática da...,gpt-3.5-turbo,outputs/mendeley_processed.csv,Mendeley_dz58cvs99d,Mendeley,31-40


In [31]:
sample_rows.to_csv('outputs/word_count_sample.csv', index=False)

## Get empty set IDs

Get IDs of data with no predictions

In [14]:
empty_set_ids = combined_df[combined_df['Predictions'].apply(len) == 0][['_id', 'Name', 'Description']]
empty_set_ids.reset_index(drop=True, inplace=True)


In [16]:
empty_set_ids.to_csv('outputs/empty_set_ids.csv', index=False)