In [27]:
import pandas as pd
import ast

import glob
import os

import re

In [2]:
# Glob pattern matching every per-repository "<name>_processed.csv" file;
# edit the directory part if the processed outputs live somewhere else.
processed_csv_pattern = 'outputs/processed_outputs/*_processed.csv'
csv_files = glob.glob(processed_csv_pattern)

# Echo the matched paths so the set of input files is visible in the notebook.
csv_files

['outputs/processed_outputs/qiita_processed.csv',
 'outputs/processed_outputs/clinepidb_processed.csv',
 'outputs/processed_outputs/accessclinicaldata_processed.csv',
 'outputs/processed_outputs/dryad_processed.csv',
 'outputs/processed_outputs/immport_processed.csv',
 'outputs/processed_outputs/harvard_processed.csv',
 'outputs/processed_outputs/vdjserver_processed.csv',
 'outputs/processed_outputs/hubmap_processed.csv',
 'outputs/processed_outputs/datadiscoveryengine_processed.csv',
 'outputs/processed_outputs/lincs_processed.csv',
 'outputs/processed_outputs/mendeley_processed.csv',
 'outputs/processed_outputs/ncbi_geo_processed.csv',
 'outputs/processed_outputs/ncbi_sra_processed.csv',
 'outputs/processed_outputs/omicsdi_processed.csv',
 'outputs/processed_outputs/reframedb_processed.csv',
 'outputs/processed_outputs/veupathdb_processed.csv',
 'outputs/processed_outputs/vivli_processed.csv',
 'outputs/processed_outputs/zenodo_processed.csv']

In [3]:
# Lookup table from a short repository key to the repository's display name.
# Built from ordered (key, display name) pairs so insertion order is explicit.
repositories = dict([
    ('accessclinicaldata', 'AccessClinicalData@NIAID'),
    ('clinepidb', 'ClinEpiDB'),
    ('datadiscoveryengine', 'Data Discovery Engine'),
    ('dryad', 'Dryad Digital Repository'),
    ('harvard', 'Harvard Dataverse'),
    ('hubmap', 'HuBMAP'),
    ('immport', 'ImmPort'),
    ('lincs', 'LINCS'),
    ('mendeley', 'Mendeley'),
    ('ncbi_geo', 'NCBI GEO'),
    ('ncbi_sra', 'NCBI SRA'),
    ('omicsdi', 'Omics Discovery Index (OmicsDI)'),
    ('qiita', 'Qiita'),
    ('reframedb', 'ReframeDB'),
    ('vdjserver', 'VDJServer'),
    ('veupathdb', 'VEuPathDB'),
    ('vivli', 'Vivli'),
    ('zenodo', 'Zenodo'),
])

In [4]:
# Load every processed CSV and tag its rows with a 'Repository' column holding
# the repository's display name. The lookup key is the file's base name with
# the '_processed.csv' suffix removed.
dfs = []
for csv_path in csv_files:
    repo_frame = pd.read_csv(csv_path, lineterminator='\n')
    repo_key = os.path.basename(csv_path).replace('_processed.csv', '')
    dfs.append(repo_frame.assign(Repository=repositories[repo_key]))

# Stack the per-repository frames into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

In [5]:
# Parse the stringified Python literals stored in 'Predictions' into real objects.
# Values that are already parsed containers are passed through unchanged so this
# cell can be re-run on the same frame; anything else still goes through
# ast.literal_eval and fails loudly if it is not a valid literal.
combined_df['Predictions'] = combined_df['Predictions'].apply(
    lambda value: value
    if isinstance(value, (set, frozenset, list, tuple, dict))
    else ast.literal_eval(value)
)

In [6]:
# Show the combined frame as this cell's output.
combined_df

Unnamed: 0,Name,Description,Model,Predictions,Hallucinations,_id,Repository
0,Understanding Cultivar-Specificity and Soil De...,This is a preliminary study to examine the mic...,gpt-3.5-turbo,"{Microbiology, Agricultural science, Plant bio...",{'Soil science'},qiita_1001,Qiita
1,The microbiome of uncontacted Amerindians,Most studies of the human microbiome have focu...,gpt-3.5-turbo,"{Microbiology, Metagenomics, Antimicrobial Res...",set(),qiita_10052,Qiita
2,Changes in Microbial Ecology after Fecal Micro...,Gut microbiota play a key role in maintaining ...,gpt-3.5-turbo,"{Infectious disease, Microbial ecology, Gastro...",set(),qiita_10057,Qiita
3,Fleshner tMM45,We hypothesize the wheel running may change th...,gpt-3.5-turbo,"{Microbiology, Biology, Animal study}",set(),qiita_10064,Qiita
4,The Fecal Microbial Community of Breast-fed In...,Oligosaccharides from breast milk are believed...,gpt-3.5-turbo,"{Microbiology, Genetics, Environmental sciences}",set(),qiita_10080,Qiita
...,...,...,...,...,...,...,...
2764275,IN00034 Kahaum Pillar Inscription of the Time ...,"Bhandarkar, Devadatta Ramakrishna, Bahadur Cha...",gpt-3.5-turbo,{},"{'History', 'Inscriptions', 'Archaeology'}",ZENODO_998956,Zenodo
2764276,Infant Sibling Project: Sample Files,These electroencephalography (EEG) data files ...,gpt-3.5-turbo,"{Electroencephalography, Data integration and ...",set(),ZENODO_998964,Zenodo
2764277,IN00037 Supia Pillar Inscription of the Time o...,"Bhandarkar, Devadatta Ramakrishna, Bahadur Cha...",gpt-3.5-turbo,{},"{'Geography', 'History', 'Archaeology'}",ZENODO_998987,Zenodo
2764278,KC0P 40 m SEQP Reverse Beacon Network Log of S...,Log of the Spots generated by the Reverse Beac...,gpt-3.5-turbo,"{Computational biology, Data mining, Machine l...",set(),ZENODO_999020,Zenodo


In [7]:
def _prediction_size_counts(predictions):
    """Tally one repository's prediction collections by their size.

    Parameters
    ----------
    predictions : pd.Series
        The 'Predictions' values belonging to a single 'Repository' group.

    Returns
    -------
    pd.Series
        Counts labelled 'empty_sets', 'sets_of_1', 'sets_of_2' and 'sets_of_3',
        plus 'total', the number of rows in the group.
    """
    # Measure every collection once and reuse the sizes for all buckets.
    sizes = predictions.apply(len)
    return pd.Series({
        'empty_sets': (sizes == 0).sum(),
        'sets_of_1': (sizes == 1).sum(),
        'sets_of_2': (sizes == 2).sum(),
        'sets_of_3': (sizes == 3).sum(),
        'total': predictions.size
    })


# One block of counts per repository; the result is indexed by
# (Repository, counter name).
counts = combined_df.groupby('Repository')['Predictions'].apply(_prediction_size_counts)

In [8]:
# Print the whole counts table as plain text.
print(counts.to_string())

Repository                                 
AccessClinicalData@NIAID         empty_sets          0
                                 sets_of_1           0
                                 sets_of_2           0
                                 sets_of_3           7
                                 total               7
ClinEpiDB                        empty_sets          0
                                 sets_of_1           1
                                 sets_of_2           6
                                 sets_of_3          43
                                 total              50
Data Discovery Engine            empty_sets          0
                                 sets_of_1           2
                                 sets_of_2          17
                                 sets_of_3         349
                                 total             368
Dryad Digital Repository         empty_sets        180
                                 sets_of_1         897
                     

### Get Name and Descriptions with Word Threshold

In [9]:
SAMPLE_SIZE = 25   # maximum number of rows drawn per word-count bucket
SAMPLE_SEED = 42   # fixed seed so the drawn rows are the same on every run

# Total words per record: whitespace-separated tokens in 'Name' plus 'Description'.
# Missing values are replaced by '' first so they contribute zero words instead of
# being counted as the one-word string 'nan'. The count is vectorised because a
# row-wise apply over the whole frame is very slow.
word_counts = (
    combined_df['Name'].fillna('').astype(str).str.split().str.len()
    + combined_df['Description'].fillna('').astype(str).str.split().str.len()
)


def _sample_word_bucket(low, high):
    """Return up to SAMPLE_SIZE random rows whose word count lies in [low, high].

    Both bounds are inclusive. When the bucket holds fewer than SAMPLE_SIZE rows,
    all of them are returned instead of raising.
    """
    bucket = combined_df[word_counts.between(low, high)]
    return bucket.sample(n=min(SAMPLE_SIZE, len(bucket)), random_state=SAMPLE_SEED)


# The buckets are disjoint: a record with exactly 11 words belongs to 11-20 only.

# Get rows where the total number of words is 10 or less
sample_rows_10 = _sample_word_bucket(0, 10)

# Get rows where the total number of words is in the range 11-20
sample_rows_11_20 = _sample_word_bucket(11, 20)

# Get rows where the total number of words is in the range 21-30
sample_rows_21_30 = _sample_word_bucket(21, 30)

# Get rows where the total number of words is in the range 31-40
sample_rows_31_40 = _sample_word_bucket(31, 40)

sample_rows_10, sample_rows_11_20, sample_rows_21_30, sample_rows_31_40

(                                                      Name  \
 1300515                               Oryza longistaminata   
 1118694                                                NaN   
 87853                                        A12_20995.JPG   
 2631365                                       Library data   
 1595748                                    Solanum incanum   
 2339531                                       Homo sapiens   
 770017              Oryza sativa cultivar:Kitaake FN1669-S   
 2062403    Roseinatronobacter thiooxidans strain:DSM 13087   
 2062024                      Helicoverpa armigera armigera   
 2401013                    human gut metagenome strain:NO3   
 122335                                       A04_13972.JPG   
 87016                                        A12_22669.jpg   
 635859          Oryza sativa Kitaake FN1673-S Resequencing   
 108350                                       A92_5339a.jpg   
 1054489  ena-DATASET-KNIH-30-03-2021-05:47:45:371-459 

In [22]:
# Pair each sampled bucket with the label describing its word-count range and
# add that label as an 'identifier' column.
labelled_buckets = [
    bucket.assign(identifier=label)
    for bucket, label in (
        (sample_rows_10, '10 or less'),
        (sample_rows_11_20, '11-20'),
        (sample_rows_21_30, '21-30'),
        (sample_rows_31_40, '31-40'),
    )
]

# Rebind the per-bucket names to their labelled versions.
sample_rows_10, sample_rows_11_20, sample_rows_21_30, sample_rows_31_40 = labelled_buckets

# Stack the labelled buckets into one frame with a fresh 0..n-1 index.
sample_rows = pd.concat(labelled_buckets, ignore_index=True)

In [24]:
# Give the bucket-label column its presentation name.
sample_rows = sample_rows.rename(columns={'identifier': 'Number of Words'})

In [26]:
# Remove the model-output columns from the sample. errors='ignore' keeps this cell
# re-runnable: once the columns are gone, running it again is a no-op rather than
# a KeyError.
sample_rows = sample_rows.drop(columns=['Predictions', 'Hallucinations'], errors='ignore')

In [29]:
# Replace missing values in the two text columns with empty strings.
for text_column in ('Name', 'Description'):
    sample_rows[text_column] = sample_rows[text_column].fillna('')

In [30]:
# Show the labelled sample as this cell's output.
sample_rows

Unnamed: 0,Name,Description,Model,filepath,_id,Repository,Number of Words
0,hou02665c01004,L,gpt-3.5-turbo,outputs/harvard_processed.csv,Dataverse_10.7910_DVN_BA39BT,Harvard Dataverse,10 or less
1,Lactiplantibacillus plantarum strain:IRG1,Lactobacillus plantarum IRG1 Genome sequencing...,gpt-3.5-turbo,outputs/omicsdi_processed.csv,PRJNA428424,Omics Discovery Index (OmicsDI),10 or less
2,Listeria monocytogenes SHL015,Listeria monocytogenes SHL015 Genome sequencin...,gpt-3.5-turbo,outputs/omicsdi_processed.csv,PRJNA218932,Omics Discovery Index (OmicsDI),10 or less
3,,Identifying SNPs in the D1RatMgh8 microsatteli...,gpt-3.5-turbo,outputs/omicsdi_processed.csv,PRJEB27381,Omics Discovery Index (OmicsDI),10 or less
4,Sequencing INFLUENZA A VIRUS (A/MEMPHIS/3/1973...,,gpt-3.5-turbo,outputs/ncbi_sra_processed.csv,NCBI_SRA_SRP206023,NCBI SRA,10 or less
...,...,...,...,...,...,...,...
95,Законы о бюджете города Москвы на 2014-2018 го...,Ведомственная структура расходов бюджета город...,gpt-3.5-turbo,outputs/zenodo_processed.csv,ZENODO_4539989,Zenodo,31-40
96,Effects of water temperature on gut microbiota...,To describe the impact of temperature on the g...,gpt-3.5-turbo,outputs/ncbi_sra_processed.csv,NCBI_SRA_SRP404127,NCBI SRA,31-40
97,Microbial community characteristics of the int...,Microbial community characteristics of the int...,gpt-3.5-turbo,outputs/ncbi_sra_processed.csv,NCBI_SRA_SRP367441,NCBI SRA,31-40
98,Data for Triple Bottom Line towards a holistic...,Artigos utilizados para revisão sistemática da...,gpt-3.5-turbo,outputs/mendeley_processed.csv,Mendeley_dz58cvs99d,Mendeley,31-40


In [31]:
# Write the sample to CSV; index=False leaves the row index out of the file.
sample_rows.to_csv('outputs/word_count_sample.csv', index=False)

## Get empty set IDs

Collect the `_id`, `Name`, and `Description` of every record that has no predictions.

In [14]:
# Keep only the records whose prediction collection is empty, restricted to the
# identifying columns, and renumber the rows from zero.
has_no_predictions = combined_df['Predictions'].apply(len) == 0
empty_set_ids = (
    combined_df.loc[has_no_predictions, ['_id', 'Name', 'Description']]
    .reset_index(drop=True)
)


In [16]:
# Write the records without predictions to CSV; index=False leaves the row index out.
empty_set_ids.to_csv('outputs/empty_set_ids.csv', index=False)

### Get samples of specific repositories

In [15]:
# Define the inclusive (min, max) word count ranges
word_count_ranges = [
    (0, 10),
    (11, 20),
    (21, 30),
    (31, 40)
]

# Repositories to draw samples from
target_repositories = ['Harvard Dataverse', 'Zenodo', 'Mendeley']

ROWS_PER_RANGE = 5       # maximum rows sampled per repository and range
RANGE_SAMPLE_SEED = 42   # fixed seed so the sampled rows are the same on every run


def _count_words(text_column):
    """Return the number of whitespace-separated words in each value of a column.

    Missing values are treated as empty text and therefore count as 0 words.
    """
    return text_column.fillna('').astype(str).str.split().str.len()


# Initialize an empty list to store the samples
samples = []

# Iterate over each repository, in order of first appearance in combined_df
for repository in combined_df['Repository'].unique():
    if repository not in target_repositories:
        continue
    repository_df = combined_df[combined_df['Repository'] == repository]

    # Count words once per repository rather than twice per range. Missing values
    # count as zero words; summing the raw `.str.len()` results would yield NaN for
    # such rows, and NaN fails every range comparison, silently excluding them.
    repository_word_counts = (
        _count_words(repository_df['Name']) + _count_words(repository_df['Description'])
    )

    # Iterate over each word count range
    for min_count, max_count in word_count_ranges:
        # Get the rows whose word count lies in the range (both bounds inclusive)
        samples_in_range = repository_df[repository_word_counts.between(min_count, max_count)]

        # Take a sample of ROWS_PER_RANGE rows, or all rows if there are fewer
        samples_in_range = samples_in_range.sample(
            n=min(ROWS_PER_RANGE, len(samples_in_range)),
            random_state=RANGE_SAMPLE_SEED,
        )

        # assign() returns a new frame, so the label is never written onto a slice
        samples_in_range = samples_in_range.assign(
            **{'Word Count Range': f'{min_count}-{max_count} words'}
        )

        # Append the samples to the list
        samples.append(samples_in_range)

# Concatenate the list of samples DataFrames into a single DataFrame
# (original row labels are kept)
samples_df = pd.concat(samples)

samples_df

Unnamed: 0,Name,Description,Model,Predictions,Hallucinations,_id,Repository,Word Count Range
124630,Konzeptstudie über die Kompatibilität von Regi...,I,gpt-3.5-turbo,{},"{'Mechanical engineering', 'Aerospace engineer...",Dataverse_10.7910_DVN_UMKWCI,Harvard Dataverse,0-10 words
84529,A13_24998.jpg,Link to OCHRE database: http://pi.lib.uchicago...,gpt-3.5-turbo,"{Informatics, ""Data submission, annotation, an...",set(),Dataverse_10.7910_DVN_CJDNIE,Harvard Dataverse,0-10 words
126477,hou00201c00035,D,gpt-3.5-turbo,"{""Data architecture, analysis and design"", ""Da...",set(),Dataverse_10.7910_DVN_VH5JLZ,Harvard Dataverse,0-10 words
100504,A12_22337.JPG,Link to OCHRE database: http://pi.lib.uchicago...,gpt-3.5-turbo,"{Data integration and warehousing, Informatics...",set(),Dataverse_10.7910_DVN_JOAUYR,Harvard Dataverse,0-10 words
63299,Gent_2009_Peperstraat,Dendrochronological research project,gpt-3.5-turbo,"{Ecology, Biology, Environmental sciences}",{'Dendrochronological research project'},Dataverse_10.34894_JBLFCO,Harvard Dataverse,0-10 words
110355,Replication Data for: Sequential Monte Carlo f...,R,gpt-3.5-turbo,"{Computational biology, Genetics, ""Data archit...",set(),Dataverse_10.7910_DVN_O38OCG,Harvard Dataverse,11-20 words
60573,Farm inventory surveys in Rwanda: Birds data,Birds data from Rwanda,gpt-3.5-turbo,"{Biodiversity, Agricultural science, Zoology}",set(),Dataverse_10.34725_DVN_JMJWYS,Harvard Dataverse,11-20 words
108335,Replication Data for: A meta-analytic cognitiv...,R,gpt-3.5-turbo,"{Computational biology, ""Data architecture, an...",set(),Dataverse_10.7910_DVN_N9EJNR,Harvard Dataverse,11-20 words
62233,"Romeinse constructie onder water, Maasbodem. P...",Dendrochronological research project,gpt-3.5-turbo,{Environmental sciences},"{'Archaeology', 'Dendrochronology'}",Dataverse_10.34894_DAILQJ,Harvard Dataverse,11-20 words
104282,State Legislative Historical Elections,State legislative general elections prior to 1...,gpt-3.5-turbo,{},"{'Social sciences', 'Political science', 'Rese...",Dataverse_10.7910_DVN_LEMNXZ,Harvard Dataverse,11-20 words


In [16]:
# Remove the model-output columns from the samples. errors='ignore' keeps this cell
# re-runnable: once the columns are gone, running it again is a no-op rather than
# a KeyError.
samples_df = samples_df.drop(columns=['Predictions', 'Hallucinations'], errors='ignore')

In [18]:
# Renumber the rows 0..n-1, discarding the row labels carried over from the source frame.
samples_df = samples_df.reset_index(drop=True)

In [20]:
# Write the per-repository samples to CSV; index=False leaves the row index out.
samples_df.to_csv('outputs/hzm_word_count_samples.csv', index=False)

In [32]:
# Path of the CSV loaded into `outputs`.
filename = 'outputs/hzm.csv'
# Read it with '\n' as the explicit line terminator.
outputs = pd.read_csv(filename, lineterminator='\n')

In [33]:
def _split_predictions(preds):
    """Turn one raw 'Predictions' cell into a list of cleaned prediction strings.

    Each line of the cell is treated as one prediction. A leading "N." list number
    (with any following whitespace) is removed, surrounding '-' and space characters
    are stripped, and lines that end up empty are dropped.

    Parameters
    ----------
    preds : object
        Raw cell value. A list is returned unchanged (already parsed, e.g. when the
        cell is re-run); any other non-string value, such as a missing value,
        yields an empty list.

    Returns
    -------
    list
        The cleaned prediction strings, in their original order.
    """
    if isinstance(preds, list):
        return preds
    if not isinstance(preds, str):
        return []
    cleaned = (re.sub(r'^\d+\.\s*', '', line).strip('- ') for line in preds.split('\n'))
    return [pred for pred in cleaned if pred]


outputs['Predictions'] = outputs['Predictions'].apply(_split_predictions)

In [40]:
# Preview the first rows of the samples frame.
samples_df.head()

Unnamed: 0,Name,Description,Model,_id,Repository,Word Count Range
0,Konzeptstudie über die Kompatibilität von Regi...,I,gpt-3.5-turbo,Dataverse_10.7910_DVN_UMKWCI,Harvard Dataverse,0-10 words
1,A13_24998.jpg,Link to OCHRE database: http://pi.lib.uchicago...,gpt-3.5-turbo,Dataverse_10.7910_DVN_CJDNIE,Harvard Dataverse,0-10 words
2,hou00201c00035,D,gpt-3.5-turbo,Dataverse_10.7910_DVN_VH5JLZ,Harvard Dataverse,0-10 words
3,A12_22337.JPG,Link to OCHRE database: http://pi.lib.uchicago...,gpt-3.5-turbo,Dataverse_10.7910_DVN_JOAUYR,Harvard Dataverse,0-10 words
4,Gent_2009_Peperstraat,Dendrochronological research project,gpt-3.5-turbo,Dataverse_10.34894_JBLFCO,Harvard Dataverse,0-10 words


In [41]:
# Copy each sample's repository and word-count range onto the outputs frame.
# Rows are paired by position, which is only valid when both frames list the same
# `_id` values in the same order. Plain lists are compared because comparing two
# Series with different lengths or indexes raises a ValueError instead of
# returning False.
if samples_df['_id'].tolist() == outputs['_id'].tolist():
    # to_numpy() makes the copy purely positional, independent of either index.
    outputs['Repository'] = samples_df['Repository'].to_numpy()
    outputs['Word Count Range'] = samples_df['Word Count Range'].to_numpy()
else:
    # Report the mismatch instead of skipping silently.
    print(
        "WARNING: '_id' values of samples_df and outputs differ; "
        "'Repository' and 'Word Count Range' were not added to outputs."
    )

In [42]:
# Write `outputs` to the path held in `filename`, without the row index and with
# '\n' line endings.
# NOTE(review): if `filename` is still the path `outputs` was read from, this
# overwrites the input file, so re-running the notebook would reload the already
# transformed 'Predictions' column; confirm this is intended.
outputs.to_csv(filename, index=False, lineterminator='\n')