In [2]:
import pandas as pd
import numpy as np
import math

In [4]:
df = pd.read_csv('covid_abstracts_only.csv')
df.head()

Unnamed: 0,id,subject_areas,title,abstract
0,2005.13653,"('Biomolecules', 'Quantitative Methods')",unveiling the molecular mechanism of sars-cov-...,"Currently, there is no effective antiviral dru..."
1,2005.13523,"('Signal Processing', 'Human-Computer Interact...",emotion-robust eeg classification for motor im...,Developments in Brain Computer Interfaces (BCI...
2,2005.13519,"('Populations and Evolution', 'Physics and Soc...",estimates of the proportion of sars-cov-2 infe...,In this paper a Bayesian SEIR model is studied...
3,2005.13516,"('Populations and Evolution', 'Quantitative Me...",a mathematical epidemic model using genetic fi...,A compartmental epidemic model based on geneti...
4,2005.13466,"('Social and Information Networks', 'Cryptogra...",on the detection of disinformation campaign ac...,Online manipulation of information has become ...


In [5]:
df_nonull=df.dropna(subset=['abstract','subject_areas'])
df_nonull.reset_index()
df_nonull

Unnamed: 0,id,subject_areas,title,abstract
0,2005.13653,"('Biomolecules', 'Quantitative Methods')",unveiling the molecular mechanism of sars-cov-...,"Currently, there is no effective antiviral dru..."
1,2005.13523,"('Signal Processing', 'Human-Computer Interact...",emotion-robust eeg classification for motor im...,Developments in Brain Computer Interfaces (BCI...
2,2005.13519,"('Populations and Evolution', 'Physics and Soc...",estimates of the proportion of sars-cov-2 infe...,In this paper a Bayesian SEIR model is studied...
3,2005.13516,"('Populations and Evolution', 'Quantitative Me...",a mathematical epidemic model using genetic fi...,A compartmental epidemic model based on geneti...
4,2005.13466,"('Social and Information Networks', 'Cryptogra...",on the detection of disinformation campaign ac...,Online manipulation of information has become ...
...,...,...,...,...
40379,85066470790,"('Microbiology', 'Immunology', 'Insect Science...",arterivirus nsp4 antagonizes interferon beta p...,¬© 2019 American Society for Microbiology. All...
40380,85065035348,"('Drug Discovery',)",architectures and mechanical properties of dru...,¬© 2019 Bentham Science Publishers.Background:...
40381,85073540376,"('Parasitology', 'Microbiology', 'Immunology',...",respiratory syncytial virus nonstructural prot...,¬© 2019 Sedeyn et al. This is an open access a...
40382,85070056122,"('Microbiology', 'Immunology', 'Insect Science...",porcine reproductive and respiratory syndrome ...,Copyright ¬© 2019 American Society for Microbi...


In [6]:
len(df_nonull)

13906

Steps: 
1. Determine top 30 most commonly occurring subject labels, and filter down so they aren't redundant. 
2. Filter dataframe to papers that only contain at least one of these subject labels. 
3. Reduce each item to one label - pick the most common label for each paper. 

## Identifying top 30 subject labels

In [7]:
subjects = df_nonull['subject_areas']
subjects = pd.DataFrame(subjects)
subjects.head()

Unnamed: 0,subject_areas
0,"('Biomolecules', 'Quantitative Methods')"
1,"('Signal Processing', 'Human-Computer Interact..."
2,"('Populations and Evolution', 'Physics and Soc..."
3,"('Populations and Evolution', 'Quantitative Me..."
4,"('Social and Information Networks', 'Cryptogra..."


In [8]:
subjects = pd.concat([subjects, subjects['subject_areas'].str.split("', '", expand=True)], axis=1)
subjects.head()

Unnamed: 0,subject_areas,0,1,2,3,4,5,6,7
0,"('Biomolecules', 'Quantitative Methods')",('Biomolecules,Quantitative Methods'),,,,,,
1,"('Signal Processing', 'Human-Computer Interact...",('Signal Processing,Human-Computer Interaction,Machine Learning,Machine Learning'),,,,
2,"('Populations and Evolution', 'Physics and Soc...",('Populations and Evolution,Physics and Society'),,,,,,
3,"('Populations and Evolution', 'Quantitative Me...",('Populations and Evolution,Quantitative Methods'),,,,,,
4,"('Social and Information Networks', 'Cryptogra...",('Social and Information Networks,Cryptography and Security'),,,,,,


In [9]:
# Loop through and remove special characters
for i in range(8): 
    subjects[i] = subjects[i].str.replace('(','')
    subjects[i] = subjects[i].str.replace(')','')
    subjects[i] = subjects[i].str.replace("'",'')
    subjects[i] = subjects[i].str.replace(",",'')
subjects.head()

Unnamed: 0,subject_areas,0,1,2,3,4,5,6,7
0,"('Biomolecules', 'Quantitative Methods')",Biomolecules,Quantitative Methods,,,,,,
1,"('Signal Processing', 'Human-Computer Interact...",Signal Processing,Human-Computer Interaction,Machine Learning,Machine Learning,,,,
2,"('Populations and Evolution', 'Physics and Soc...",Populations and Evolution,Physics and Society,,,,,,
3,"('Populations and Evolution', 'Quantitative Me...",Populations and Evolution,Quantitative Methods,,,,,,
4,"('Social and Information Networks', 'Cryptogra...",Social and Information Networks,Cryptography and Security,,,,,,


In [13]:
unique_subjects = pd.unique(subjects.values.ravel('K'))

In [14]:
len(unique_subjects)

500

In [15]:
subjects['subjects_list'] = subjects[subjects.columns[:]].apply(
    lambda x: ' '.join(x.dropna().astype(str)),
    axis=1)

In [16]:
subjects.head()

Unnamed: 0,0,1,2,3,4,5,6,7,subjects_list
0,Biomolecules,Quantitative Methods,,,,,,,Biomolecules Quantitative Methods
1,Signal Processing,Human-Computer Interaction,Machine Learning,Machine Learning,,,,,Signal Processing Human-Computer Interaction M...
2,Populations and Evolution,Physics and Society,,,,,,,Populations and Evolution Physics and Society
3,Populations and Evolution,Quantitative Methods,,,,,,,Populations and Evolution Quantitative Methods
4,Social and Information Networks,Cryptography and Security,,,,,,,Social and Information Networks Cryptography a...


In [17]:
sep_subjects = subjects.iloc[:,0:8]
print(sep_subjects.shape)
sep_subjects.tail()

(13906, 8)


Unnamed: 0,0,1,2,3,4,5,6,7
40379,Microbiology,Immunology,Insect Science,Virology,,,,
40380,Drug Discovery,,,,,,,
40381,Parasitology,Microbiology,Immunology,Molecular Biology,Genetics,Virology,,
40382,Microbiology,Immunology,Insect Science,Virology,,,,
40383,Microbiology,Microbiology medical,,,,,,


Subject_count shows how many times each subject is listed for a paper. (Including when papers have multiple labels)

In [31]:
for i in range(len(unique_subjects)):
    if isinstance(unique_subjects[i], str) == False:
        print(i,unique_subjects[i] )

413 None


In [32]:
# Removing 413 because None
unique_subjects = np.delete(unique_subjects,413,0)
len(unique_subjects)
#unique_subjects = unique_subjects.delete([404])
#del unique_subjects[404]

499

In [33]:
subject_count = pd.DataFrame(columns=['subject','count'])
subject_count['subject'] = unique_subjects
subject_count.head()

Unnamed: 0,subject,count
0,Biomolecules,
1,Signal Processing,
2,Populations and Evolution,
3,Social and Information Networks,
4,Computation and Language,


Remove all the redundant/duplicate terms in the top 30 -"Medicine all", 

In [35]:
# DONE: removed Medicine all, 78
# subject_count.loc[subject_count['subject']=="Medicine all"] -> 78
subject_count = subject_count.drop(78,0)
subject_count = subject_count.reset_index(drop=True)
subject_count

Unnamed: 0,subject,count
0,Biomolecules,
1,Signal Processing,
2,Populations and Evolution,
3,Social and Information Networks,
4,Computation and Language,
...,...,...
493,Control and Optimization,
494,Chemical Health and Safety,
495,Colloid and Surface Chemistry,
496,LPN and LVN,


In [36]:
subject_count['count']=""
for i in range(len(subject_count)):
    current_subject = subject_count['subject'][i]
    subject_count['count'][i] = subjects.subjects_list.str.count(current_subject).sum()

In [46]:
subject_count = subject_count.sort_values(by=['count'],ascending=False)
subject_count = subject_count.reset_index(drop=True)
subject_count.head()

Unnamed: 0,subject,count
0,Infectious Diseases,1867
1,Microbiology,1462
2,Immunology,1201
3,Virology,930
4,Public Health Environmental and Occupational H...,909


In [40]:
# new column in subjects, indicating if at least one of the top 30 is in the row
#sep_subjects['single_label']=""

# loop through each row.
for index,rows in sep_subjects.iterrows(): 
    #print(index)
    # check each subject from common to least common
    for i in subject_count['subject']: 
         # if the subject is found, set it to be the chosen label, then exit the for loop. otherwise continue searching.
        if np.sum(sep_subjects.loc[index,:].values == i) > 0: 
            sep_subjects.loc[index,'single_label'] = i
            break

In [41]:
# THIS IS A TESTING CELL 
print(len(sep_subjects[sep_subjects['single_label']==""]), "blank label rows removed")
print(13906-len(sep_subjects[sep_subjects['single_label']==""]), "remaining rows")
#sep_subjects.head()
#print(len(sep_subjects))

0 blank label rows removed
13906 remaining rows


In [47]:
sep_subjects['single_label'].value_counts().head()

Infectious Diseases                                    1866
Public Health Environmental and Occupational Health     639
Surgery                                                 568
Psychiatry and Mental Health                            375
Radiology Nuclear Medicine and Imaging                  368
Name: single_label, dtype: int64

In [48]:
#sep_subjects.head(n=50)
sep_subjects.shape
#sep_subjects.loc[0,'single_label'] #= 'test'
#np.sum(sep_subjects.iloc[0,:].values == 'Biomolecules')

(13906, 9)

In [49]:
combined_nonull = df_nonull.join(sep_subjects['single_label'])
combined_nonull

Unnamed: 0,id,subject_areas,title,abstract,single_label
0,2005.13653,"('Biomolecules', 'Quantitative Methods')",unveiling the molecular mechanism of sars-cov-...,"Currently, there is no effective antiviral dru...",Quantitative Methods
1,2005.13523,"('Signal Processing', 'Human-Computer Interact...",emotion-robust eeg classification for motor im...,Developments in Brain Computer Interfaces (BCI...,Machine Learning
2,2005.13519,"('Populations and Evolution', 'Physics and Soc...",estimates of the proportion of sars-cov-2 infe...,In this paper a Bayesian SEIR model is studied...,Populations and Evolution
3,2005.13516,"('Populations and Evolution', 'Quantitative Me...",a mathematical epidemic model using genetic fi...,A compartmental epidemic model based on geneti...,Populations and Evolution
4,2005.13466,"('Social and Information Networks', 'Cryptogra...",on the detection of disinformation campaign ac...,Online manipulation of information has become ...,Social and Information Networks
...,...,...,...,...,...
40379,85066470790,"('Microbiology', 'Immunology', 'Insect Science...",arterivirus nsp4 antagonizes interferon beta p...,¬© 2019 American Society for Microbiology. All...,Microbiology
40380,85065035348,"('Drug Discovery',)",architectures and mechanical properties of dru...,¬© 2019 Bentham Science Publishers.Background:...,Drug Discovery
40381,85073540376,"('Parasitology', 'Microbiology', 'Immunology',...",respiratory syncytial virus nonstructural prot...,¬© 2019 Sedeyn et al. This is an open access a...,Microbiology
40382,85070056122,"('Microbiology', 'Immunology', 'Insect Science...",porcine reproductive and respiratory syndrome ...,Copyright ¬© 2019 American Society for Microbi...,Microbiology


In [51]:
# REMOVING THE BLANK LABEL ROWS -> SHOULD BE 12810 REMAINING
combined_nonull = combined_nonull.loc[combined_nonull['single_label']!=None]
combined_nonull

Unnamed: 0,id,subject_areas,title,abstract,single_label
0,2005.13653,"('Biomolecules', 'Quantitative Methods')",unveiling the molecular mechanism of sars-cov-...,"Currently, there is no effective antiviral dru...",Quantitative Methods
1,2005.13523,"('Signal Processing', 'Human-Computer Interact...",emotion-robust eeg classification for motor im...,Developments in Brain Computer Interfaces (BCI...,Machine Learning
2,2005.13519,"('Populations and Evolution', 'Physics and Soc...",estimates of the proportion of sars-cov-2 infe...,In this paper a Bayesian SEIR model is studied...,Populations and Evolution
3,2005.13516,"('Populations and Evolution', 'Quantitative Me...",a mathematical epidemic model using genetic fi...,A compartmental epidemic model based on geneti...,Populations and Evolution
4,2005.13466,"('Social and Information Networks', 'Cryptogra...",on the detection of disinformation campaign ac...,Online manipulation of information has become ...,Social and Information Networks
...,...,...,...,...,...
40379,85066470790,"('Microbiology', 'Immunology', 'Insect Science...",arterivirus nsp4 antagonizes interferon beta p...,¬© 2019 American Society for Microbiology. All...,Microbiology
40380,85065035348,"('Drug Discovery',)",architectures and mechanical properties of dru...,¬© 2019 Bentham Science Publishers.Background:...,Drug Discovery
40381,85073540376,"('Parasitology', 'Microbiology', 'Immunology',...",respiratory syncytial virus nonstructural prot...,¬© 2019 Sedeyn et al. This is an open access a...,Microbiology
40382,85070056122,"('Microbiology', 'Immunology', 'Insect Science...",porcine reproductive and respiratory syndrome ...,Copyright ¬© 2019 American Society for Microbi...,Microbiology


In [52]:
label_counts = pd.DataFrame(combined_nonull['single_label'].value_counts())
label_counts = label_counts.head(n=35)
label_counts

Unnamed: 0,single_label
Infectious Diseases,1866
Public Health Environmental and Occupational Health,639
Surgery,568
Psychiatry and Mental Health,375
Radiology Nuclear Medicine and Imaging,368
Biochemistry Genetics and Molecular Biology all,359
Cardiology and Cardiovascular Medicine,332
Populations and Evolution,330
Immunology,311
Biochemistry,260


Narrowing combined_nonull down into the top 35 classes. 

In [53]:
# New col top_35_label
combined_nonull_35 = combined_nonull
combined_nonull_35['top_35_label'] = ""

for index,row in combined_nonull_35.iterrows(): 
    if combined_nonull_35.loc[index,'single_label'] in label_counts.index.to_list():
        combined_nonull_35.loc[index,'top_35_label'] = combined_nonull_35.loc[index,'single_label']

combined_nonull_35
# Remove all rows with ""  top_35_label

Unnamed: 0,id,subject_areas,title,abstract,single_label,top_35_label
0,2005.13653,"('Biomolecules', 'Quantitative Methods')",unveiling the molecular mechanism of sars-cov-...,"Currently, there is no effective antiviral dru...",Quantitative Methods,
1,2005.13523,"('Signal Processing', 'Human-Computer Interact...",emotion-robust eeg classification for motor im...,Developments in Brain Computer Interfaces (BCI...,Machine Learning,Machine Learning
2,2005.13519,"('Populations and Evolution', 'Physics and Soc...",estimates of the proportion of sars-cov-2 infe...,In this paper a Bayesian SEIR model is studied...,Populations and Evolution,Populations and Evolution
3,2005.13516,"('Populations and Evolution', 'Quantitative Me...",a mathematical epidemic model using genetic fi...,A compartmental epidemic model based on geneti...,Populations and Evolution,Populations and Evolution
4,2005.13466,"('Social and Information Networks', 'Cryptogra...",on the detection of disinformation campaign ac...,Online manipulation of information has become ...,Social and Information Networks,
...,...,...,...,...,...,...
40379,85066470790,"('Microbiology', 'Immunology', 'Insect Science...",arterivirus nsp4 antagonizes interferon beta p...,¬© 2019 American Society for Microbiology. All...,Microbiology,Microbiology
40380,85065035348,"('Drug Discovery',)",architectures and mechanical properties of dru...,¬© 2019 Bentham Science Publishers.Background:...,Drug Discovery,
40381,85073540376,"('Parasitology', 'Microbiology', 'Immunology',...",respiratory syncytial virus nonstructural prot...,¬© 2019 Sedeyn et al. This is an open access a...,Microbiology,Microbiology
40382,85070056122,"('Microbiology', 'Immunology', 'Insect Science...",porcine reproductive and respiratory syndrome ...,Copyright ¬© 2019 American Society for Microbi...,Microbiology,Microbiology


In [54]:
combined_nonull_35 = combined_nonull_35.drop(combined_nonull_35[combined_nonull_35.top_35_label == ""].index)
combined_nonull_35

Unnamed: 0,id,subject_areas,title,abstract,single_label,top_35_label
1,2005.13523,"('Signal Processing', 'Human-Computer Interact...",emotion-robust eeg classification for motor im...,Developments in Brain Computer Interfaces (BCI...,Machine Learning,Machine Learning
2,2005.13519,"('Populations and Evolution', 'Physics and Soc...",estimates of the proportion of sars-cov-2 infe...,In this paper a Bayesian SEIR model is studied...,Populations and Evolution,Populations and Evolution
3,2005.13516,"('Populations and Evolution', 'Quantitative Me...",a mathematical epidemic model using genetic fi...,A compartmental epidemic model based on geneti...,Populations and Evolution,Populations and Evolution
8,2005.13285,"('Quantitative Methods', 'Machine Learning', '...",paccmannrl on sars-cov-2: designing antiviral ...,With the fast development of COVID-19 into a g...,Machine Learning,Machine Learning
9,2005.13282,"('Populations and Evolution', 'Social and Info...",simulation of the covid-19 pandemic on the soc...,In the article a virus transmission model is c...,Populations and Evolution,Populations and Evolution
...,...,...,...,...,...,...
40378,85069597990,"('Biochemistry, Genetics and Molecular Biology...",effectiveness of zinc supplementation on diarr...,¬© 2019 Feldmann et al. This is an open access...,Biochemistry Genetics and Molecular Biology all,Biochemistry Genetics and Molecular Biology all
40379,85066470790,"('Microbiology', 'Immunology', 'Insect Science...",arterivirus nsp4 antagonizes interferon beta p...,¬© 2019 American Society for Microbiology. All...,Microbiology,Microbiology
40381,85073540376,"('Parasitology', 'Microbiology', 'Immunology',...",respiratory syncytial virus nonstructural prot...,¬© 2019 Sedeyn et al. This is an open access a...,Microbiology,Microbiology
40382,85070056122,"('Microbiology', 'Immunology', 'Insect Science...",porcine reproductive and respiratory syndrome ...,Copyright ¬© 2019 American Society for Microbi...,Microbiology,Microbiology


In [57]:
combined_nonull_35['top_35_label'].value_counts()

Infectious Diseases                                    1866
Public Health Environmental and Occupational Health     639
Surgery                                                 568
Biochemistry Genetics and Molecular Biology all         567
Psychiatry and Mental Health                            375
Radiology Nuclear Medicine and Imaging                  368
Cardiology and Cardiovascular Medicine                  332
Populations and Evolution                               330
Immunology                                              311
Biochemistry                                            260
Pharmacology                                            220
Microbiology                                            202
Otorhinolaryngology                                     185
Education                                               185
Geography Planning and Development                      182
Pediatrics Perinatology and Child Health                182
Multidisciplinary                       

In [62]:
# Regrouping certain categories
for index,row in combined_nonull_35.iterrows(): 
    if combined_nonull_35.loc[index,'single_label']=="Pharmacology medical":
        combined_nonull_35.loc[index,'top_35_label'] = "Pharmacology"

combined_nonull_35['top_35_label'].value_counts().shape # should be 31 labels left!

(31,)

Next steps: 
1. Sort for_export by class label (alpha)
2. Loop through each top_35_label index
3. Set 'test_limit' to be round(0.8 * count of label)
Split each class into 80% test, 20% train 
2. 
4. Exporting the txt files!

Notes:
8608 in the top 30 label classes 
7976 in the top 25 classes
9187 in 31 (some combned from 35) classes

In [67]:
combined_nonull_35['title_abstract'] = combined_nonull_35['title'] + '. ' + combined_nonull_35['abstract']
#combined_nonull_35.loc[1,'title_abstract']

In [69]:
# Sorting labels and counts alphabetically
top_35_labels = pd.DataFrame(combined_nonull_35['top_35_label'].value_counts())
top_35_labels = top_35_labels.sort_index()

In [73]:
# for i in top_35_labels.index:
#     print(math.ceil(top_35_labels.loc[i].values[0]*0.8))
top_35_labels.to_csv('top_35_labels_count.csv')

In [71]:
# sorting the df alphabetically by label
for_export = combined_nonull_35[['id','title_abstract','top_35_label']]
for_export = for_export.sort_values('top_35_label')

In [74]:
for_export['test_train'] = ""
for_export.to_csv('id_abstract_label_testtrain.csv')

In [96]:
# Setting 80% of each class to be train. 
for i in range(len(top_35_labels.index)):
    current_label = top_35_labels.index[i]
    train_limit = math.ceil(top_35_labels.loc[current_label].values[0]*0.8)
    n_train = 0
    temp_df = for_export.loc[for_export['top_35_label']==current_label]
    
    for j in temp_df.index: 
        if n_train < train_limit: 
            for_export.loc[j,'test_train'] = 'train'
            n_train += 1
        else:
            for_export.loc[j,'test_train'] = 'test'

In [108]:
for_export
# path | test/train | label
# path = 'data/covid_19_production/' + test_train + '/' + id

Unnamed: 0,id,title_abstract,top_35_label,test_train
17184,85083159584,initial clinical impressions of the critical c...,Anesthesiology and Pain Medicine,train
33483,85085700154,the aerosol box for intubation in coronavirus ...,Anesthesiology and Pain Medicine,train
21990,85086169697,telemedicine for chronic pain management durin...,Anesthesiology and Pain Medicine,train
26905,85084617413,coronavirus disease 2019 (covid-19): two case ...,Anesthesiology and Pain Medicine,train
25670,85085755714,re-emergence of tiva in covid times. ¬© 2020 I...,Anesthesiology and Pain Medicine,train
...,...,...,...,...
18625,85086928118,rapid differentiation of pedv wild-type strain...,Veterinary all,test
40167,85077224435,evaluation of serological assays available in ...,Veterinary all,test
28598,85082507850,looking after yourself and others. British Vet...,Veterinary all,test
39903,85066491992,characterization of antiviral t cell responses...,Veterinary all,test


In [109]:
file_details = for_export[['id','test_train','top_35_label']]
file_details

Unnamed: 0,id,test_train,top_35_label
17184,85083159584,train,Anesthesiology and Pain Medicine
33483,85085700154,train,Anesthesiology and Pain Medicine
21990,85086169697,train,Anesthesiology and Pain Medicine
26905,85084617413,train,Anesthesiology and Pain Medicine
25670,85085755714,train,Anesthesiology and Pain Medicine
...,...,...,...
18625,85086928118,test,Veterinary all
40167,85077224435,test,Veterinary all
28598,85082507850,test,Veterinary all
39903,85066491992,test,Veterinary all


In [119]:
for i in file_details.index: 
    file_details.loc[i,'path'] = 'data/covid_19_production/' + file_details.loc[i,'test_train'] + '/' + file_details.loc[i,'id']
file_details

Unnamed: 0,path,id,test_train,top_35_label
17184,data/covid_19_production/train/85083159584,85083159584,train,Anesthesiology and Pain Medicine
33483,data/covid_19_production/train/85085700154,85085700154,train,Anesthesiology and Pain Medicine
21990,data/covid_19_production/train/85086169697,85086169697,train,Anesthesiology and Pain Medicine
26905,data/covid_19_production/train/85084617413,85084617413,train,Anesthesiology and Pain Medicine
25670,data/covid_19_production/train/85085755714,85085755714,train,Anesthesiology and Pain Medicine
...,...,...,...,...
18625,data/covid_19_production/test/85086928118,85086928118,test,Veterinary all
40167,data/covid_19_production/test/85077224435,85077224435,test,Veterinary all
28598,data/covid_19_production/test/85082507850,85082507850,test,Veterinary all
39903,data/covid_19_production/test/85066491992,85066491992,test,Veterinary all


In [121]:
file_details = file_details[['path','test_train','top_35_label']]
file_details

Unnamed: 0,path,test_train,top_35_label
17184,data/covid_19_production/train/85083159584,train,Anesthesiology and Pain Medicine
33483,data/covid_19_production/train/85085700154,train,Anesthesiology and Pain Medicine
21990,data/covid_19_production/train/85086169697,train,Anesthesiology and Pain Medicine
26905,data/covid_19_production/train/85084617413,train,Anesthesiology and Pain Medicine
25670,data/covid_19_production/train/85085755714,train,Anesthesiology and Pain Medicine
...,...,...,...
18625,data/covid_19_production/test/85086928118,test,Veterinary all
40167,data/covid_19_production/test/85077224435,test,Veterinary all
28598,data/covid_19_production/test/85082507850,test,Veterinary all
39903,data/covid_19_production/test/85066491992,test,Veterinary all


In [122]:
file_details.to_csv('covid_19_production.txt',sep='\t',header=False, index=False)

In [126]:
txt_export = for_export
txt_export

Unnamed: 0,id,title_abstract,top_35_label,test_train
17184,85083159584,initial clinical impressions of the critical c...,Anesthesiology and Pain Medicine,train
33483,85085700154,the aerosol box for intubation in coronavirus ...,Anesthesiology and Pain Medicine,train
21990,85086169697,telemedicine for chronic pain management durin...,Anesthesiology and Pain Medicine,train
26905,85084617413,coronavirus disease 2019 (covid-19): two case ...,Anesthesiology and Pain Medicine,train
25670,85085755714,re-emergence of tiva in covid times. ¬© 2020 I...,Anesthesiology and Pain Medicine,train
...,...,...,...,...
18625,85086928118,rapid differentiation of pedv wild-type strain...,Veterinary all,test
40167,85077224435,evaluation of serological assays available in ...,Veterinary all,test
28598,85082507850,looking after yourself and others. British Vet...,Veterinary all,test
39903,85066491992,characterization of antiviral t cell responses...,Veterinary all,test


In [131]:
for i in txt_export.index: 
    txt_data = txt_export.loc[i,'title_abstract']
    path = txt_export.loc[i,'test_train'] + '/' + txt_export.loc[i,'id'] + '.txt'
    file = open(path,"w")
    file.write(txt_data)
    file.close()