# XML parsing 
In this file we parse the raw XML data files.

1. First we loop through xml files and get a very long list of dictionaries
2. We turn the list of dictionaries into a pandas DataFrame
3. Then we get the data for the sample, stored in subdictionaries in the 'Samples' field of the main dataframe
4. We add the number of samples of each record to the records dataframe
5. The output is saved as two dataframes, pickled to .pkl files in the output folder (`../data/records_samples`):
    - records_df --> records.pkl
    - samples_df --> samples.pkl

In [1]:
from Bio import Entrez
import pandas as pd
import glob
import os

# Parsing raw data
- First we loop through xml files and get a very long list of dictionaries.
- We turn the list of dictionaries into a pandas data frame.
- Then we go through each row of the larger dataframe and get the sample data from each row (takes a while).

In [None]:
Entrez.email = "A.N.Other@example.com" # Always tell NCBI who you are

# Input/output locations, relative to the notebook's working directory.
data_path = '../data/raw' # rename with template
output_data_path = '../data/records_samples' # rename with template
file_base_name = "all_gse_series_homo_sapiens_part"
output_file_rec = os.path.join(output_data_path, 'records.pkl')
output_file_sam = os.path.join(output_data_path, 'samples.pkl')

# Every raw file whose name starts with the base name, in sorted order.
raw_files = sorted(glob.glob(os.path.join(data_path, file_base_name + '*')))

# Create the output folder together with any missing parent folders;
# exist_ok avoids the separate "does it exist?" check.
os.makedirs(output_data_path, exist_ok=True)

# Collect the records of all raw XML files into one flat list of dictionaries.
record_list = []
for ifile in raw_files:
    print('Parsing ',ifile)
    # Entrez.parse expects a handle opened in binary mode. The parser is lazy,
    # so the records are consumed inside the `with` block, which also makes
    # sure the file is closed even if parsing fails.
    with open(ifile, 'rb') as handle:
        record_list.extend(Entrez.parse(handle))

# One row per record; the dictionary keys become the columns.
records_df = pd.DataFrame(record_list)

# Build one long table of samples: for every record, expand its 'Samples'
# entry into a small dataframe and tag each row with the record Id and the
# number of samples of that record.
sample_frames = []
record_samples = records_df.get('Samples', [])
record_ids = records_df.get('Id', [])
for i, (samples, record_id) in enumerate(zip(record_samples, record_ids)):
    samples_aux = pd.DataFrame(samples)
    samples_aux['Id'] = record_id
    samples_aux['nsamples'] = len(samples_aux)
    sample_frames.append(samples_aux)
    if i % 5000 == 0:
        print('Sample iteration:')
        print(i)

# Concatenate once at the end: appending to a dataframe inside the loop copies
# the whole table on every iteration, and DataFrame.append no longer exists in
# pandas 2.0.
if sample_frames:
    samples_df = pd.concat(sample_frames)
else:
    # No records at all: keep the key columns so set_index below still works.
    samples_df = pd.DataFrame(columns=['Id', 'nsamples'])

samples_df = samples_df.set_index(['Id','nsamples'])
print('Saving samples to ', output_file_sam)
samples_df.to_pickle(output_file_sam)

# take sample count from samples and put in records
# 'Id' and 'nsamples' are the index levels of samples_df; one row per distinct
# (Id, nsamples) pair is all that is needed. Dropping duplicates avoids
# averaging the non-numeric sample columns (which fails on recent pandas) and
# leaves samples_df itself untouched.
n_samples_df = samples_df.reset_index()[['Id','nsamples']].drop_duplicates()

# Right merge keeps every record, including those without any sample rows
# (their nsamples is left missing).
records_df = pd.merge(n_samples_df, records_df, on='Id', how='right')

print('')
print('Saving records to ', output_file_rec)
print('')
records_df.to_pickle(output_file_rec)

print('Done.')