In [1]:
#includes
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import statistics
import glob
import re
from prettytable import PrettyTable
import seaborn as sns


from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

from ipynb.fs.full. Stevan_part import *

# Step 1 - Data Preprocessing

## Gather RNA counts

In [2]:
#Dataframe lines: customers, columns: genes
class SampleMerger:
    def __init__(self):          #constructor
        self.sample_files = None #files names
        self.df = None           #dataframe
    
    def __str__(self):           # print with str format
        if self.df is None:
            self.merge_samples()
        return str(self.df)
    
    #Getters
    def get_sample_files(self): 
        if self.sample_files is None:
            self.set_sample_files()
        return self.sample_files
    
    def get_df(self):            
        if self.df is None:
            self.merge_samples()
        return self.df
    
    def get_sub_dataframe(self, columns):
        if self.df is None:
            self.merge_samples()
        sub_df = self.df.loc[:, columns]
        return sub_df 
    
    #Setters
    def set_sample_files(self):  #load files names
        txt_files = []
        folder_path = '../Data/'

        for file_name in os.listdir(folder_path):
            if file_name.endswith('.txt'):
                txt_files.append(os.path.join(folder_path, file_name))
        self.sample_files = txt_files
    
    def merge_samples(self): #Create the dataframe
        count = 0
        i = 0
        j = -1
        gene_list = []
        nb_gene_list = [ [ None for y in range( len(self.sample_files) ) ]for x in range( 28953 ) ]
        
        for file in self.sample_files:
            j += 1
            with open(file) as fasta_file:
                for line in fasta_file:
                    line = line.strip()
                    if not(line.startswith("g")):
                        gene = line.split("\t")[0]
                        nb_gene = line.split("\t")[1]
                        if (count == 0):
                            gene_list.append(gene)
                        nb_gene_list[i][j] = int(nb_gene)
                        i += 1
            count = 1
            i = 0
        dictionnary = dict()
        for i in range (0, 28953):
            dictionnary[gene_list[i]] = nb_gene_list[i]
        self.df = pd.DataFrame(dictionnary)
        
        names = []
        for item in self.sample_files:
            name = item.split('/')[-1].split('.')[0]
            names.append(name)
        self.df.index = names
        
    #Step 2
    def get_mean_dict(self):
        """
        df: dataframe
        return the mean for each gene as a dictionary {key: gene; value: mean}
        """
        mean_dict = dict()
        for col in self.df.columns:
            mean_dict[col] = self.df[col].mean()
        return mean_dict

    def get_median_dict(self):
        """
        df: dataframe
        return the median for each gene as a dictionary {key: gene; value: median}
        """
        median_dict = dict()
        for col in self.df.columns:
            median_dict[col] = self.df[col].median()
        return median_dict

    def get_stdev_dict(self):
        """
        df: dataframe
        return the standard deviation for each gene as a dictionary {key: gene; value: standard deviation}
        """
        stdev_dict = dict()
        for col in self.df.columns:
            stdev_dict[col] = self.df[col].std()
        return stdev_dict
    
    def test_data(self): #tests
        if self.df is None:
            self.merge_samples()
        assert self.df.isnull().sum().sum() == 0, "There are missing values in the merged dataframe."
        assert isinstance(self.df, pd.DataFrame), "The merged object is not a dataframe."
        return True

In [3]:
Frame = SampleMerger()
Frame.set_sample_files()
Frame.merge_samples()


In [4]:
Frame.test_data()

True

In [5]:
Frame.get_df()

Unnamed: 0,"""A1BG""","""A1BG-AS1""","""A1CF""","""A2M""","""A2M-AS1""","""A2ML1""","""A2MP1""","""A3GALT2""","""A4GALT""","""A4GNT""",...,X6B_LINE:CR1:LINE,X7A_LINE:CR1:LINE,X7B_LINE:CR1:LINE,X7C_LINE:CR1:LINE,X7D_LINE:CR1:LINE,X8_LINE:CR1:LINE,X9_LINE:L1:LINE,Zaphod2:hAT-Tip100:DNA,Zaphod3:hAT-Tip100:DNA,Zaphod:hAT-Tip100:DNA
GSM3533311_CGND-HRA-00350_counts,27,89,0,1627,85,162,3,0,8,0,...,727,1414,1479,724,89,421,84,410,1199,1462
GSM3533394_CGND-HRA-01236_counts,62,175,0,3614,94,128,3,3,143,0,...,727,1863,1723,966,79,527,94,432,1679,1919
GSM3533310_CGND-HRA-00338_counts,29,88,0,1171,86,105,3,2,37,1,...,809,1790,1590,764,93,435,125,350,1579,1871
GSM3533352_CGND-HRA-00546_counts,16,133,0,1366,84,200,4,3,31,1,...,851,1738,1632,903,82,439,94,423,1704,1780
GSM3533279_CGND-HRA-00226_counts,38,83,0,2391,121,143,6,2,15,1,...,1095,2267,2122,1078,128,555,106,522,2087,2432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM3533344_CGND-HRA-00436_counts,17,43,4,900,73,113,1,2,6,1,...,727,1561,1372,702,83,350,99,351,1256,1479
GSM3533334_CGND-HRA-00409_counts,48,127,0,1635,105,226,1,1,24,0,...,465,1006,1087,519,48,329,86,277,789,905
GSM3533391_CGND-HRA-01219_counts,37,118,0,1245,104,168,1,1,21,1,...,847,1822,1625,903,96,472,120,415,1556,1768
GSM3533263_CGND-HRA-00209_counts,30,63,0,1024,34,56,1,0,9,0,...,288,640,644,352,30,179,46,136,614,723


## Gather sample annotations

In [6]:
class SampleAnnotation:
    def __init__(self):          #constructor
        self.df = None           #dataframe
        
    def get_df(self):
        return self.df  
    
    def collect_annotations(self):
        # Reading the XML file
        url = '{http://www.ncbi.nlm.nih.gov/geo/info/MINiML}'
        tree = ET.parse('../Data/GSE124439_family.xml')
        root = tree.getroot()
        #samples = root.findall(url+"Sample")
        #print(samples)

        # List to store annotation data

        data_list = []

        # Loop to loop through samples and extract other annotations

        for sample in root.findall(url+'Sample'):

            #Sample ID retrieval

            #print(sample)
            sample_id = sample.get('iid')


            # Recovery of annotations

            annotations = {}
            for i in sample.findall(url+'Channel'):
                #print(i)
                for charac in i.findall(url+'Characteristics'):


                    if charac.get('tag') in ['cns subregion', 'subject id', 'sample group']:

                        annotations[charac.get('tag')] = charac.text.strip()

            # Add data to list

            data = {'sample_id': sample_id, **annotations}
            data_list.append(data)

        #print(ET.tostring(root, encoding='unicode'))    

        # Creating the DataFrame from the data

        self.df = pd.DataFrame(data_list)  
        

    def __str__(self):
        if self.df is None:
            self.collect_annotations()
            
        return str(self.df)
    
    def get_sub_dataframe(self, columns):
        if self.df is None:
            self.merge_samples()
        sub_df = self.df.loc[:, columns]
        return sub_df 

# Step 2 - Descriptive analysis

## RNA counts description:

For each gene, compute the mean, the median and the standard deviation

In [8]:
mean_dict = Frame.get_mean_dict()
median_dict = Frame.get_median_dict()
stdev_dict = Frame.get_stdev_dict()

In [19]:
#USELESS, DO NOT WORK
"""
plt.figure(figsize=(10,6))
data = pd.DataFrame(list(mean_dict.items()))
sns.barplot(data = data, x="label", y="count", palette="coolwarm" )

plt.xticks(rotation= 90)
plt.show()
"""

[0, 1]


ValueError: Could not interpret input 'label'

<Figure size 1000x600 with 0 Axes>