In [4]:
import pandas as pd
import random

<h3>Pandas Series</h3>
<p>We can build a Pandas Series by using other data structures, such as lists:</p>

In [12]:
value1 = [] #Created an empty list
for x in range(5):
    value1.append(random.randint(-5,5)) #Appended the random values to the list

s1 = pd.Series(value1)
s2 = pd.Series(value1, index= ["a", "b", "c", "d", "e"])

In [13]:
s2

a    0
b   -2
c    2
d    1
e    0
dtype: int64

<p>Or Dictionaries</p>

In [15]:
bacterialData = {"Escherichia coli K-12":4.6,
                "Streptomyces coelicolor A3":8.7,
                "Bacillus subtilis 168":4.2,
                "Pseudomonas aeruginosa PAO1":6.3,
                "Mycobacterium tuberculosis H37Rv":4.4}

bacteriaSeries = pd.Series(bacterialData)
bacteriaSeries

Escherichia coli K-12               4.6
Streptomyces coelicolor A3          8.7
Bacillus subtilis 168               4.2
Pseudomonas aeruginosa PAO1         6.3
Mycobacterium tuberculosis H37Rv    4.4
dtype: float64

In [16]:
bacteriaSeries >6

Escherichia coli K-12               False
Streptomyces coelicolor A3           True
Bacillus subtilis 168               False
Pseudomonas aeruginosa PAO1          True
Mycobacterium tuberculosis H37Rv    False
dtype: bool

In [17]:
bacteriaSeries[bacteriaSeries>6]

Streptomyces coelicolor A3     8.7
Pseudomonas aeruginosa PAO1    6.3
dtype: float64

<p>It's easy to add new elements to a series:</p>

In [18]:
bacteriaSeries["Staphylococcus aureus N315"] = 2.8
bacteriaSeries["Helicobacter pylori 26695"] = 1.7
bacteriaSeries

Escherichia coli K-12               4.6
Streptomyces coelicolor A3          8.7
Bacillus subtilis 168               4.2
Pseudomonas aeruginosa PAO1         6.3
Mycobacterium tuberculosis H37Rv    4.4
Staphylococcus aureus N315          2.8
Helicobacter pylori 26695           1.7
dtype: float64

<p>Or to modify it:</p>

In [19]:
bacteriaSeries["Streptomyces coelicolor A3"] = 7
bacteriaSeries

Escherichia coli K-12               4.6
Streptomyces coelicolor A3          7.0
Bacillus subtilis 168               4.2
Pseudomonas aeruginosa PAO1         6.3
Mycobacterium tuberculosis H37Rv    4.4
Staphylococcus aureus N315          2.8
Helicobacter pylori 26695           1.7
dtype: float64

<p>or:</p>

In [23]:
bacteriaSeries[bacteriaSeries < 3] = 3.5
bacteriaSeries

Escherichia coli K-12               4.6
Streptomyces coelicolor A3          7.0
Bacillus subtilis 168               4.2
Pseudomonas aeruginosa PAO1         6.3
Mycobacterium tuberculosis H37Rv    4.4
Staphylococcus aureus N315          3.5
Helicobacter pylori 26695           3.5
dtype: float64

<div style="padding-top=10px; padding-bottom=10px">
    <h3>DataFrames</h3>
    <p><strong>How to create a dataframe:</p>
</div>

In [25]:
bacterialData = {"Species": ["Escherichia coli K-12","Streptomyces coelicolor A3",
                "Bacillus subtilis 168","Pseudomonas aeruginosa PAO1",
                "Mycobacterium tuberculosis H37Rv"],
                "Genome_size":[4.6,8.7,4.2,6.3,4.4],
                "TranscriptonFactors":[314,965,158,505,189]}

bacterialDataFrame = pd.DataFrame(bacterialData)
bacterialDataFrame

Unnamed: 0,Species,Genome_size,TranscriptonFactors
0,Escherichia coli K-12,4.6,314
1,Streptomyces coelicolor A3,8.7,965
2,Bacillus subtilis 168,4.2,158
3,Pseudomonas aeruginosa PAO1,6.3,505
4,Mycobacterium tuberculosis H37Rv,4.4,189


In [61]:
fullBacteriaData = pd.read_csv("Datasets/bacteria.csv", index_col=0)
fullBacteriaData.head()

Unnamed: 0_level_0,Genome Size (Mbp),Number of Genes,Number of Transcription Factors,Number of Proteins,Genus,Family,GC Content (%)
Bacterium,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Escherichia coli K-12,4.6,4400,314,4290,Escherichia,Enterobacteriaceae,50.8
Streptomyces coelicolor A3(2),8.7,7800,965,7825,Streptomyces,Streptomycetaceae,72.1
Bacillus subtilis 168,4.2,4100,158,4103,Bacillus,Bacillaceae,43.5
Pseudomonas aeruginosa PAO1,6.3,5500,505,5570,Pseudomonas,Pseudomonadaceae,66.6
Mycobacterium tuberculosis H37Rv,4.4,4000,189,4000,Mycobacterium,Mycobacteriaceae,65.6


In [62]:
fullBacteriaData = pd.read_csv("Datasets/bacteria.csv")
print(type(fullBacteriaData))
print(fullBacteriaData)

<class 'pandas.core.frame.DataFrame'>
                                       Bacterium  Genome Size (Mbp)  \
0                          Escherichia coli K-12                4.6   
1                  Streptomyces coelicolor A3(2)                8.7   
2                          Bacillus subtilis 168                4.2   
3                    Pseudomonas aeruginosa PAO1                6.3   
4               Mycobacterium tuberculosis H37Rv                4.4   
..                                           ...                ...   
149         Mycobacterium smegmatis str. MC2 155                6.9   
150                Mycobacterium bovis AF2122/97                4.3   
151  Mycobacterium bovis BCG str. Pasteur 1173P2                4.4   
152                  Mycobacterium leprae Br4923                3.3   
153           Mycobacterium tuberculosis CDC1551                4.4   

     Number of Genes  Number of Transcription Factors  Number of Proteins  \
0               4400            

In [63]:
fullBacteriaData

Unnamed: 0,Bacterium,Genome Size (Mbp),Number of Genes,Number of Transcription Factors,Number of Proteins,Genus,Family,GC Content (%)
0,Escherichia coli K-12,4.6,4400,314,4290,Escherichia,Enterobacteriaceae,50.8
1,Streptomyces coelicolor A3(2),8.7,7800,965,7825,Streptomyces,Streptomycetaceae,72.1
2,Bacillus subtilis 168,4.2,4100,158,4103,Bacillus,Bacillaceae,43.5
3,Pseudomonas aeruginosa PAO1,6.3,5500,505,5570,Pseudomonas,Pseudomonadaceae,66.6
4,Mycobacterium tuberculosis H37Rv,4.4,4000,189,4000,Mycobacterium,Mycobacteriaceae,65.6
...,...,...,...,...,...,...,...,...
149,Mycobacterium smegmatis str. MC2 155,6.9,7000,471,7004,Mycobacterium,Mycobacteriaceae,67.7
150,Mycobacterium bovis AF2122/97,4.3,4000,257,3997,Mycobacterium,Mycobacteriaceae,65.6
151,Mycobacterium bovis BCG str. Pasteur 1173P2,4.4,4200,267,4188,Mycobacterium,Mycobacteriaceae,65.5
152,Mycobacterium leprae Br4923,3.3,1600,50,1604,Mycobacterium,Mycobacteriaceae,57.8


In [64]:
fullBacteriaData.describe()

Unnamed: 0,Genome Size (Mbp),Number of Genes,Number of Transcription Factors,Number of Proteins,GC Content (%)
count,154.0,154.0,154.0,154.0,153.0
mean,3.504156,3253.896104,204.844156,3257.11039,48.75817
std,2.092125,1965.543485,156.865086,1964.615083,15.101924
min,0.2,160.0,1.0,161.0,16.0
25%,1.825,1700.0,91.75,1713.25,38.0
50%,3.35,3200.0,187.5,3205.0,49.5
75%,4.7,4400.0,297.5,4409.75,64.8
max,9.1,8300.0,965.0,8213.0,72.1


<h3>Indexing a dataframe</h3>

In [65]:
fullBacteriaData = pd.read_csv("Datasets/bacteria.csv")
fullBacteriaData

Unnamed: 0,Bacterium,Genome Size (Mbp),Number of Genes,Number of Transcription Factors,Number of Proteins,Genus,Family,GC Content (%)
0,Escherichia coli K-12,4.6,4400,314,4290,Escherichia,Enterobacteriaceae,50.8
1,Streptomyces coelicolor A3(2),8.7,7800,965,7825,Streptomyces,Streptomycetaceae,72.1
2,Bacillus subtilis 168,4.2,4100,158,4103,Bacillus,Bacillaceae,43.5
3,Pseudomonas aeruginosa PAO1,6.3,5500,505,5570,Pseudomonas,Pseudomonadaceae,66.6
4,Mycobacterium tuberculosis H37Rv,4.4,4000,189,4000,Mycobacterium,Mycobacteriaceae,65.6
...,...,...,...,...,...,...,...,...
149,Mycobacterium smegmatis str. MC2 155,6.9,7000,471,7004,Mycobacterium,Mycobacteriaceae,67.7
150,Mycobacterium bovis AF2122/97,4.3,4000,257,3997,Mycobacterium,Mycobacteriaceae,65.6
151,Mycobacterium bovis BCG str. Pasteur 1173P2,4.4,4200,267,4188,Mycobacterium,Mycobacteriaceae,65.5
152,Mycobacterium leprae Br4923,3.3,1600,50,1604,Mycobacterium,Mycobacteriaceae,57.8


In [94]:
fullBacteriaData = pd.read_csv("Datasets/bacteria.csv", index_col=5)
fullBacteriaData

Unnamed: 0_level_0,Bacterium,Genome_Size,Genes,Transcription_Factors,Proteins,Family,GC
Genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Escherichia,Escherichia coli K-12,4.6,4400,314,4290,Enterobacteriaceae,50.8
Streptomyces,Streptomyces coelicolor A3(2),8.7,7800,965,7825,Streptomycetaceae,72.1
Bacillus,Bacillus subtilis 168,4.2,4100,158,4103,Bacillaceae,43.5
Pseudomonas,Pseudomonas aeruginosa PAO1,6.3,5500,505,5570,Pseudomonadaceae,66.6
Mycobacterium,Mycobacterium tuberculosis H37Rv,4.4,4000,189,4000,Mycobacteriaceae,65.6
...,...,...,...,...,...,...,...
Mycobacterium,Mycobacterium smegmatis str. MC2 155,6.9,7000,471,7004,Mycobacteriaceae,67.7
Mycobacterium,Mycobacterium bovis AF2122/97,4.3,4000,257,3997,Mycobacteriaceae,65.6
Mycobacterium,Mycobacterium bovis BCG str. Pasteur 1173P2,4.4,4200,267,4188,Mycobacteriaceae,65.5
Mycobacterium,Mycobacterium leprae Br4923,3.3,1600,50,1604,Mycobacteriaceae,57.8


In [67]:
fullBacteriaData.reset_index()

Unnamed: 0,Genus,Bacterium,Genome Size (Mbp),Number of Genes,Number of Transcription Factors,Number of Proteins,Family,GC Content (%)
0,Escherichia,Escherichia coli K-12,4.6,4400,314,4290,Enterobacteriaceae,50.8
1,Streptomyces,Streptomyces coelicolor A3(2),8.7,7800,965,7825,Streptomycetaceae,72.1
2,Bacillus,Bacillus subtilis 168,4.2,4100,158,4103,Bacillaceae,43.5
3,Pseudomonas,Pseudomonas aeruginosa PAO1,6.3,5500,505,5570,Pseudomonadaceae,66.6
4,Mycobacterium,Mycobacterium tuberculosis H37Rv,4.4,4000,189,4000,Mycobacteriaceae,65.6
...,...,...,...,...,...,...,...,...
149,Mycobacterium,Mycobacterium smegmatis str. MC2 155,6.9,7000,471,7004,Mycobacteriaceae,67.7
150,Mycobacterium,Mycobacterium bovis AF2122/97,4.3,4000,257,3997,Mycobacteriaceae,65.6
151,Mycobacterium,Mycobacterium bovis BCG str. Pasteur 1173P2,4.4,4200,267,4188,Mycobacteriaceae,65.5
152,Mycobacterium,Mycobacterium leprae Br4923,3.3,1600,50,1604,Mycobacteriaceae,57.8


In [68]:
fullBacteriaData.set_index("Family")

Unnamed: 0_level_0,Bacterium,Genome Size (Mbp),Number of Genes,Number of Transcription Factors,Number of Proteins,GC Content (%)
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Enterobacteriaceae,Escherichia coli K-12,4.6,4400,314,4290,50.8
Streptomycetaceae,Streptomyces coelicolor A3(2),8.7,7800,965,7825,72.1
Bacillaceae,Bacillus subtilis 168,4.2,4100,158,4103,43.5
Pseudomonadaceae,Pseudomonas aeruginosa PAO1,6.3,5500,505,5570,66.6
Mycobacteriaceae,Mycobacterium tuberculosis H37Rv,4.4,4000,189,4000,65.6
...,...,...,...,...,...,...
Mycobacteriaceae,Mycobacterium smegmatis str. MC2 155,6.9,7000,471,7004,67.7
Mycobacteriaceae,Mycobacterium bovis AF2122/97,4.3,4000,257,3997,65.6
Mycobacteriaceae,Mycobacterium bovis BCG str. Pasteur 1173P2,4.4,4200,267,4188,65.5
Mycobacteriaceae,Mycobacterium leprae Br4923,3.3,1600,50,1604,57.8


In [70]:
fullBacteriaData.reset_index(inplace=True)
fullBacteriaData["Genus"]

0        Escherichia
1       Streptomyces
2           Bacillus
3        Pseudomonas
4      Mycobacterium
           ...      
149    Mycobacterium
150    Mycobacterium
151    Mycobacterium
152    Mycobacterium
153    Mycobacterium
Name: Genus, Length: 154, dtype: object

In [71]:
fullBacteriaData[["Genus", "Family"]]

Unnamed: 0,Genus,Family
0,Escherichia,Enterobacteriaceae
1,Streptomyces,Streptomycetaceae
2,Bacillus,Bacillaceae
3,Pseudomonas,Pseudomonadaceae
4,Mycobacterium,Mycobacteriaceae
...,...,...
149,Mycobacterium,Mycobacteriaceae
150,Mycobacterium,Mycobacteriaceae
151,Mycobacterium,Mycobacteriaceae
152,Mycobacterium,Mycobacteriaceae


In [76]:
fullBacteriaData.reset_index(inplace=True)


In [81]:
fullBacteriaData.set_index("Bacterium", inplace=True)
fullBacteriaData.loc["Escherichia coli K-12"]

Genus                                     Escherichia
Genome Size (Mbp)                                 4.6
Number of Genes                                  4400
Number of Transcription Factors                   314
Number of Proteins                               4290
Family                             Enterobacteriaceae
GC Content (%)                                   50.8
Name: Escherichia coli K-12, dtype: object

In [82]:
print(fullBacteriaData.loc["Escherichia coli K-12"][0])

Escherichia


  print(fullBacteriaData.loc["Escherichia coli K-12"][0])


In [83]:
fullBacteriaData.iloc[0]

Genus                                     Escherichia
Genome Size (Mbp)                                 4.6
Number of Genes                                  4400
Number of Transcription Factors                   314
Number of Proteins                               4290
Family                             Enterobacteriaceae
GC Content (%)                                   50.8
Name: Escherichia coli K-12, dtype: object

In [84]:
print(fullBacteriaData.iloc[0][1])

4.6


  print(fullBacteriaData.iloc[0][1])


In [85]:
index_list = list(range(20,30))

fullBacteriaData.iloc[index_list]

Unnamed: 0_level_0,Genus,Genome Size (Mbp),Number of Genes,Number of Transcription Factors,Number of Proteins,Family,GC Content (%)
Bacterium,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ralstonia solanacearum GMI1000,Ralstonia,5.8,5400,428,5385,Burkholderiaceae,67.0
Xanthomonas campestris ATCC 33913,Xanthomonas,5.2,4900,340,4901,Xanthomonadaceae,65.0
Bartonella henselae Houston-1,Bartonella,1.9,1800,48,1754,Bartonellaceae,39.0
Legionella pneumophila Paris,Legionella,3.4,3000,184,2969,Legionellaceae,38.4
Yersinia pestis CO92,Yersinia,4.6,4200,282,4201,Enterobacteriaceae,47.6
Brucella abortus 2308,Brucella,3.3,3200,110,3214,Brucellaceae,57.0
Bordetella pertussis Tohama I,Bordetella,4.1,3800,203,3861,Alcaligenaceae,67.0
Treponema pallidum Nichols,Treponema,1.1,1000,25,1010,Spirochaetaceae,52.8
Chlamydia pneumoniae CWL029,Chlamydia,1.2,1100,19,1112,Chlamydiaceae,42.1
Synechocystis PCC 6803,Synechocystis,3.6,3400,247,3381,Synechococcaceae,47.7


In [86]:
print(index_list)

[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


In [87]:
print(fullBacteriaData["Genus"].head())
print(fullBacteriaData["Genus"].tail())

Bacterium
Escherichia coli K-12                 Escherichia
Streptomyces coelicolor A3(2)        Streptomyces
Bacillus subtilis 168                    Bacillus
Pseudomonas aeruginosa PAO1           Pseudomonas
Mycobacterium tuberculosis H37Rv    Mycobacterium
Name: Genus, dtype: object
Bacterium
Mycobacterium smegmatis str. MC2 155           Mycobacterium
Mycobacterium bovis AF2122/97                  Mycobacterium
Mycobacterium bovis BCG str. Pasteur 1173P2    Mycobacterium
Mycobacterium leprae Br4923                    Mycobacterium
Mycobacterium tuberculosis CDC1551             Mycobacterium
Name: Genus, dtype: object


In [88]:
print(fullBacteriaData["Genus"].tail(7))

Bacterium
Mycobacterium vanbaalenii PYR-1                Mycobacterium
Mycobacterium gilvum PYR-GCK                   Mycobacterium
Mycobacterium smegmatis str. MC2 155           Mycobacterium
Mycobacterium bovis AF2122/97                  Mycobacterium
Mycobacterium bovis BCG str. Pasteur 1173P2    Mycobacterium
Mycobacterium leprae Br4923                    Mycobacterium
Mycobacterium tuberculosis CDC1551             Mycobacterium
Name: Genus, dtype: object


In [90]:
fullBacteriaData["Genus"].value_counts().head(10)

Genus
Mycobacterium             26
Helicobacter               4
Candidatus Blochmannia     3
Leptospira                 3
Burkholderia               3
Mycoplasma                 3
Bacteroides                3
Streptococcus              3
Nitrospira                 3
Escherichia                2
Name: count, dtype: int64

In [99]:
fullBacteriaData

Unnamed: 0_level_0,Bacterium,Genome_Size,Genes,Transcription_Factors,Proteins,Family,GC
Genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Escherichia,Escherichia coli K-12,4.6,4400,314,4290,Enterobacteriaceae,50.8
Streptomyces,Streptomyces coelicolor A3(2),8.7,7800,965,7825,Streptomycetaceae,72.1
Bacillus,Bacillus subtilis 168,4.2,4100,158,4103,Bacillaceae,43.5
Pseudomonas,Pseudomonas aeruginosa PAO1,6.3,5500,505,5570,Pseudomonadaceae,66.6
Mycobacterium,Mycobacterium tuberculosis H37Rv,4.4,4000,189,4000,Mycobacteriaceae,65.6
...,...,...,...,...,...,...,...
Mycobacterium,Mycobacterium smegmatis str. MC2 155,6.9,7000,471,7004,Mycobacteriaceae,67.7
Mycobacterium,Mycobacterium bovis AF2122/97,4.3,4000,257,3997,Mycobacteriaceae,65.6
Mycobacterium,Mycobacterium bovis BCG str. Pasteur 1173P2,4.4,4200,267,4188,Mycobacteriaceae,65.5
Mycobacterium,Mycobacterium leprae Br4923,3.3,1600,50,1604,Mycobacteriaceae,57.8


In [101]:
averageGC = fullBacteriaData["GC"].mean()
print(f"The average GC content of the dataset is {round(averageGC,2)}%")

GenomeSize = fullBacteriaData["Genome_Size"]
print(GenomeSize.head(5))
GenomeSize += 5
print(GenomeSize.head(5))

The average GC content of the dataset is 48.76%
Genus
Escherichia      4.6
Streptomyces     8.7
Bacillus         4.2
Pseudomonas      6.3
Mycobacterium    4.4
Name: Genome_Size, dtype: float64
Genus
Escherichia       9.6
Streptomyces     13.7
Bacillus          9.2
Pseudomonas      11.3
Mycobacterium     9.4
Name: Genome_Size, dtype: float64


<h3>GroupBy:</h3>

In [114]:
fullBacteriaData#.groupby("Genus").groups.keys()

Unnamed: 0,Genus,Bacterium,Genome_Size,Genes,Transcription_Factors,Proteins,Family,GC
0,Escherichia,Escherichia coli K-12,9.6,4400,314,4290,Enterobacteriaceae,50.8
1,Streptomyces,Streptomyces coelicolor A3(2),13.7,7800,965,7825,Streptomycetaceae,72.1
2,Bacillus,Bacillus subtilis 168,9.2,4100,158,4103,Bacillaceae,43.5
3,Pseudomonas,Pseudomonas aeruginosa PAO1,11.3,5500,505,5570,Pseudomonadaceae,66.6
4,Mycobacterium,Mycobacterium tuberculosis H37Rv,9.4,4000,189,4000,Mycobacteriaceae,65.6
...,...,...,...,...,...,...,...,...
149,Mycobacterium,Mycobacterium smegmatis str. MC2 155,11.9,7000,471,7004,Mycobacteriaceae,67.7
150,Mycobacterium,Mycobacterium bovis AF2122/97,9.3,4000,257,3997,Mycobacteriaceae,65.6
151,Mycobacterium,Mycobacterium bovis BCG str. Pasteur 1173P2,9.4,4200,267,4188,Mycobacteriaceae,65.5
152,Mycobacterium,Mycobacterium leprae Br4923,8.3,1600,50,1604,Mycobacteriaceae,57.8


In [117]:
fullBacteriaData.groupby("Family")["GC"].mean()

Family
Actinomycetaceae        66.0
Aeromonadaceae          61.0
Alcaligenaceae          67.0
Anaplasmataceae         34.0
Aquificaceae            43.0
                        ... 
Thermodesulfobiaceae    61.0
Thermoplasmataceae      54.0
Thermotogaceae          46.2
Vibrionaceae            47.7
Xanthomonadaceae        65.0
Name: GC, Length: 73, dtype: float64

In [121]:
fullBacteriaData.groupby(["Family", "Bacterium"])["GC"].mean()

Family                Bacterium                                           
Actinomycetaceae      Actinomyces naeslundii MG1                              67.0
                      Actinomyces odontolyticus ATCC 17982                    65.0
Aeromonadaceae        Aeromonas hydrophila subsp. hydrophila ATCC 7966        61.0
Alcaligenaceae        Bordetella pertussis Tohama I                           67.0
Anaplasmataceae       Wolbachia endosymbiont of Culex quinquefasciatus Pel    34.0
                                                                              ... 
Thermodesulfobiaceae  Thermodesulfobium narugense DSM 14796                   61.0
Thermoplasmataceae    Thermoplasma acidophilum DSM 1728                       54.0
Thermotogaceae        Thermotoga maritima MSB8                                46.2
Vibrionaceae          Vibrio cholerae N16961                                  47.7
Xanthomonadaceae      Xanthomonas campestris ATCC 33913                       65.0
Name: GC, Le

In [124]:
print(fullBacteriaData["GC"]>60)

highGC = fullBacteriaData[fullBacteriaData["GC"]>60]
print(highGC)

0      False
1       True
2      False
3       True
4       True
       ...  
149     True
150     True
151     True
152    False
153     True
Name: GC, Length: 154, dtype: bool
             Genus                                    Bacterium  Genome_Size  \
1     Streptomyces                Streptomyces coelicolor A3(2)         13.7   
3      Pseudomonas                  Pseudomonas aeruginosa PAO1         11.3   
4    Mycobacterium             Mycobacterium tuberculosis H37Rv          9.4   
16     Rhodobacter                Rhodobacter sphaeroides 2.4.1          9.6   
20       Ralstonia               Ralstonia solanacearum GMI1000         10.8   
..             ...                                          ...          ...   
148  Mycobacterium                 Mycobacterium gilvum PYR-GCK         11.1   
149  Mycobacterium         Mycobacterium smegmatis str. MC2 155         11.9   
150  Mycobacterium                Mycobacterium bovis AF2122/97          9.3   
151  Mycobacterium  My

In [125]:
sorting = fullBacteriaData.sort_values(["Genome_Size"], ascending = False)

sorting.head()

Unnamed: 0,Genus,Bacterium,Genome_Size,Genes,Transcription_Factors,Proteins,Family,GC
46,Bradyrhizobium,Bradyrhizobium japonicum USDA 110,14.1,8300,621,8213,Bradyrhizobiaceae,64.1
30,Nostoc,Nostoc punctiforme PCC 73102,14.0,8100,590,8023,Nostocaceae,41.5
1,Streptomyces,Streptomyces coelicolor A3(2),13.7,7800,965,7825,Streptomycetaceae,72.1
95,Burkholderia,Burkholderia cenocepacia AU 1054,13.1,7500,524,7473,Burkholderiaceae,66.3
31,Rhizobium,Rhizobium leguminosarum bv. viciae 3841,12.4,7000,420,7055,Rhizobiaceae,60.5


<div style="padding-top:10px; padding-bottom:10px;">
    <h3>Saving the data into a file</h3>
    <p>To save the information of only three columns (Genus, Genome size and GC content) to a text file, we can use the to_csv() method as follows:</p>
</div>

In [126]:
shortDataframe = fullBacteriaData[["Genus", "Genome_Size", "GC"]]

shortDataframe.to_csv("Datasets/bacteria_short.csv")

<div style="padding-top:10px; padding-bottom:10px;">
    <h3>Concatenate, merge and join</h3>
    <p>In any real world data science situation you’ll have to merge or concatenate Dataframes together to manipulate, filter or generate your dataset.</p>
    <p>Merging, joining and concatenating are core processes that any aspiring computational biologist will need to master.</p>
    <p>Pandas provides methods to combine Series or DataFrame with weird stuff, ranging from Boolean logic to relational algebra to perform join and merge operations</p>
    <h3>Concatenating:</h3>
    <p>The concat() function will concatenate stuff along an axis while performing optional set logic (union or intersection) of the indexes on the other axes. 
</p>
    <p><center><img src="Figures/concat.png"></center></p>
</div>


In [3]:
#First, let's create a mock dataset:
df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index = [0,1,2,3],
)

df2 = pd.DataFrame(
    {
        "A": ["A4", "A5", "A6", "A7"],
        "B": ["B4", "B5", "B6", "B7"],
        "C": ["C4", "C5", "C6", "C7"],
        "D": ["D4", "D5", "D6", "D7"],
    },
    index = [4,5,6,7],
)
df1


Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [4]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [5]:
#Now let's concatenate

pd.concat([df1, df2])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


<div style="padding-top:10px; padding-bottom:10px;">

</div>

<div style="padding-top:10px; padding-bottom:10px;">

</div>

<div style="padding-top:10px; padding-bottom:10px;">

</div>

<div style="padding-top:10px; padding-bottom:10px;">
    <h3>Merging:</h3>
    <p>Merging is a convenient method for combining the columns of two Dataframes.</p>
    <p>In Pandas, there are separate “merge” and “join” functions, both of which do similar things.</p>
    <p>“Merging” two datasets is the process of bringing two datasets together into one, and aligning the rows from each based on common attributes or columns.</p>
    <p>“Joining” combines two dataframes on the basis of their indexes.</p>
</div>

<div style="padding-top:10px; padding-bottom:10px;margin:auto;">
    <h4>There are four different types of merges available in Pandas:</h4>
    <div style="display:flex;flex-direction:row; padding-left:10%;">
    <ul style="width:20%;">
        <li>Inner merge</li>
        <li>Outer merge</li>
        <li>Right merge</li>
        <li>Left merge</li>
    </ul>
        <p><img src="Figures/merging.png" style="width:70%;"></p>
    </div>
    <p>The merge type to use is specified using the “how” parameter in the merge command, taking values “left”, “right”, “inner” (default), or “outer”.</p>
</div>

In [14]:
df1 = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3", "K4", "K5"],
        "A": ["A0", "A1", "A2", "A3", "A4", "A5"],
        "B": ["B0", "B1", "B2", "B3", "B4", "B5"],
    },
    index = [0,1,2,3,4,5],
)

df2 = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index = [6,7,8,9],
)
df1

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3
4,K4,A4,B4
5,K5,A5,B5


In [8]:
df2

Unnamed: 0,key,C,D
4,K0,C0,D0
5,K1,C1,D1
6,K2,C2,D2
7,K3,C3,D3


In [15]:
#Inner merge
pd.merge(df1, df2, how="inner")

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


In [16]:
#outer merge
pd.merge(df1, df2, how="outer")

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3
4,K4,A4,B4,,
5,K5,A5,B5,,


In [17]:
#left merge
pd.merge(df1, df2, how='left')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3
4,K4,A4,B4,,
5,K5,A5,B5,,


In [19]:
#right merge
pd.merge(df1, df2, how='right')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


<div style="padding-top:10px; padding-bottom:10px;">
    <h3>In Pandas, missing Data is represented by two values:</h3>
    <p><strong>None</strong>:a Python singleton often used for missing data in Python code.</p>
    <p><strong>NaN (Not a number)</strong>:A floating point value recognized by many systems.</p>
    <p>They are virtually the same, they mean that there is a value is missing.</p>
    <h4>Using isna()/isnull():</h4>
</div>

In [21]:
import numpy as np
dict = {'First Score':[100,90,np.nan,95],
        'Second Score':[30,45,56,np.nan],
        'Third Score':[np.nan,40,80,98]}

df = pd.DataFrame(dict)
df.isna()

Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


<h4>Using notna()/notnull()</h4>

In [23]:
df.notna()

Unnamed: 0,First Score,Second Score,Third Score
0,True,True,False
1,True,True,True
2,False,True,True
3,True,False,True


<h4>Using dropna():</h4>

In [24]:
print(df)
print("")
print(df.dropna())
print("")
print(df.dropna(how="all"))

   First Score  Second Score  Third Score
0        100.0          30.0          NaN
1         90.0          45.0         40.0
2          NaN          56.0         80.0
3         95.0           NaN         98.0

   First Score  Second Score  Third Score
1         90.0          45.0         40.0

   First Score  Second Score  Third Score
0        100.0          30.0          NaN
1         90.0          45.0         40.0
2          NaN          56.0         80.0
3         95.0           NaN         98.0


<h4>Using fillna():</h4>

In [25]:
df.fillna(0.5)

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.5
1,90.0,45.0,40.0
2,0.5,56.0,80.0
3,95.0,0.5,98.0


<h4>Using replace():</h4>

In [26]:
df.replace(to_replace = np.nan, value = 0.5)

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.5
1,90.0,45.0,40.0
2,0.5,56.0,80.0
3,95.0,0.5,98.0


<p><center><img src="Figures/dropna.png"></center></p>

<div style="padding-top:10px; padding-bottom:10px;">
    <h2>Data Standardization:</h2>
    <p>Data standardization is the data quality process of transforming data to fit a predefined and constrained set of values.</p>
    <p>Basically, we change the scale to compare datapoints from different samples.</p>
    <p>It is crucial for quantitative analysis for methodologies such as Transcriptomics, Metabolomics and Metatranscriptomics, as well as Genomics and Metagenomics.</p>
    <p>Examples in Biology are:</p>
    <ul>
        <li><strong>RPKM</strong></li>
        <li><strong>FPKM</strong></li>
        <li><strong>TPM</strong></li>
        <li><strong>RPKG</strong></li>
        <li><strong>CODA</strong></li>
    </ul>
    <p>Since our time is short, we will focus on the CODA part!</p>
</div>

<div style="padding-top:10px; padding-bottom:10px;">
    <h2>Data Standardization:</h2>
    <p>Data standardization is the data quality process of transforming data to fit a predefined and constrained set of values.</p>
    <p>Basically, we change the scale to compare datapoints from different samples.</p>
    <p>It is crucial for quantitative analysis for methodologies such as Transcriptomics, Metabolomics and Metatranscriptomics, as well as Genomics and Metagenomics.</p>
    <p>Examples in Biology are:</p>
    <ul>
        <li><strong>RPKM</strong></li>
        <li><strong>FPKM</strong></li>
        <li><strong>TPM</strong></li>
        <li><strong>RPKG</strong></li>
        <li><strong>CODA</strong></li>
    </ul>
    <p>Since our time is short, we will focus on the CODA part!</p>
</div>

<div style="padding-top:10px; padding-bottom:10px;">
    <h3>CODA:</h3>
    <p>16s is limited when compared to Deep sequencing samples, but it has its own value.</p>
    <p>However, 16s counts doesn’t make any sense if they do not consider all counts.</p>
    <p>That’s why we normally transform 16s data into fractions or percentages.</p>
    <p><center><img src = "Figures/barplot.png" style="width:30%;"></center></p>
    
</div>

<div style="padding-top:10px; padding-bottom:10px;margin:auto;">
    <h4>Linear transformations:</h4>
    <div style="display:flex;flex-direction:row;">
    <div>
        <h5>Three log ratio transformations available:</h5>
    <ul>
        <li>Additive log-ratio (ALR)</li>
        <li>Centred log-ration (CLR)</li>
        <li>Isometric log-ration (ILR)</li>
    </ul>
        <h5>All three satisfies linearity, but they are not the same thing.</h5>
    <ul>
        <li>Sub compositional coherence</li>
        <li>Scale invariance</li>
    </ul>
    </div>
        <div style="padding-left:10%;">
        <p><img src="Figures/linear_transformation.png" style="width:70%;"></p>
    </div>
    
</div>

<div style="padding-top:10px; padding-bottom:10px;">
    <h3>ALR:</h3>
    <p>ALR is one of the simplest transformations.</p>
    <p>You will arbitrarily choose a composition common to all samples (maybe the one with less variation) as reference (ref).</p>
    <p>You will divide all the other compositions by this particular composition and calculate the log.</p>
    <p><center><img src = "Figures/alr.png" style="width:30%;"></center></p>
    
</div>






In [28]:
import pandas as pd
import random
import numpy as np
import skbio
from scipy.spatial.distance import pdist, squareform

microbiomeDF = pd.read_table("Datasets/Bacteria_counts.tsv", index_col=0)
taxonomy = pd.read_table("Datasets/Bacteria_taxonomy.tsv", index_col=0)
metadata = pd.read_table("Datasets/Metadata.tsv", index_col=0)


In [57]:
def composition(CountsDataFrame):
    """
    Transform the counts table into compositions:

    Args:
    _____
    DataFrame (int): Dataframe containing count values
    options:
    freq - Transform the table into frequencies
    perc - Transform the table into percentages

    Return:
    _______
        Dataframe containing compositional data
    """

    CountsDataFrame = CountsDataFrame.replace(0,1)
    if validate_counts(CountsDataFrame) ==False:
        raise DataFrameValidationError
    else:
        return(CountsDataFrame.div(CountsDataFrame.sum(axis=0))*100)

def validate_counts(DataFrame):
    """
    Validate the input to check if the sum of components is
    
    different from a pre-defined upper-limit (1 or 100).
    Args:
    _____
    DataFrame (int): Dataframe containing count values

    Return:
    _______
        Boolean variable
    """
    return(all(round(DataFrame.sum(),1)!=1.0) and all(DataFrame.sum()!=100.0))

def validate_composition(DataFrame):
    """
    Validate the input to check if the sum of components is 1 or 100.
    
    Args:
    _____
    DataFrame (float): Dataframe containing compositional data

    Return:
    _______
        Boolean variable
    """
    return(all(round(DataFrame.sum(),1)==1.0) or all(round(DataFrame.sum(),1)==100.0))


def clr(DataFrame):
    """
    Transform the compositional table into Centered log-ratios
    CLR = log(Composition/geometric mean)

    Args:
    _____
    DataFrame (float): Dataframe containing compositional data.

    Return:
    _______
        CLR transformed dataframe
    """
    if validate_composition(DataFrame) ==False:
        raise DataFrameValidationError
    else:
        gmeans = np.log(DataFrame).mean()
        clr = np.log(DataFrame)-gmeans
        return(clr)

def alr(DataFrame, ref):
    """
    Transform the compositional table into Aitchison log-ratios    
    ALR = log(Composition/reference)

    Args:
    _____
    DataFrame (float): Dataframe containing compositional data.
    ref (str): Composition being used as reference for the log-ratio.
    Return:
    _______
        ALR transformed dataframe
    """
    if validate_composition(DataFrame) ==False:
        raise DataFrameValidationError
    else:
        reference = DataFrame.T[ref]
        alr = np.log(DataFrame.T.div(reference, axis=0))
        return(alr.drop(ref,axis=1).T)


def Aitchison_Distance(Dataframe):
    """
    Calculate the euclidian distance between samples based on CLR transformed data.

    Args:
    _____
    DataFrame (float): Dataframe containing clr-transformed data.
    
    Return:
    _______
    Dataframe containing the euclidian distance between samples.
    """
    aitchison_distance = squareform(pdist(Dataframe))
    aitchison_distance = pd.DataFrame(aitchison_distance, index=clr_asv.index, columns=clr_asv.index)
    return aitchison_distance

In [54]:
df_composition = composition(microbiomeDF)
alr_transformed = alr(df_composition, "ASV_3")
alr_transformed

Unnamed: 0,1462,1464,1479,1485,1494,1495,1496,1498,1501,1503,...,500,502,505,506,510,511,514,515,517,523
ASV_990,-5.945421,-7.767264,-5.420535,-2.564949,-7.671827,-6.675823,-7.725330,-2.443521,-6.291569,-7.680176,...,-1.305325,-8.007700,-6.400257,-3.255832,-6.642487,-3.391988,-2.942488,-4.672829,-6.520621,-2.728786
ASV_1676,-5.945421,-4.211916,-5.420535,-2.564949,-7.671827,-6.675823,-7.725330,-6.054439,-6.291569,-7.680176,...,-6.498282,-8.007700,-6.400257,-6.782192,-6.642487,-5.471430,-1.643205,-4.672829,-5.422009,-6.284134
ASV_1533,-3.237370,-5.059214,-5.420535,-2.564949,-4.116479,-2.763800,-7.725330,-6.054439,-6.291569,-5.482951,...,-6.498282,-8.007700,-3.915351,-3.486355,-6.642487,-7.080868,-4.446565,-4.672829,-5.134327,-6.284134
ASV_3142,-5.945421,-7.767264,-1.706963,-2.564949,-7.671827,-3.497769,-4.429493,-6.054439,-3.247047,-4.735737,...,-6.498282,-8.007700,-6.400257,-6.782192,-6.642487,-7.080868,-6.238325,-4.672829,-6.520621,-6.284134
ASV_3117,-5.945421,-7.767264,-5.420535,-2.564949,-7.671827,-6.675823,-7.725330,-6.054439,-6.291569,-7.680176,...,-6.498282,-8.007700,-6.400257,-6.782192,-6.642487,-7.080868,-6.238325,-4.672829,-6.520621,-6.284134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ASV_2087,-5.945421,-7.767264,-5.420535,-2.564949,-7.671827,-6.675823,-7.725330,-6.054439,-6.291569,-7.680176,...,-6.498282,-3.392579,-6.400257,-6.782192,-6.642487,-7.080868,-6.238325,-4.672829,-6.520621,-6.284134
ASV_2574,-5.252273,-7.767264,-5.420535,-2.564949,-7.671827,-6.675823,-7.725330,-6.054439,-6.291569,-7.680176,...,-6.498282,-6.061790,-6.400257,-6.782192,-6.642487,-6.387721,-3.530274,-4.672829,-2.759421,-5.590987
ASV_4458,-5.945421,-7.767264,-5.420535,-2.564949,-7.671827,-6.675823,-7.725330,-6.054439,-6.291569,-7.680176,...,-6.498282,-8.007700,-6.400257,-6.782192,-6.642487,-7.080868,-3.405111,-4.672829,-6.520621,-6.284134
ASV_3992,-5.945421,-7.767264,-5.420535,-2.564949,-7.671827,-6.675823,-7.725330,-6.054439,-6.291569,-7.680176,...,-4.195697,-8.007700,-6.400257,-6.782192,-6.642487,-7.080868,-6.238325,-4.672829,-6.520621,-6.284134


<div style="padding-top:10px; padding-bottom:10px;">
    <p><center><img src = "Figures/ALR_crow.png" style="width:50%;"></center></p>
    
</div>

<h3>CLR</h3>

In [56]:
clr_transformed = clr(df_composition)
clr_transformed

Unnamed: 0,1462,1464,1479,1485,1494,1495,1496,1498,1501,1503,...,500,502,505,506,510,511,514,515,517,523
ASV_990,-0.652532,-0.667195,-0.761346,-0.679041,-0.691098,-0.941994,-1.022265,3.052450,-1.035643,-0.615137,...,4.279574,-1.081047,-1.147471,2.900090,-0.743095,2.762573,2.281283,-0.824084,-1.067907,2.751314
ASV_1676,-0.652532,2.888153,-0.761346,-0.679041,-0.691098,-0.941994,-1.022265,-0.558468,-1.035643,-0.615137,...,-0.913383,-1.081047,-1.147471,-0.626271,-0.743095,0.683131,3.580566,-0.824084,0.030706,-0.804034
ASV_1533,2.055518,2.040856,-0.761346,-0.679041,2.864250,2.970029,-1.022265,-0.558468,-1.035643,1.582088,...,-0.913383,-1.081047,1.337435,2.669566,-0.743095,-0.926307,0.777206,-0.824084,0.318388,-0.804034
ASV_3142,-0.652532,-0.667195,2.952226,-0.679041,-0.691098,2.236059,2.273572,-0.558468,2.008879,2.329302,...,-0.913383,-1.081047,-1.147471,-0.626271,-0.743095,-0.926307,-1.014554,-0.824084,-1.067907,-0.804034
ASV_3117,-0.652532,-0.667195,-0.761346,-0.679041,-0.691098,-0.941994,-1.022265,-0.558468,-1.035643,-0.615137,...,-0.913383,-1.081047,-1.147471,-0.626271,-0.743095,-0.926307,-1.014554,-0.824084,-1.067907,-0.804034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ASV_2087,-0.652532,-0.667195,-0.761346,-0.679041,-0.691098,-0.941994,-1.022265,-0.558468,-1.035643,-0.615137,...,-0.913383,3.534074,-1.147471,-0.626271,-0.743095,-0.926307,-1.014554,-0.824084,-1.067907,-0.804034
ASV_2574,0.040615,-0.667195,-0.761346,-0.679041,-0.691098,-0.941994,-1.022265,-0.558468,-1.035643,-0.615137,...,-0.913383,0.864863,-1.147471,-0.626271,-0.743095,-0.233160,1.693496,-0.824084,2.693293,-0.110887
ASV_4458,-0.652532,-0.667195,-0.761346,-0.679041,-0.691098,-0.941994,-1.022265,-0.558468,-1.035643,-0.615137,...,-0.913383,-1.081047,-1.147471,-0.626271,-0.743095,-0.926307,1.818660,-0.824084,-1.067907,-0.804034
ASV_3992,-0.652532,-0.667195,-0.761346,-0.679041,-0.691098,-0.941994,-1.022265,-0.558468,-1.035643,-0.615137,...,1.389202,-1.081047,-1.147471,-0.626271,-0.743095,-0.926307,-1.014554,-0.824084,-1.067907,-0.804034


<div style="padding-top:10px; padding-bottom:10px;">
    <p><center><img src = "Figures/CLR_CROW.png" style="width:50%;"></center></p>
    
</div>