# K-Means Clustering of microbiome organisms
## <font color=green>Team Organism</font>

In [49]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D 
%matplotlib inline

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## Read in data
1000 rows, one row for each microorganism found. Each organism is given a name, such as OTU_12 or OTU_135. <br>
- dataframe contains all the columns in a pandas DataFrame<br>
- names is just the organism names in 1000 rows<br>
- rna is just their rna id<br>
- tax is a number of columns identifying their kingdom, phylum, class, order, family, genus and species<br>
- data has a column for each location, AK001 - AK101, with a measure of the population of this organism at each location. <br>


In [50]:
# Load the organism table (one row per OTU) and split it into column groups
# by position.
dataframe = pd.read_csv("taxonomy_logvals_top1000.csv")
names = dataframe.iloc[:, 0:1]   # organism (OTU) name column
rna = dataframe.iloc[:, 1:2]     # rRNA identifier column
tax = dataframe.iloc[:, 2:9]     # taxonomy columns (Kingdom ... Species)
# Every column after the taxonomy block is a per-location measurement.
# The slice is open-ended so the final location column (AK101) is kept even
# when there are more than 100 location columns; a fixed 9:109 window would
# stop after exactly 100 of them.
data = dataframe.iloc[:, 9:]


In [51]:
# Print the per-location measurement columns as plain text.
print(data)

        AK001     AK002     AK003     AK004     AK005     AK006     AK007  \
0    4.417289  4.160948  4.009578  2.937016  3.198657  3.046495  3.197281   
1    3.766710  3.547775  2.346353  3.072250  3.014100  3.196453  2.906335   
2    2.633468  2.235528  3.521269  3.415140  1.770852  2.453318  4.016616   
3    3.795115  3.385964  3.508664  2.012837  2.953760  2.468347  2.739572   
4    2.611723  3.249443  3.333447  2.725912  2.739572  2.477121  2.089905   
5    3.885813  3.449787  2.053078  1.591065  2.777427  2.521138  2.522444   
6    2.824126  1.707570  1.531479  1.230449  1.041393  2.117271  1.414973   
7    3.471585  3.175802  3.152288  1.963788  2.227887  2.136721  2.167317   
8    2.245513  1.944483  3.157759  3.629206  1.414973  1.959041  2.961895   
9    2.740363  2.640481  1.342423  2.146128  2.075547  2.283301  2.017033   
10   3.275772  1.944483  3.074085  3.794697  2.146128  2.287802  2.552668   
11   2.745855  2.582063  1.278754  2.143015  1.908485  2.222716  1.944483   

## Do the clustering using KMeans
Set the number of clusters with the n_clusters=10 parameter <br>
labels is set to the cluster number or label for each row in the data

In [52]:
# Cluster the organisms with k-means (KMeans is imported in the first cell).
# NOTE: k-means starts from randomly chosen centroids, so the cluster numbers
# can differ from one run to the next; pass random_state=<int> to KMeans if
# repeatable labels are needed.
k_means = KMeans(n_clusters=10).fit(data)   # fit() returns the fitted estimator
labels = k_means.labels_                    # one cluster number per row of data

In [53]:
# Print the cluster number assigned to each row, in row order.
print(labels)

[8 8 8 8 8 8 8 8 8 8 8 8 2 2 2 2 2 2 9 9 9 9 9 9 2 9 2 2 9 2 2 2 2 2 9 2 2
 9 2 9 2 2 9 2 9 2 2 2 4 2 2 2 9 2 2 2 9 2 2 9 2 9 9 9 4 9 2 2 2 4 2 2 4 4
 3 0 2 2 2 7 9 4 2 4 2 4 9 9 9 7 7 2 9 4 9 4 0 2 2 4 9 7 4 4 7 9 7 7 9 9 7
 3 7 9 9 7 9 9 7 9 4 4 4 9 2 4 4 7 0 4 3 4 4 7 6 7 3 4 9 0 9 3 0 4 0 4 7 0
 4 4 4 4 4 6 0 4 7 0 7 4 6 4 0 4 4 0 0 4 0 4 4 4 0 4 3 4 7 0 4 7 7 3 0 7 0
 6 0 7 7 0 6 6 0 4 0 0 0 0 0 0 0 7 6 4 4 7 0 0 7 4 0 4 4 0 0 4 3 7 0 0 0 0
 0 3 3 7 0 0 6 0 0 4 6 7 4 0 4 4 0 0 4 0 6 0 0 7 3 6 0 4 0 0 7 3 7 7 4 4 4
 6 7 4 4 0 6 0 0 7 0 3 0 4 7 0 3 0 6 0 6 4 6 0 3 0 0 0 4 0 0 0 3 4 0 0 6 0
 4 4 4 1 0 6 7 4 6 1 3 0 0 4 7 0 7 0 0 3 6 3 0 0 1 6 0 6 0 6 0 6 0 0 0 3 6
 3 3 0 0 3 0 6 6 0 6 1 7 1 0 0 0 0 3 0 7 1 0 6 6 0 4 6 4 0 0 0 6 0 0 0 5 6
 0 0 4 1 0 7 0 3 0 0 0 7 5 3 1 1 4 0 3 0 0 6 3 1 0 6 7 3 5 0 3 3 1 6 3 3 0
 0 6 1 6 3 6 6 6 1 6 6 3 4 1 6 6 1 6 1 5 5 6 0 5 1 1 0 0 6 6 1 1 3 6 0 0 1
 5 0 1 1 3 0 0 6 0 5 3 1 3 5 3 6 1 5 1 6 1 7 3 3 1 1 1 5 0 5 1 5 7 0 3 0 5
 1 7 7 3 6 6 0 1 1 1 3 0 

## Analyze the clusters
The data is stored in pandas DataFrames. For more information on these see __[Working with DataFrames](http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes)__ by Greg Reda. <br>
The 'cluster' column is added to the main dataframe and contains the cluster number from KMeans.

In [54]:
# Attach each organism's cluster number to the main table as a new column.
dataframe["cluster"] = labels

## Some ways to count values in each cluster

In [55]:
print("The number of items in each cluster using the numpy array")
# Position i of the result holds how many rows were labelled i.
np.bincount(labels)

The number of items in each cluster using the numpy array


array([136, 169,  46, 135,  73, 217, 126,  48,  12,  38], dtype=int64)

In [56]:
# Same counts via pandas, listed from the largest cluster to the smallest.
dataframe["cluster"].value_counts()

5    217
1    169
0    136
3    135
6    126
4     73
7     48
2     46
9     38
8     12
Name: cluster, dtype: int64

## Some ways to display all the items in a cluster

In [57]:
print("All the data items in cluster 8")
# A boolean mask keeps the rows labelled 8; only the first column
# (the organism name) is printed.
cluster8 = dataframe[dataframe["cluster"].eq(8)]
print(cluster8.iloc[:, :1])

All the data items in cluster 8
     otu name
0      OTU_12
1      OTU_11
2       OTU_5
3       OTU_9
4      OTU_45
5      OTU_23
6      OTU_52
7   OTU_12993
8   OTU_11397
9   OTU_12053
10     OTU_14
11  OTU_17200


In [58]:
print("All the data items in cluster 8")
# Leave the filtered frame as the cell's last expression so the notebook
# renders it as a table with every column.
dataframe[dataframe["cluster"].eq(8)]

All the data items in cluster 8


Unnamed: 0,otu name,16rRNA,Kingdom,Phylum,Class,Order,Family,Genus,Species,AK001,...,AK093,AK094,AK095,AK096,AK097,AK098,AK099,AK100,AK101,cluster
0,OTU_12,halospirulina sp. ef17(2012),k__bacteria,p__cyanobacteria,c__cyanobacteria,o__oscillatoriales,f__oscillatoriales,g__halospirulina,s__halospirulina sp.,4.417289,...,2.91698,2.322219,3.002166,2.436163,3.504878,0.0,0.0,3.850646,3.542327,8
1,OTU_11,gq001873.1 topographical and temporal human sk...,k__bacteria,p__firmicutes,c__bacilli,o__bacillales,f__staphylococcaceae,g__staphylococcus,s__staphylococcus epidermidis,3.76671,...,2.546543,2.800717,3.543199,2.868644,3.765147,0.0,0.0,2.71433,3.098298,8
2,OTU_5,ay281086.1 streptococcus sanguinis str. atcc 4...,k__bacteria,p__firmicutes,c__bacilli,o__lactobacillales,f__streptococcaceae,g__streptococcus,s__streptococcus sanguinis,2.633468,...,0.845098,1.748188,2.20412,1.908485,2.155336,0.0,0.0,2.829947,1.278754,8
3,OTU_9,dq178233.1 pseudomonas putida str. pc36,k__bacteria,p__proteobacteria,c__gammaproteobacteria,o__pseudomonadales,f__pseudomonadaceae,g__pseudomonas,s__pseudomonas putida,3.795115,...,1.342423,1.838849,1.322219,2.068186,1.826075,0.0,0.0,1.414973,2.947434,8
4,OTU_45,acidovorax anthurii,k__bacteria,p__proteobacteria,c__betaproteobacteria,o__burkholderiales,f__comamonadaceae,g__acidovorax,s__acidovorax anthurii,2.611723,...,1.740363,1.80618,3.09691,1.591065,2.783904,0.0,0.0,3.353724,1.986772,8
5,OTU_23,delftia tsuruhatensis,k__bacteria,p__proteobacteria,c__betaproteobacteria,o__burkholderiales,f__comamonadaceae,g__delftia,s__delftia tsuruhatensis,3.885813,...,1.857332,1.880814,2.805501,1.826075,1.623249,0.0,0.0,2.889862,1.832509,8
6,OTU_52,af130918.1 pantoea agglomerans str. new*47con,k__bacteria,p__proteobacteria,c__gammaproteobacteria,o__enterobacteriales,f__enterobacteriaceae,g__pantoea,s__pantoea agglomerans,2.824126,...,2.25042,2.257679,1.623249,2.41162,3.093772,0.30103,0.30103,1.959041,2.546543,8
7,OTU_12993,halospirulina sp. ef17(2012),k__bacteria,p__cyanobacteria,c__cyanobacteria,o__oscillatoriales,f__oscillatoriales,g__halospirulina,s__halospirulina sp.,3.471585,...,1.913814,1.20412,2.082785,1.462398,2.639486,0.0,0.0,2.904716,2.656098,8
8,OTU_11397,uncultured streptococcus sp.,k__bacteria,p__firmicutes,c__bacilli,o__lactobacillales,f__streptococcaceae,g__streptococcus,s__streptococcus spp.,2.245513,...,0.954243,1.491362,1.623249,1.70757,2.598791,0.0,0.0,1.886491,1.146128,8
9,OTU_12053,gq022177.1 topographical and temporal human sk...,k__bacteria,p__firmicutes,c__bacilli,o__bacillales,f__staphylococcaceae,g__staphylococcus,s__staphylococcus hominis,2.740363,...,1.732394,1.826075,2.63749,2.021189,2.870989,0.0,0.0,1.832509,2.164353,8


In [59]:
# One summary row per cluster: first() takes the first non-null value of each
# column within the group.
dataframe.groupby("cluster").first()

Unnamed: 0_level_0,otu name,16rRNA,Kingdom,Phylum,Class,Order,Family,Genus,Species,AK001,...,AK092,AK093,AK094,AK095,AK096,AK097,AK098,AK099,AK100,AK101
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,OTU_6,gq077257.1 topographical and temporal human sk...,k__bacteria,p__actinobacteria,c__actinobacteria,o__actinomycetales,f__micrococcaceae,g__rothia,s__rothia mucilaginosa,2.892095,...,0.0,0.0,0.30103,0.0,0.0,1.278754,0.0,0.0,1.30103,0.477121
1,OTU_268,uncultured streptomyces sp.,k__bacteria,p__actinobacteria,c__actinobacteria,o__actinomycetales,f__streptomycetaceae,g__streptomyces,s__streptomyces spp.,0.477121,...,0.477121,0.60206,1.0,0.30103,1.230449,0.477121,0.30103,0.0,0.60206,0.778151
2,OTU_20,ay879298.1 granulicatella paradiacens str. 044...,k__bacteria,p__firmicutes,c__bacilli,o__lactobacillales,f__carnobacteriaceae,g__granulicatella,s__granulicatella paradiacens,2.537819,...,0.90309,0.30103,1.113943,1.278754,1.278754,1.838849,0.0,0.0,0.845098,0.778151
3,OTU_8,ab271740.1 kurthia zopfii str. nbrc 101529,k__bacteria,p__firmicutes,c__bacilli,o__bacillales,f__planococcaceae,g__kurthia,s__kurthia gibsonii,0.0,...,2.220108,1.342423,2.049218,2.033424,2.201397,2.127105,0.0,0.0,1.662758,1.863323
4,OTU_56,eu672854.1 biogeography and nor5/om60 clade ox...,k__bacteria,p__proteobacteria,c__gammaproteobacteria,o__gammaproteobacteria,f__gammaproteobacteria,g__congregibacter,s__congregibacter litoralis,1.322219,...,0.778151,0.845098,1.361728,0.69897,1.431364,0.30103,0.0,0.0,0.69897,0.60206
5,OTU_490,uncultured methylophilus sp.,k__bacteria,p__proteobacteria,c__betaproteobacteria,o__methylophilales,f__methylophilaceae,g__methylophilus,s__methylophilus spp.,1.176091,...,0.0,0.30103,0.477121,0.477121,0.477121,0.0,0.0,0.0,0.0,0.0
6,OTU_141,uncultured acidobacterium sp.,k__bacteria,p__acidobacteria,c__acidobacteriia,o__acidobacteriales,f__acidobacteriaceae,g__acidobacterium,s__acidobacterium spp.,1.041393,...,0.0,0.477121,1.113943,0.60206,2.178977,0.69897,0.0,0.0,0.845098,0.30103
7,OTU_40,uncultured thermacetogenium sp.,k__bacteria,p__firmicutes,c__clostridia,o__thermoanaerobacterales,f__thermoanaerobacteraceae,g__thermacetogenium,s__thermacetogenium spp.,0.845098,...,0.954243,0.60206,1.681241,0.954243,2.800717,1.041393,0.0,0.0,3.569842,1.255273
8,OTU_12,halospirulina sp. ef17(2012),k__bacteria,p__cyanobacteria,c__cyanobacteria,o__oscillatoriales,f__oscillatoriales,g__halospirulina,s__halospirulina sp.,4.417289,...,3.350442,2.91698,2.322219,3.002166,2.436163,3.504878,0.0,0.0,3.850646,3.542327
9,OTU_135,uncultured cytophaga sp.,k__bacteria,p__bacteroidetes,c__cytophagia,o__cytophagales,f__cytophagaceae,g__cytophaga,s__cytophaga spp.,1.662758,...,0.477121,0.0,1.146128,0.30103,0.69897,0.778151,0.0,0.0,0.69897,1.568202


# Findings
Use the following [Markdown](http://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Working%20With%20Markdown%20Cells.html) boxes to record your findings<br>
How many clusters did you use? How did the data items fall into the clusters?<br>
Were the clusters consistent? <br>

