This notebook gives some examples of how to read data files into data frames (using pandas), and how to label and remove columns.

In [2]:
# Here, I'm loading some python libraries that will be useful.

from sklearn import datasets
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

In [None]:
# Option 1: import a data file by uploading it from your computer.
# Running this cell will open a prompt to browse for files.
# I won't use this method (so I haven't run this cell), but feel free to try it.
# The method I actually use is in the next cell.

from google.colab import files
# Keep whatever files.upload() returns so the uploaded data can be used afterwards.
uploaded = files.upload()

In [3]:
# Option 2: open files that are saved in your Google Drive.
# Running this will prompt you to give the notebook permission to access your Google Drive.
# The Drive is mounted at /content/drive, so files are read from under that folder.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# You can read in a .csv file from your Google Drive if you have run the previous cell
# (importing drive from google.colab) and given permission for the notebook to access your Drive.

# Folder on the mounted Drive that holds the mass spec .csv files.
# This is an absolute path (the Drive is mounted at /content/drive), so reading the
# files does not depend on the notebook's current working directory.
MS_DATA_DIR = '/content/drive/My Drive/data/MS'


def read_mass_spectra(filename, label, n_spectra=5):
    """Read one mass spec .csv file into a dataframe with clearly named columns.

    pd.read_csv uses pandas and puts the data in a dataframe, which we can give
    to ML methods later.

    Parameters
    ----------
    filename : str
        Name of the .csv file inside MS_DATA_DIR, e.g. 'Lime_sample_1.csv'.
    label : str
        Sample name used to build the column names, e.g. 'Lime'.
    n_spectra : int, default 5
        Number of mass spectra (intensity columns) in the file.

    Returns
    -------
    pandas.DataFrame
        The first column of the data file is the mz value, so it is named
        'mz_<label in lower case>' (e.g. 'mz_lime'). The intensity columns are
        named '<label>1', '<label>2', ... (e.g. 'Lime1' ... 'Lime5').

    Raises
    ------
    ValueError
        Raised by pandas if the file does not have exactly 1 + n_spectra columns.
    """
    df = pd.read_csv(MS_DATA_DIR + '/' + filename)
    intensity_names = [label + str(i) for i in range(1, n_spectra + 1)]
    df.columns = ['mz_' + label.lower()] + intensity_names
    return df


# Here, I'll read in one of the mass spec data files (a Lime sample).
# The columns are named 'mz_lime', 'Lime1', ..., 'Lime5'.
df_lime = read_mass_spectra('Lime_sample_1.csv', 'Lime')

# You will want to read in all of the data files; call read_mass_spectra once per file.
# In this example, I read one of the Lemon samples.
# The columns are named 'mz_lemon', 'Lemon1', ..., 'Lemon5'.
df_lemon = read_mass_spectra('Lemon_sample_1.csv', 'Lemon')

In [5]:
# We can display the dataframe to check what it looks like.
# (Ending a cell with just the variable name shows its contents.)
df_lime

Unnamed: 0,mz_lime,Lime1,Lime2,Lime3,Lime4,Lime5
0,9.900000,3.566247e-06,8.163455e-06,1.801872e-06,0.000009,4.561305e-06
1,9.950000,3.566247e-06,8.163455e-06,1.801872e-06,0.000009,4.561305e-06
2,10.000000,4.353816e-06,4.522634e-06,8.955635e-07,0.000007,2.648715e-06
3,10.050000,3.822895e-06,4.290223e-06,4.590969e-06,0.000007,2.151427e-06
4,10.100000,3.455640e-06,5.472827e-06,2.643489e-06,0.000007,6.301590e-07
...,...,...,...,...,...,...
39772,1998.500000,8.229950e-07,1.585012e-06,1.003453e-06,0.000002,0.000000e+00
39773,1998.550049,7.566306e-07,2.145112e-06,1.219228e-06,0.000001,1.063408e-06
39774,1998.599976,3.157003e-06,6.555599e-07,1.774903e-06,0.000002,2.473911e-06
39775,1998.650024,0.000000e+00,7.302742e-07,1.435006e-06,0.000003,0.000000e+00


In [6]:
# And here is the dataframe for the second file that we read in (the Lemon sample).
df_lemon

Unnamed: 0,mz_lemon,Lemon1,Lemon2,Lemon3,Lemon4,Lemon5
0,9.900000,9.386688e-06,0.000005,1.040731e-05,0.000006,6.562673e-06
1,9.950000,9.386688e-06,0.000005,1.040731e-05,0.000006,6.562673e-06
2,10.000000,8.429838e-06,0.000007,1.272107e-05,0.000008,4.316771e-06
3,10.050000,9.633747e-06,0.000008,1.114547e-05,0.000007,5.319957e-06
4,10.100000,6.962591e-06,0.000005,9.929920e-06,0.000006,4.974654e-06
...,...,...,...,...,...,...
39772,1998.500000,2.071736e-06,0.000000,6.839992e-07,0.000002,1.208086e-06
39773,1998.550049,8.101359e-07,0.000002,1.259683e-06,0.000002,1.536073e-06
39774,1998.599976,1.508652e-06,0.000002,1.818310e-06,0.000005,2.321898e-06
39775,1998.650024,1.620275e-06,0.000004,6.478957e-07,0.000002,2.002568e-06


In [7]:
# Combine the two dataframes side by side so that everything is together in one place.
# axis='columns' puts the Lemon columns to the right of the Lime columns;
# pandas lines the rows up by their index labels.
df_merged = pd.concat(objs=[df_lime, df_lemon], axis='columns')

# Display the merged dataframe to see what it looks like.
df_merged

Unnamed: 0,mz_lime,Lime1,Lime2,Lime3,Lime4,Lime5,mz_lemon,Lemon1,Lemon2,Lemon3,Lemon4,Lemon5
0,9.900000,3.566247e-06,8.163455e-06,1.801872e-06,0.000009,4.561305e-06,9.900000,9.386688e-06,0.000005,1.040731e-05,0.000006,6.562673e-06
1,9.950000,3.566247e-06,8.163455e-06,1.801872e-06,0.000009,4.561305e-06,9.950000,9.386688e-06,0.000005,1.040731e-05,0.000006,6.562673e-06
2,10.000000,4.353816e-06,4.522634e-06,8.955635e-07,0.000007,2.648715e-06,10.000000,8.429838e-06,0.000007,1.272107e-05,0.000008,4.316771e-06
3,10.050000,3.822895e-06,4.290223e-06,4.590969e-06,0.000007,2.151427e-06,10.050000,9.633747e-06,0.000008,1.114547e-05,0.000007,5.319957e-06
4,10.100000,3.455640e-06,5.472827e-06,2.643489e-06,0.000007,6.301590e-07,10.100000,6.962591e-06,0.000005,9.929920e-06,0.000006,4.974654e-06
...,...,...,...,...,...,...,...,...,...,...,...,...
39772,1998.500000,8.229950e-07,1.585012e-06,1.003453e-06,0.000002,0.000000e+00,1998.500000,2.071736e-06,0.000000,6.839992e-07,0.000002,1.208086e-06
39773,1998.550049,7.566306e-07,2.145112e-06,1.219228e-06,0.000001,1.063408e-06,1998.550049,8.101359e-07,0.000002,1.259683e-06,0.000002,1.536073e-06
39774,1998.599976,3.157003e-06,6.555599e-07,1.774903e-06,0.000002,2.473911e-06,1998.599976,1.508652e-06,0.000002,1.818310e-06,0.000005,2.321898e-06
39775,1998.650024,0.000000e+00,7.302742e-07,1.435006e-06,0.000003,0.000000e+00,1998.650024,1.620275e-06,0.000004,6.478957e-07,0.000002,2.002568e-06


In [8]:
# Remove both mz columns so that only the intensity columns are left.
# - 'mz_lemon' is removed because it is taken to be the same as 'mz_lime'
#   (the rows displayed above agree; check the full columns if in doubt).
# - 'mz_lime' is removed as well, since we only want the intensities here.
#
# drop() returns a new dataframe instead of changing df_merged in place, and
# errors='ignore' skips columns that are already gone. This means the cell can
# be run more than once without raising a KeyError (which `del` would do).
df_merged = df_merged.drop(columns=['mz_lemon', 'mz_lime'], errors='ignore')

df_merged

Unnamed: 0,Lime1,Lime2,Lime3,Lime4,Lime5,Lemon1,Lemon2,Lemon3,Lemon4,Lemon5
0,3.566247e-06,8.163455e-06,1.801872e-06,0.000009,4.561305e-06,9.386688e-06,0.000005,1.040731e-05,0.000006,6.562673e-06
1,3.566247e-06,8.163455e-06,1.801872e-06,0.000009,4.561305e-06,9.386688e-06,0.000005,1.040731e-05,0.000006,6.562673e-06
2,4.353816e-06,4.522634e-06,8.955635e-07,0.000007,2.648715e-06,8.429838e-06,0.000007,1.272107e-05,0.000008,4.316771e-06
3,3.822895e-06,4.290223e-06,4.590969e-06,0.000007,2.151427e-06,9.633747e-06,0.000008,1.114547e-05,0.000007,5.319957e-06
4,3.455640e-06,5.472827e-06,2.643489e-06,0.000007,6.301590e-07,6.962591e-06,0.000005,9.929920e-06,0.000006,4.974654e-06
...,...,...,...,...,...,...,...,...,...,...
39772,8.229950e-07,1.585012e-06,1.003453e-06,0.000002,0.000000e+00,2.071736e-06,0.000000,6.839992e-07,0.000002,1.208086e-06
39773,7.566306e-07,2.145112e-06,1.219228e-06,0.000001,1.063408e-06,8.101359e-07,0.000002,1.259683e-06,0.000002,1.536073e-06
39774,3.157003e-06,6.555599e-07,1.774903e-06,0.000002,2.473911e-06,1.508652e-06,0.000002,1.818310e-06,0.000005,2.321898e-06
39775,0.000000e+00,7.302742e-07,1.435006e-06,0.000003,0.000000e+00,1.620275e-06,0.000004,6.478957e-07,0.000002,2.002568e-06
