In [21]:
#core libraries needed
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

#scaling the data
from sklearn.preprocessing import StandardScaler

#different ways of splitting the data
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

#different classifiers that we can use (feel free to add to this list)
from sklearn.svm import SVC #support vector machine
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

#metrics for how well your classification did
from sklearn.metrics import precision_score, recall_score

In [42]:
#load the data
# NOTE(review): absolute, machine-specific path -- point this at your own data directory.
wd = '/home/brian/Documents/data/Classifier/developmental/'

#loop through the different ages and collect one dataframe per age.
# Each file's animal numbers are shifted by the highest number used so far so
# that they stay unique across files. Everything is concatenated once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame inside
# the loop re-copies all previous rows on every iteration.
frames = []
last_animal_num = 0 #highest animal number used so far (nothing loaded yet)
for i in np.arange(1,15):
    path = os.path.join(wd, 'P{0}_IC_metrics.tsv'.format(str(i)))
    data = pd.read_csv(path,sep='\t', index_col='exp_ic')
    data['anml']+=last_animal_num #adjust the animal number for each new dataset
    frames.append(data)
    #find the last animal number used (np.unique returns sorted values)
    last_animal_num = max(last_animal_num, np.unique(data['anml'])[-1])

main_data = pd.concat(frames)

print('We have this many rows of data: ', len(main_data))
print('We have these ages in the dataframe: ', np.unique(main_data['age']))
print('We have a total numer of animals: ', np.unique(main_data['anml'])[-1])

We have this many rows of data:  22085
We have these ages in the dataframe:  [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14.]
We have a total numer of animals:  37.0


In [None]:
#extract the classification target (the 'artifact' column) as its own Series;
# .copy() keeps it independent of later changes to main_data
labels = main_data.loc[:, 'artifact'].copy() #THESE ARE YOUR Y VALUES

In [43]:
#Prepare your X values/features you want to train your data on

#drop signal/artifact columns
main_data = main_data.drop(['signal', 'artifact'], axis=1)
#fill in the nans
main_data = main_data.fillna(0)
#check to see if there are any NaNs and signal is not present
# NaN never compares equal to anything (itself included), so
# `main_data == np.nan` is all-False and its sum is always 0 even when NaNs
# remain. isna() is the check that can actually fail.
assert not {'signal', 'artifact'} & set(main_data.columns)
main_data.isna().sum() #per-column count of remaining NaNs (expected: all 0)

age                    0
anml                   0
freq.avgsnr            0
freq.integrate         0
freq.maxsnr            0
freq.maxsnr.freq       0
freq.range.high        0
freq.range.low         0
freq.rangesz           0
length                 0
mass.perc              0
mass.region            0
mass.total             0
region.centroid.0      0
region.centroid.1      0
region.eccentricity    0
region.extent          0
region.majaxis         0
region.majmin.ratio    0
region.minaxis         0
region.orient          0
spatial.COMall.x       0
spatial.COMall.y       0
spatial.COMdom.x       0
spatial.COMdom.y       0
spatial.avg            0
spatial.max            0
spatial.min            0
spatial.n.domains      0
spatial.std            0
temporal.autocorr      0
temporal.max           0
temporal.min           0
temporal.n.freq        0
temporal.std           0
threshold.area         0
threshold.perc         0
dtype: int64

In [30]:
#partition the rows by whether a thresholded domain exists
# (a non-zero 'threshold.area' means the component has one)
has_domain = main_data['threshold.area'] != 0
domain = main_data.loc[has_domain].copy()
nodomain = main_data.loc[~has_domain].copy()

#preview the first rows of each subset
for subset in (domain, nodomain):
    print(subset.head())

                      age  anml  freq.avgsnr  freq.integrate  freq.maxsnr  \
exp_ic                                                                      
170725_02-03-04-0000  1.0   1.0     1.765045        0.710622     2.516710   
170725_02-03-04-0001  1.0   1.0     1.554507        0.337713     2.216408   
170725_02-03-04-0002  1.0   1.0     1.463165        0.060340     1.762951   
170725_02-03-04-0003  1.0   1.0     1.798323        0.469073     2.679099   
170725_02-03-04-0004  1.0   1.0     1.454975        0.154082     1.891668   

                      freq.maxsnr.freq  freq.range.high  freq.range.low  \
exp_ic                                                                    
170725_02-03-04-0000          2.292510         1.363135        0.312500   
170725_02-03-04-0001          2.500000         1.146255        0.202631   
170725_02-03-04-0002          2.102241         2.292510        0.481941   
170725_02-03-04-0003          2.292510         1.250000        0.220971   
170725_02-

In [31]:
#scale your data
# The scaler is fitted on the domain subset only and then applied to both subsets.
# NOTE(review): it is fitted before the train/test split and never on nodomain --
# confirm both choices are intended.
scaler = StandardScaler()
scaler.fit(domain.values)

#build new frames from the scaled arrays instead of assigning through
# `frame[:] = ...`; slice assignment of floats relies on implicit upcasting of
# any integer columns, which newer pandas versions deprecate.
domain = pd.DataFrame(scaler.transform(domain.values),
                      index=domain.index, columns=domain.columns)
nodomain = pd.DataFrame(scaler.transform(nodomain.values),
                        index=nodomain.index, columns=nodomain.columns)
print(domain.head())
print(nodomain.head())

                           age      anml  freq.avgsnr  freq.integrate  \
exp_ic                                                                  
170725_02-03-04-0000 -1.695142 -0.809464     0.327756       -0.471723   
170725_02-03-04-0001 -1.695142 -0.809464    -0.100879       -0.929913   
170725_02-03-04-0002 -1.695142 -0.809464    -0.286841       -1.270718   
170725_02-03-04-0003 -1.695142 -0.809464     0.395507       -0.768513   
170725_02-03-04-0004 -1.695142 -0.809464    -0.303515       -1.155538   

                      freq.maxsnr  freq.maxsnr.freq  freq.range.high  \
exp_ic                                                                 
170725_02-03-04-0000     0.348458          0.426496        -0.082504   
170725_02-03-04-0001     0.052631          0.627304        -0.343953   
170725_02-03-04-0002    -0.394068          0.242354         1.037863   
170725_02-03-04-0003     0.508426          0.426496        -0.218888   
170725_02-03-04-0004    -0.267269          0.242354     

In [34]:
print(domain.columns) # every feature that is available

#choose the features to train on.
#list the column headers you want to include in the ML training;
# the two subsets of the data may use different lists.
domain_features = ['feature1','feature2']
nodomain_features = ['feature3','feature4']

#default: keep ALL features by overriding the placeholder lists above.
#once you have filled in your own lists, comment this out or delete it.
domain_features, nodomain_features = domain.columns, nodomain.columns



Index(['age', 'anml', 'freq.avgsnr', 'freq.integrate', 'freq.maxsnr',
       'freq.maxsnr.freq', 'freq.range.high', 'freq.range.low', 'freq.rangesz',
       'length', 'mass.perc', 'mass.region', 'mass.total', 'region.centroid.0',
       'region.centroid.1', 'region.eccentricity', 'region.extent',
       'region.majaxis', 'region.majmin.ratio', 'region.minaxis',
       'region.orient', 'spatial.COMall.x', 'spatial.COMall.y',
       'spatial.COMdom.x', 'spatial.COMdom.y', 'spatial.avg', 'spatial.max',
       'spatial.min', 'spatial.n.domains', 'spatial.std', 'temporal.autocorr',
       'temporal.max', 'temporal.min', 'temporal.n.freq', 'temporal.std',
       'threshold.area', 'threshold.perc'],
      dtype='object')


In [44]:
# split your data to test and training datasets 

def splitData(dataFrame, #X values
              labels, #Y values
              n_splits=10, # the number of times you shuffle and split your data.
              test_size=0.30, # The size we want our testing sample (in this case 30% of the total data)
              random_state=None): # seed passed to StratifiedShuffleSplit (None = not seeded)
    """Split features and labels into one stratified train/test partition.

    A StratifiedShuffleSplit is asked for ``n_splits`` shuffles, but only the
    LAST one is returned; the earlier shuffles are discarded.

    Parameters
    ----------
    dataFrame : positionally indexable (``.iloc``) table of X values.
    labels : positionally indexable (``.iloc``) Y values, aligned row-for-row
        with ``dataFrame``.
    n_splits : number of shuffles generated before the final one is kept.
    test_size : size of the test set, forwarded to StratifiedShuffleSplit.
    random_state : forwarded to StratifiedShuffleSplit; pass an int to get the
        same partition on every run.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If the splitter yields no split at all.
    """
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                 random_state=random_state)

    # Walk through the splits keeping only the index arrays; the (comparatively
    # expensive) row selection is done once, for the split that is returned.
    train_index = test_index = None
    for train_index, test_index in sss.split(dataFrame, labels):
        pass

    if train_index is None:
        raise ValueError('splitData produced no splits (n_splits={0})'.format(n_splits))

    X_train, X_test = dataFrame.iloc[train_index], dataFrame.iloc[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    return X_train, X_test, y_train, y_test
    
#stratified shuffle split of each subset: select the chosen feature columns
# and look up the matching labels by index before splitting
domain_X = domain.loc[:,domain_features].copy()
domain_y = labels.loc[domain.index]
X_train, X_test, y_train, y_test = splitData(domain_X, domain_y)

nodomain_X = nodomain.loc[:,nodomain_features].copy()
nodomain_y = labels.loc[nodomain.index]
X2_train, X2_test, y2_train, y2_test = splitData(nodomain_X, nodomain_y)
