# Sample ML notebook

Notes: this notebook is a demo that gives scientists a flavor of what is possible when interacting 
    with tabular data for ML. Results are not final.

## import libraries needed

In [1]:
#basic ds
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#basic system
import sys
import os
import glob


# math and signals
import math
from scipy.stats import entropy
from scipy.signal import savgol_filter
from scipy.signal import find_peaks
# demo stuff
import ipywidgets as widgets
import seaborn 

KeyboardInterrupt: 

In [None]:
# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
import joblib

## import data

In [None]:
# Path of the stitched results CSV, relative to the notebook's working directory.
our_data_now = 'internal_results/top_stitched.csv'
# Read the CSV into a DataFrame; the bare name on the last line renders it as the cell output.
our_data = pd.read_csv(our_data_now)
our_data

Clean NaNs out of the data (every row that contains a missing value is dropped)

In [None]:
# Keep only complete rows: dropna() with default arguments removes any row containing a missing value.
our_data = our_data.dropna()

## pick and visualize correlations

Here you will hold down the 'Ctrl' key (on most machines) and click to select multiple columns (which may become features for machine learning) whose correlations you want to visualize

In [None]:
# Multi-select box listing every column of the table; 'GM_vol' starts out selected.
features = widgets.SelectMultiple(
    options=list(our_data.columns),
    value=['GM_vol'],
    description='Features',
    disabled=False,
)
# Bare name as the last expression renders the widget below the cell.
features

In [None]:
# Snapshot the widget's current selection as a plain list of column names.
features_list = list(features.value)

# Display the chosen feature columns.
features_list

Next you will choose your label

In [None]:
# Drop-down for the label (target) column; 'Age' is the initial choice.
label = widgets.Dropdown(
    options=list(our_data.columns),
    value='Age',
    description='label',
    disabled=False,
)
# Bare name as the last expression renders the widget below the cell.
label

In [None]:
# Record the column currently chosen in the label drop-down.
label_column = label.value 

In [None]:
# Column list for the correlation / ML matrix: the selected features followed by the label.
# NOTE: despite its name, x_column holds the label (target) column; the name is kept so
# other cells that refer to it keep working.
x_column = [label.value]
# Filter the label out of the features first, so choosing the same column in both widgets
# does not yield a duplicated column name when indexing our_data[full_matrix].
full_matrix = [col for col in features_list if col not in x_column] + x_column

In [None]:
# Preview the selected feature and label columns.
# NOTE(review): [1:] skips the first row of the cleaned table -- confirm this is intended.
our_data[full_matrix][1:]

Note: pandas defaults the correlation method to 'pearson'. We need to discuss with the scientists whether another correlation is better; the 'kendall' and 'spearman' methods are also available.

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Heat map of the pairwise correlations of the selected columns, with each coefficient annotated.
# NOTE(review): [1:] skips the first row of the cleaned table -- confirm this is intended.
seaborn.heatmap(our_data[full_matrix][1:].corr(), annot = True)

Now you may want to graph a relationship, to see if there are hints that a weighted correlation is interesting

In [None]:
# Drop-down choosing the column plotted on the x-axis; 'Age' is the initial choice.
element_a = widgets.Dropdown(
    options=list(our_data.columns),
    value='Age',
    description='x-axis',
    disabled=False,
)
# Bare name as the last expression renders the widget below the cell.
element_a

In [None]:
# Drop-down choosing the column plotted on the y-axis; 'Age' is the initial choice.
element_b = widgets.Dropdown(
    options=list(our_data.columns),
    value='Age',
    description='y-axis',
    disabled=False,
)
# Bare name as the last expression renders the widget below the cell.
element_b

In [None]:
#our_data[element_b.value]

In [None]:
# Coerce both chosen columns to a numeric dtype. pd.to_numeric is applied to the whole
# column at once (instead of element by element via .apply) and raises on unparseable values.
our_data[element_a.value] = pd.to_numeric(our_data[element_a.value])
our_data[element_b.value] = pd.to_numeric(our_data[element_b.value])

# Scatter plot of the two columns; label the axes so the figure stands on its own.
plt.scatter(our_data[element_a.value], our_data[element_b.value])
plt.xlabel(element_a.value)
plt.ylabel(element_b.value)
plt.show()

In [None]:
# Matrix used for modelling: the selected feature and label columns, skipping the first row.
ml_matrix = our_data.loc[:, full_matrix].iloc[1:]

In [None]:
#ml_matrix

Here we will assume you want to build a model on age, but this could also be a menu

In [None]:
# Feature matrix: every column of ml_matrix except the 'Age' target, as a float NumPy array.
X = ml_matrix.drop(columns='Age').values.astype('float')

In [None]:
# Target vector: the 'Age' column as a NumPy array cast to integers.
y = ml_matrix['Age'].values.astype('int')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Show the dimensions of the training feature matrix.
X_train.shape

In [None]:
# Scale features: fit the scaler on the training split only, then apply it to both splits.
# Note: X_train / X_test are overwritten, so re-running this cell rescales already-scaled data.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Here we should add a drop-down to pick your algorithm
TODO: planned for the next version.

In [None]:
# Logistic regression (liblinear solver, fixed seed) fitted on the scaled training split;
# the integer-cast target values are used as the class labels.
logr = LogisticRegression(solver='liblinear', random_state=0)
logr.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the logistic regression model.
y_pred = logr.predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted value matches the true one exactly.
print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f}')

Not exactly shocking that the accuracy of logistic regression is low: it is a classifier, and age is a continuous target... we should use a linear regression. 

In [None]:
#ml_matrix

In [None]:
# Ordinary linear regression with default settings, fitted on the scaled training split.
linr = LinearRegression()
linr.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the linear regression model (overwrites the earlier y_pred).
y_pred = linr.predict(X_test)

In [None]:

# Report how well the linear regression does on the held-out rows.
print(f'R2 score Linear regression: {linr.score(X_test, y_test):.3f}')
print(f'Explained variance score: {metrics.explained_variance_score(y_test, y_pred):.3f}')

Interesting. Let's try a linear regression on GM_ICV alone, to compare how much adding these other variables helped, if at all. 
Also we will add a drop-down menu to pick models, then run them...

But then we need to figure out if the user wants to save the model

In [None]:
# Toggle button the user switches on to request that the model be saved.
saving = widgets.ToggleButton(
    value=False,
    description='Click me to save model',
    disabled=False,
    # button_style may be 'success', 'info', 'warning', 'danger' or ''
    button_style='success',
    tooltip='Description',
    # icon takes a FontAwesome name without the `fa-` prefix
    icon='check',
)

# Bare name as the last expression renders the button below the cell.
saving

In [None]:
# Ask for a file name only when the save toggle is switched on.
# NOTE(review): when the toggle is off, file_given_name is never assigned in this cell.
if saving.value:
    print('You need to name your file, then hit enter')
    file_given_name = input()
    

    

In [None]:
# Make sure the folder that will hold saved models exists.
# exist_ok=True makes this a no-op when the folder is already there and avoids the
# check-then-create race of testing os.path.exists() first.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
# Save the fitted linear regression, but only when the user switched the save toggle on.
# Without this guard the cell raises NameError on "Run All", because file_given_name is
# only assigned when the toggle is on.
if saving.value:
    joblib.dump(linr, ('../result_models/' + file_given_name + '.sav'))
else:
    print('Model not saved: switch the save toggle on and re-run the file-name cell first.')
    

In [None]:
# Decision-tree classifier on the integer-cast target. random_state is pinned so repeated
# runs build the same tree, matching the fixed seeds already used for the train/test split
# and the logistic regression.
dt = tree.DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the decision tree (overwrites the earlier y_pred).
y_pred = dt.predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted value matches the true one exactly.
print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f}')

Decision tree did really poorly... not a surprise, it's not a good model for a continuous variable. We can also examine in which direction it went wrong with some fancy graphing... if scientists request it. 

In [None]:

# LassoLars regression with regularisation strength alpha=0.01, fitted on the scaled training split.
llreg = linear_model.LassoLars(alpha=0.01)
llreg.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the LassoLars model and report its scores.
y_pred = llreg.predict(X_test)
# The label names LassoLars: the original text said "Linear regression", a copy-paste slip
# that made this output indistinguishable from the plain linear regression cell.
print('R2 score LassoLars regression: %.3f' % llreg.score(X_test, y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred))

In [None]:

# Linear model trained by stochastic gradient descent. random_state is pinned so the
# shuffling of the training data is repeatable and the reported scores do not change
# between runs, matching the fixed seeds used elsewhere in this notebook.
sgdr = SGDRegressor(random_state=0)
sgdr.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the SGD regressor (overwrites the earlier y_pred).
y_pred = sgdr.predict(X_test)

In [None]:

# Report how well the SGD regressor does on the held-out rows.
print(f'R2 score SGDR regression: {sgdr.score(X_test, y_test):.3f}')
print(f'Explained variance score: {metrics.explained_variance_score(y_test, y_pred):.3f}')

# Conclusions:

1. Notebook needs work with scientists for most convenient way to run experiments.
2. Linear and stochastic gradient descent regressors work best for a model based on ratios and white matter
3. Decision tree and logistic regression work poorly FOR AGE, as one would predict... they could be promising for sex (the code is there for that but not run)
4. we probably want to try a Huber regressor

In [None]:
# Number of rows left in the table after the NaN rows were dropped.
len(our_data)