In [None]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import seaborn
import glob
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

## Read datasets into pandas dataframes

We have these in our open_work

In [None]:
# Identify files
# List every CSV directly inside datasets_folder and print the matches.
# NOTE(review): datasets_folder is an absolute path on one specific machine; the
# cells below locate the same kind of 'assembled' folder through Config instead.
# dataset_files is not referenced again in the visible cells — confirm it is still needed.
datasets_folder = 'C:/Projects/brainspin/not_pushed/data_anonymized/assembled'
dataset_files = glob.glob(os.path.join(datasets_folder, '*.csv'))
print(dataset_files)

In [None]:
# Load the project configuration and look up the directory registered under 'raw_data'.
# NOTE(review): Config.from_file() is called without arguments; which file it reads
# is decided inside cvasl.file_handler — confirm.
config = Config.from_file()
root_mri_directory = config.get_directory('raw_data')

In [None]:
# 'assembled' sub-folder of the configured raw-data directory; the CSVs are read from here below.
my_local = os.path.join(root_mri_directory,'assembled')
# Echo the resolved path so it can be checked by eye.
my_local

In [None]:
# Read files into dataframes
# Paths are built with os.path.join (as my_local is) rather than by string concatenation.
TOP = pd.read_csv(os.path.join(my_local, 'top_stitched.csv'))
StrokeMRI = pd.read_csv(os.path.join(my_local, 'StrokeMRI_stitched.csv'))
Insight46 = pd.read_csv(os.path.join(my_local, 'Insight46_stitched.csv'))
# df_list references the frames exactly as read from disk. Later cells rebind
# TOP / StrokeMRI / Insight46 to new frames, so df_list does not follow those changes.
df_list = [TOP, StrokeMRI, Insight46]


In [None]:
# Drop the row labelled 0 from each dataset.
# NOTE(review): presumably row 0 is a non-data row left over from stitching — confirm.
# errors='ignore' keeps this cell re-runnable: once label 0 is gone, a second run is a
# no-op instead of raising KeyError.
TOP = TOP.drop(index=0, errors='ignore')
StrokeMRI = StrokeMRI.drop(index=0, errors='ignore')
Insight46 = Insight46.drop(index=0, errors='ignore')


In [None]:
# Summary statistics from DataFrame.describe() for Insight46, stored and then displayed.
Insight46_described = Insight46.describe()
Insight46_described

In [None]:
# Summary statistics from DataFrame.describe() for StrokeMRI, stored and then displayed.
StrokeMRI_described = StrokeMRI.describe()
StrokeMRI_described

TOP was produced in a way that left more of its data non-numeric, so it needs an extra conversion step:

In [None]:
# Convert every TOP column to a numeric dtype. errors='coerce' replaces any value
# that cannot be parsed with NaN instead of raising.
TOP = TOP.apply(pd.to_numeric, errors='coerce')

# Summary statistics of the converted frame, stored and then displayed.
TOP_described = TOP.describe()
TOP_described

In [None]:
# Display the TOP frame (after the numeric coercion above) for visual inspection.
TOP

So datasets are very incomparable by age. We must compare similar age groups. Let's see if we can break the datasets down by age group?

In [None]:
# need to be updated to normalized new standard dataset

In [None]:
# Per-'Sex' group: number of non-null values in every column.
top_grouped_sex_count = TOP.groupby(['Sex']).count()
# Per-'Sex' group: mean of 'GM_vol'.
# NOTE(review): this mean is computed but never displayed in this cell; only the
# counts are shown below.
top_grouped_sex_mean = TOP.groupby(['Sex'])['GM_vol'].mean()
# (leftover example of the groupby pattern) df.groupby(["state", "gender"])["last_name"].count()
top_grouped_sex_count

That's odd, I thought men had bigger brains by volume. Let's see if an age split explains this.

In [None]:
# Per-'Sex' group: mean of 'Age', to check whether the two groups are of similar age.
top_grouped_sex_mean_age = TOP.groupby(['Sex'])['Age'].mean()
# (leftover example of the groupby pattern) df.groupby(["state", "gender"])["last_name"].count()
top_grouped_sex_mean_age

So if women are 1s then they have bigger brains...given that the ages are close. 

1. split out the age groups and compare averages, std, distribution etc. on all parameters
-----------------------------------------------------------------------------
0. meet to make sure dataframes are in fact correct
1a. compare TOP and StrokeMRI only? -> gives us a difference as allowable? gives us baseline on same measurements
1b. do polynomial fits on the error with age (per variability)
1c. investigate brain age gap on this (it should be zero)



1z. compare Insight46-> every other group's 70+/1 year
question of whether Insight46 people are not truly all healthy



by the way, let's look a little deeper on our data

In [None]:
from sklearn.decomposition import PCA



In [None]:
# Keep every column from position 5 onward (the first five columns are left out).
# NOTE(review): presumably those five are identifier/metadata columns — confirm.
top_skinny = TOP.iloc[:, 5:]
# Drop every row that has at least one missing value.
df_to_check = top_skinny.dropna()
df_to_check

In [None]:
# Share of variance the retained components must explain (variable name kept as originally spelled).
precent_of_variance_explained = .95

# A float in (0, 1) asks PCA to keep as many components as are needed to reach that share.
pca = PCA(n_components=precent_of_variance_explained)

# Fit on df_to_check: X is only assigned further down the notebook, so referencing it
# here raises NameError on a fresh kernel (Restart & Run All).
pca_data = pca.fit_transform(df_to_check)

In [None]:


# You must normalize the data before applying the fit method

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit_transform learns the per-column mean/std AND returns the standardized values.
# Previously the scaler was only fitted and the PCA still received the raw data.
df_to_check_scaled = scaler.fit_transform(df_to_check)

# PCA cannot return more components than min(n_samples, n_features).
pca = PCA(n_components=min(df_to_check_scaled.shape))
pca.fit(df_to_check_scaled)
# NOTE: standardizing changes the explained-variance curve; re-check the written
# interpretation of this plot after re-running.

# Reformat and view results
# Rows are the original columns, one 'PCk' column per fitted component (0-based).
loadings = pd.DataFrame(
    pca.components_.T,
    columns=['PC%s' % i for i in range(pca.n_components_)],
    index=df_to_check.columns,
)
print(loadings)

plt.plot(pca.explained_variance_ratio_)
plt.ylabel('Explained Variance')
plt.xlabel('Components')
plt.show()

## OK, so let's note we have a 94-column dataset, but only about 5 columns are really independent variables by this analysis.
Let's take a closer look at the first 10.

In [None]:
# Show the labels of the first ten columns of df_to_check.
df_to_check.columns[:10]

In [None]:
# Every column of df_to_check except the last two.
# NOTE(review): which two columns are being excluded (and why) is not visible here — confirm.
first_akk = df_to_check[df_to_check.columns[:-2]]
# X is the feature matrix used by the PCA cells below.
X = first_akk
X.columns

In [None]:
# Double brackets select 'Sex' as a one-column DataFrame (not a Series).
sex = df_to_check[['Sex']]
# y is attached as the 'Label' column for the plots below.
y = sex

In [None]:
# Work on an explicit copy of the feature matrix. pd.DataFrame(X) does not guarantee
# independent storage across pandas versions, so adding 'Label' to it could also add
# the column to X (and so to the PCA input).
df = X.copy()
# y is the one-column 'Sex' frame; it becomes the plotting label.
df['Label'] = y

df.head()

In [None]:
# (rows, columns) of the feature matrix handed to PCA.
X.shape

In [None]:
#Transform features
pca = PCA()
X_pca = pca.fit_transform(X)
X_pca.shape

In [None]:
#Merge with df
pca_df = pd.DataFrame(X_pca,columns=['PC%s' % _ for _ in range(X.shape[1])])
df = pd.merge(df, pca_df, right_index=True, left_index=True)

In [None]:
print('Explained Variance Ratio')
# Slice instead of range(10) so fewer than ten components does not raise IndexError.
# Labels are 0-based to match the 'PC0', 'PC1', ... column names used for the scores.
for i, ratio in enumerate(pca.explained_variance_ratio_[:10]):
    print('PC{}: {}'.format(i, ratio))

In [None]:
# The score columns are named from range(), so the first principal component is
# 'PC0' ('PC1' is the second one).
seaborn.stripplot(x="PC0", y="Label", data=df, jitter=True)
plt.title('Data Visualized in One Dimension');

In [None]:
# Display df (features, 'Label' and the merged PC score columns) for inspection.
df

In [None]:
# Scatter of WM_vol against GM_vol, coloured by 'Label'; fit_reg=False turns off the regression line.
seaborn.lmplot(data=df,x= 'WM_vol', y='GM_vol', hue='Label',fit_reg=False)
plt.title('Data Visualized in Two Dimensions');

In [None]:
# Scatter of GM_ICVRatio against GMWM_ICVRatio, coloured by 'Label'; fit_reg=False turns off the regression line.
seaborn.lmplot(data=df,x= 'GM_ICVRatio', y='GMWM_ICVRatio', hue='Label',fit_reg=False)
plt.title('Data Visualized in Two Dimensions');

In [None]:
# Scatter of WMH_count against WMH_vol, coloured by 'Label'; fit_reg=False turns off the regression line.
seaborn.lmplot(data=df,x= 'WMH_count', y='WMH_vol', hue='Label',fit_reg=False)
plt.title('Data Visualized in Two Dimensions');

In [None]:
# Share of variance the retained components must explain.
percent_of_variance_explained = .99

# A float in (0, 1) asks PCA to keep as many components as are needed to reach that share.
pca = PCA(n_components=percent_of_variance_explained)

# NOTE(review): X is passed in unstandardized here — confirm that is intended.
pca_data = pca.fit_transform(X)

# n_components_ is the number of components the fitted PCA kept.
print("{} Principal Components are required to explain {} of the variation in this data.".format(pca.n_components_,percent_of_variance_explained))



In [None]:

# first_10 is not assigned anywhere in the visible notebook, so this cell failed with
# NameError on a fresh kernel. Following the section text ("take a closer look at the
# first 10"), it is taken to be the first ten columns of df_to_check — confirm.
first_10 = df_to_check[df_to_check.columns[:10]]

# Standardize and actually use the result: the scaler used to be fitted only, with
# the PCA still receiving the raw values.
first_10_scaled = scaler.fit_transform(first_10)
n_pcs = 4
pca = PCA(n_components=n_pcs)
pca.fit(first_10_scaled)

# Reformat and view results
# Rows are the original columns, one 'PCk' column per component (0-based).
loadings = pd.DataFrame(
    pca.components_.T,
    columns=['PC%s' % i for i in range(n_pcs)],
    index=first_10.columns,
)
print(loadings)

plt.plot(pca.explained_variance_ratio_)
plt.ylabel('Explained Variance')
plt.xlabel('Components')
plt.show()

In [None]:
# One row per principal component, one column per original feature.
# Named component_map rather than `map`, so the builtin map() is not shadowed for
# the rest of the kernel session.
component_map = pd.DataFrame(pca.components_, columns=first_10.columns)
plt.figure(figsize=(12,6))
seaborn.heatmap(component_map, cmap='rocket');

# Anyways

In [None]:
# TOP_tiny = TOP[['GM_vol', 'WM_vol','Age']]
# TOP_tiny

In [None]:
# Stroke_tiny = StrokeMRI[['GM_vol', 'WM_vol','Age']]
# Stroke_tiny

In [None]:
# plt.scatter(Stroke_tiny['Age'],Stroke_tiny['GM_vol'], color='purple')
# plt.scatter(TOP['Age'],TOP['GM_vol'])

In [None]:

def relate_columns_graphs_two_dfs(dataframe1, dataframe2, special_column_name, other_column_name):
    """Scatter one shared column against another for two dataframes and save the plot.

    Only columns present in both dataframes may be used. dataframe1 is drawn in
    purple and dataframe2 in orange on the same axes. The figure is written to
    ``<other_column_name>versus<special_column_name>.png`` in the current working
    directory, shown, and then closed.

    :param dataframe1: first dataframe (purple points)
    :type dataframe1: pandas.DataFrame
    :param dataframe2: second dataframe (orange points)
    :type dataframe2: pandas.DataFrame
    :param special_column_name: name of the column plotted on the x-axis
    :type  special_column_name: str
    :param other_column_name: name of the column plotted on the y-axis
    :type  other_column_name: str

    :returns: no return, makes artifact
    :rtype: None.
    :raises KeyError: if a requested column is not present in both dataframes
    """
    shared_columns = dataframe1.columns.intersection(dataframe2.columns).to_list()

    # Fail with a readable message (same exception type as before) when a column is not shared.
    missing = [
        name for name in (special_column_name, other_column_name)
        if name not in shared_columns
    ]
    if missing:
        raise KeyError("column(s) not present in both dataframes: {}".format(missing))

    # Use a dedicated figure per call: drawing on the implicit current figure can
    # stack points from successive calls and leaves figures open when called in a loop.
    fig, ax = plt.subplots()
    ax.scatter(dataframe1[special_column_name], dataframe1[other_column_name], color='purple', alpha=0.5)
    ax.scatter(dataframe2[special_column_name], dataframe2[other_column_name], color='orange', alpha=0.5)
    ax.set_xlabel(special_column_name)
    ax.set_ylabel(other_column_name)
    # Save before showing, so the file is written while the figure is still open.
    fig.savefig(other_column_name + "versus" + special_column_name + ".png")
    plt.show(block=False)
    # Release the figure; one is created per column when this is called in a loop.
    plt.close(fig)

In [None]:
def plot_2on2_df(dataframe1, dataframe2, special_column):
    """Plot special_column against every column the two dataframes share.

    Calls relate_columns_graphs_two_dfs once per shared column, in the order the
    columns appear after restricting dataframe1 to the shared set. special_column
    itself is included in that loop.

    :param dataframe1: first dataframe
    :type dataframe1: pandas.DataFrame
    :param dataframe2: second dataframe
    :type dataframe2: pandas.DataFrame
    :param special_column: name of the column every other column is plotted against
    :type  special_column: str

    :returns: no return, one plot is produced per shared column
    :rtype: None.
    """
    common = dataframe1.columns.intersection(dataframe2.columns).to_list()
    for other_column in dataframe1[common].columns:
        relate_columns_graphs_two_dfs(dataframe1, dataframe2, special_column, other_column)

In [None]:
# Plot 'Age' against every column shared by TOP and StrokeMRI (one figure and PNG per column).
plot_2on2_df(TOP,StrokeMRI, 'Age')

In [None]:
# Same loop as the body of plot_2on2_df, written out inline for TOP vs StrokeMRI against 'Age'.
# NOTE(review): the cell above already calls plot_2on2_df(TOP, StrokeMRI, 'Age'), so
# this regenerates the same plots and overwrites the same PNG files.
shared_columns = (TOP.columns.intersection(StrokeMRI.columns)).to_list()
for rotator_column in TOP[shared_columns]:
    relate_columns_graphs_two_dfs(TOP, StrokeMRI,'Age', rotator_column)

In [None]:
# (rows, columns) of TOP.
TOP.shape

In [None]:
# (rows, columns) of StrokeMRI.
StrokeMRI.shape

In [None]:
# Column labels present in both TOP and StrokeMRI.
a = TOP.columns.intersection(StrokeMRI.columns)
a