In [None]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import seaborn
import glob
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

## Read datasets into pandas dataframes

We have these in our open_work

In [None]:
# Locate every CSV export inside the assembled-data folder
datasets_folder = 'C:/Projects/brainspin/not_pushed/data_anonymized/assembled'
csv_pattern = os.path.join(datasets_folder, '*.csv')
dataset_files = glob.glob(csv_pattern)
print(dataset_files)

In [None]:
# Show the kernel's working directory. `!pwd` needs a POSIX shell, while this
# notebook uses Windows paths; os.getcwd() works on every platform.
os.getcwd()

In [None]:
# Read files into dataframes.
# Paths are built from `datasets_folder` with os.path.join instead of repeating
# the folder with a hard-coded '\\' separator, which only resolves on Windows.
TOP = pd.read_csv(os.path.join(datasets_folder, 'top_stitched.csv'))
StrokeMRI = pd.read_csv(os.path.join(datasets_folder, 'StrokeMRI_stitched.csv'))
#Insight46 = pd.read_csv('../open_work/internal_results\\Insight46_stitched.csv')
df_list = [TOP, StrokeMRI] #, Insight46]


In [None]:
# Remove the row labelled 0 from both frames.
# NOTE(review): presumably that row holds non-data (e.g. units) — confirm.
TOP = TOP.drop(index=0)
StrokeMRI = StrokeMRI.drop(index=0)


In [None]:
# describe() summary of StrokeMRI, kept in a variable and shown as the cell output
StrokeMRI_described = StrokeMRI.describe()
StrokeMRI_described

TOP was produced in a way that left more of its data non-numeric, so we do an extra step and coerce it to numeric:

In [None]:
# Force every TOP column to a numeric dtype; values that cannot be parsed become NaN
TOP = TOP.apply(pd.to_numeric, errors='coerce')
TOP_described = TOP.describe()
TOP_described

In [None]:
# Column labels present in both TOP and StrokeMRI
common_columns = list(TOP.columns.intersection(StrokeMRI.columns))

## Now we blend the two sets

In [None]:
# Stack both datasets, restricted to the columns they share (TOP rows first)
super_set = pd.concat([TOP[common_columns], StrokeMRI[common_columns]])
super_set

In [None]:
# Largest even number <= len(super_set), so rows can later be consumed in pairs.
# NOTE(review): this is measured on `super_set`, i.e. before the dropna() applied
# further down, so it can exceed the length of the filtered frame.
idx = len(super_set) - len(super_set) % 2

In [None]:
# Order by age so that neighbouring rows are similar subjects
super_set_sorted = super_set.sort_values('Age')
# Discard the 'session' column, then every row that still has a missing value
super_set_f = super_set_sorted.drop(columns='session').dropna()
super_set_f

In [None]:
# reset_index() without drop=True keeps the old row labels as a new 'index' column
# (that column is removed again later, after the noise has been added).
super_set_f = super_set_f.reset_index()
super_set_f

In [None]:

# Blend neighbouring rows pairwise: rows (0, 1) -> 0, rows (2, 3) -> 1, ...
# The pair count is taken from the frame that is actually grouped. The earlier
# `idx` was measured before dropna(), so with an odd number of remaining rows the
# last real row would have been passed through un-averaged.
n_paired = len(super_set_f) - len(super_set_f) % 2
new_set = (
    super_set_f.iloc[:n_paired]
    # positional pair ids, so the pairing does not depend on the index labels
    .groupby(np.arange(n_paired) // 2)
    .mean()
)


So we took our two datasets and blended them by simply mixing close values, i.e. averaging each pair of adjacent rows in the combined dataset once it is sorted by age. Now we can add a bit of noise just for good measure.

In [None]:
mu, sigma = 0, 0.01
# Fixed seed so that Restart & Run All reproduces the same synthetic set
NOISE_SEED = 42
rng = np.random.default_rng(NOISE_SEED)
# Size the noise from the data itself instead of the hard-coded [527, 25],
# which breaks the addition as soon as the input files change
noise = rng.normal(mu, sigma, new_set.shape)
print(noise)

Add the two

In [None]:
# Element-wise sum of the blended rows and the Gaussian noise
mixed_and_noise = new_set + noise
mixed_and_noise

## OK, so let's look at this in terms of the way it graphs

In [None]:
# Graph the blended, noised data against 'Age' with the project helper.
# NOTE(review): presumably one graph per column — confirm in cvasl.seperated.
sep.relate_columns_graphs(mixed_and_noise ,'Age')

Everything looks fine except sex, which we need to recode because sex will be binary for us, and longitudinal time point, which should be an int.

In [None]:
# 'index' is the bookkeeping column left over from reset_index(); errors='ignore'
# keeps this cell re-runnable once the column is already gone.
mixed_and_noise = mixed_and_noise.drop(columns='index', errors='ignore')
# as we added noise, we can just round and denoise
mixed_and_noise['Sex'] = mixed_and_noise['Sex'].round()
# the time point should be a true integer, so cast after rounding
mixed_and_noise['LongitudinalTimePoint'] = (
    mixed_and_noise['LongitudinalTimePoint'].round().astype(int)
)
mixed_and_noise

In [None]:
## We are done. This gives us a synthetic/augmented dataset based upon our existing ones. Pending approval

In [None]:
# Write the synthetic dataset to disk.
# NOTE(review): `index` is left at its default, so the row index is written as
# well — confirm that is wanted. The target is a machine-specific absolute path.
mixed_and_noise.to_csv('C:/Projects/brainspin/not_pushed/synthetic_set.csv')

In [None]:

def relate_columns_graphs_two_dfs(
        dataframe1,
        dataframe2,
        special_column_name,
        other_column_name,
        color1='purple',
        color2='orange',
):
    """
    Overlay two dataframes in a single scatter plot of one column pair.

    Helper for the routine that plots every column two dataframes share.
    Both frames are reduced to their common columns, drawn on the same axes
    (dataframe1 first, dataframe2 on top), and the figure is saved in the
    current working directory as
    ``<other_column_name>versus<special_column_name>.png`` before being shown.

    :param dataframe1: first dataframe to plot
    :type dataframe1: pandas.dataFrame
    :param dataframe2: second dataframe to plot
    :type dataframe2: pandas.dataFrame
    :param special_column_name: column placed on the x axis
    :type  special_column_name: str
    :param other_column_name: column placed on the y axis
    :type  other_column_name: str
    :param color1: marker colour for dataframe1
    :type  color1: str
    :param color2: marker colour for dataframe2
    :type  color2: str
    :returns: no return, makes artifact
    :rtype: None.
    """
    common = dataframe1.columns.intersection(dataframe2.columns).to_list()

    # Both subsets are built before anything is drawn, as in the original order.
    layers = (
        (dataframe1[common], color1),
        (dataframe2[common], color2),
    )
    for frame, colour in layers:
        plt.scatter(
            frame[special_column_name],
            frame[other_column_name],
            color=colour,
            alpha=0.4,
        )

    plt.xlabel(special_column_name)
    plt.ylabel(other_column_name)
    file_name = "".join(
        [other_column_name, "versus", special_column_name, ".png"])
    plt.savefig(file_name)
    plt.show(block=False)


def plot_2on2_df(dataframe1,
                 dataframe2,
                 special_column,
                 color1='purple',
                 color2='orange',):
    """
    Compare two datasets column by column against one variable of interest.

    For every column the two dataframes have in common (the variable of
    interest itself included), one overlaid scatter plot is produced by
    ``relate_columns_graphs_two_dfs``.

    :param dataframe1: first dataframe to plot
    :type dataframe1: pandas.dataFrame
    :param dataframe2: second dataframe to plot
    :type dataframe2: pandas.dataFrame
    :param special_column: name of the column every other column is graphed against
    :type  special_column: str
    :param color1: marker colour for dataframe1
    :type  color1: str
    :param color2: marker colour for dataframe2
    :type  color2: str
    :returns: no return, makes artifact
    :rtype: None.
    """
    common = dataframe1.columns.intersection(dataframe2.columns).to_list()
    palette = {'color1': color1, 'color2': color2}
    for column in dataframe1[common].columns:
        relate_columns_graphs_two_dfs(
            dataframe1, dataframe2, special_column, column, **palette)

In [None]:
# Overlay the synthetic set (red) on the real TOP data (default orange): one
# scatter plot per shared column against 'Age', each also saved as a PNG.
plot_2on2_df(mixed_and_noise, TOP, 'Age', color1='red')

Now let's do some polynomial fitting based on this dataset.

In [None]:
# Example points
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 3, 5, 8, 13])

# Degree of the polynomial
degree = 2

# Fit the best polynomial (least squares)
coefficients = np.polyfit(x, y, deg=degree)

# Print the coefficients
print("Coëfficiënten:", coefficients)

In [None]:
def polyfit_and_show(
        dataframe,
        special_column_name,
        other_column_name,
        color1='purple',
        color2='orange',
):
    """
    Fit a 2nd-degree polynomial to one column as a function of another
    and draw the fitted curve on top of a scatter plot of the data.

    The fit only uses rows where both values are finite. The curve is
    evaluated on a dense grid between the smallest and largest x value, so
    that it is drawn as a parabola rather than as a straight chord between
    the two end points.

    :param dataframe: dataframe holding both columns
    :type dataframe: pandas.dataFrame
    :param special_column_name: column used as x (independent variable)
    :type  special_column_name: str
    :param other_column_name: column used as y (fitted variable)
    :type  other_column_name: str
    :param color1: colour of the scatter markers
    :type  color1: str
    :param color2: colour of the fitted curve
    :type  color2: str
    :returns: polynomial coefficients, highest power first
    :rtype: numpy.ndarray
    """
    x = np.asarray(dataframe[special_column_name], dtype=float)
    y = np.asarray(dataframe[other_column_name], dtype=float)

    # np.polyfit cannot cope with NaN/inf, so fit on complete pairs only.
    finite = np.isfinite(x) & np.isfinite(y)
    x_fit, y_fit = x[finite], y[finite]

    degree = 2
    coefficients = np.polyfit(x_fit, y_fit, degree)
    print("Coëfficiënten 2nd degree polynomial:", coefficients)

    # Two points can only ever draw a straight line; sample the curve densely.
    curve_x = np.linspace(x_fit.min(), x_fit.max(), 200)
    curve_y = np.polyval(coefficients, curve_x)
    plt.plot(curve_x, curve_y, color=color2)
    plt.scatter(
        dataframe[special_column_name],
        dataframe[other_column_name],
        color=color1,
        alpha=0.4,
    )

    return coefficients
    
    

In [None]:
# Quadratic fit of grey-matter volume against age on the synthetic set
polyfit_and_show(mixed_and_noise, 'Age', 'GM_vol')

In [None]:
# NOTE(review): leftover scratch cell. In Python `^` is bitwise XOR, so this
# evaluates to 1; use `4**5` if exponentiation was meant.
4^5