In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import functools
import itertools
import fit_glm_helpers as fgh
import sglm.data_helpers


# Import Data

Inputted CSV data should conform to the following convention:

Indices / Unique Row Identifiers:
* File ID -- Any order is acceptable
* Trial ID -- Must be in chronological order, but does not need to start from zero
* TimeStamp ID -- Must be in chronological order, but does not need to start from zero

Columns (Predictors + Responses + Miscellaneous Information):
* Predictors
* Responses
* Miscellaneous Information

Example:
| id_session | id_trial | id_timestamp | predictor_1 | predictor_2 | predictor_3 | response_1 | response_2 |
| --- | --- | --- | --- | --- | --- | --- | --- |
| session_0 | trial_0 | -1 | 0 | 0 | 0 | 1 | 0.3 |
| session_0 | trial_0 | 0 | 0 | 0 | 0 | 0 | 1.4 |
| session_0 | trial_0 | 1 | 0 | 0 | 0 | 1 | 2.3 |
| session_0 | trial_0 | 2 | 0 | 1 | 0 | 1 | 0.3 |
| session_0 | trial_1 | -2 | 0 | 0 | 0 | 0 | 1.4 |
| session_0 | trial_1 | -1 | 0 | 0 | 0 | 1 | 2.3 |
| session_0 | trial_1 | 0 | 1 | 0 | 0 | 0 | 1.4 |
| session_0 | trial_1 | 1 | 0 | 0 | 0 | 1 | 2.3 |
| session_1 | trial_0 | 5 | 0 | 0 | 0 | 0 | 1.4 |
| session_1 | trial_0 | 6 | 1 | 0 | 0 | 1 | 2.3 |
| session_1 | trial_0 | 7 | 0 | 0 | 0 | 0 | 1.4 |
| session_1 | trial_0 | 8 | 0 | 0 | 0 | 1 | 2.3 |
| session_1 | trial_1 | 9 | 0 | 0 | 0 | 0 | 1.4 |
| session_1 | trial_1 | 10 | 0 | 0 | 0 | 1 | 2.3 |
| session_1 | trial_1 | 11 | 0 | 0 | 1 | 0 | 1.4 |
| session_1 | trial_1 | 12 | 0 | 0 | 0 | 0 | 2.3 |
| session_1 | trial_2 | 13 | 0 | 0 | 0 | 0 | 1.4 |
| session_1 | trial_2 | 14 | 0 | 0 | 0 | 0 | 2.3 |
| session_1 | trial_2 | 15 | 0 | 1 | 0 | 0 | 1.4 |
| session_1 | trial_2 | 16 | 0 | 0 | 0 | 1 | 2.3 |

# Define or Load Parameters Dictionary

In [18]:
# Location of the combined signal table.
# NOTE(review): absolute, machine-specific path -- point this at your own CSV before running.
source_csv_path = '/Users/josh/Desktop/example_output_folder/df_signal_all.csv'

# The four identifier columns become the row MultiIndex of the raw frame.
df_source_raw = pd.read_csv(
    source_csv_path,
    index_col=['session_id', 'nTrial', 'nEndTrial', 'timestamp'],
)

In [19]:
# Event-index columns of the source table and the short codes used for them.
event_short_codes = {
    'photometryCenterInIndex': 'CI',
    'photometryCenterOutIndex': 'CO',
    'photometrySideInIndex': 'SI',
    'photometrySideOutIndex': 'SO',
}

# Attributes crossed with every event column ('<event>=<attribute>') and their short codes.
attribute_short_codes = {
    'hasAllPhotometryData': 'hasData',
    'wasRewarded': 'rew',
    'word': 'wd',
}

parameters_glm_fit = dict(
    # Source column name -> short column name.
    predictors_toRename=dict(event_short_codes),
    # Source column to one-hot encode -> prefix given to the resulting dummy columns.
    # Ordered attribute by attribute, each across the four events.
    predictors_toUnroll={
        f'{event}={attribute}': f'{event_code}={attribute_code}'
        for attribute, attribute_code in attribute_short_codes.items()
        for event, event_code in event_short_codes.items()
    },
    # Predictors to shift and pass to the GLM; the '<code>=hasData' entries are left out.
    predictors=[
        predictor
        for event, event_code in event_short_codes.items()
        for predictor in (event, f'{event_code}=rew', f'{event_code}=wd')
    ],
    # Shift bounds applied to every predictor without an entry in 'predictors_shift_bounds'.
    predictors_shift_bounds_default=(-20, 40),
    # Per-predictor overrides, e.g. {'predictor_1': (-2, 2), 'predictor_2': (-2, 0)}.
    predictors_shift_bounds={},
    # Column used as the response.
    response='gDA',
    # Keyword arguments forwarded to the GLM fitting call.
    glm_keyword_arguments=dict(
        model_name='Normal',
        C=1e-6,
        solver='lbfgs',
        alpha=0,
        l1_ratio=0,
        max_iter=10000,
    ),
)

# TODO: JZ: Add Importing for Parameters

# BERNARDO
# Add a column for trues / falses
# Add the option to shift forward/backward the true/false


In [20]:
# Give the event-index columns their short names; all other columns keep their names.
df_renamed = df_source_raw.rename(columns=parameters_glm_fit['predictors_toRename'])

# One-hot encode every column listed in 'predictors_toUnroll'. pd.get_dummies names each
# resulting column '<prefix>_<value>', with the configured short name as the prefix.
lst_df_postUnroll = [
    pd.get_dummies(df_renamed[source_column], prefix=short_prefix)
    for source_column, short_prefix in parameters_glm_fit['predictors_toUnroll'].items()
]

# Final table: the renamed predictor columns (missing values replaced by 0) followed by the
# dummy columns. No other column of df_source_raw is carried over.
renamed_predictor_columns = list(parameters_glm_fit['predictors_toRename'].values())
df_source = pd.concat(
    [df_renamed[renamed_predictor_columns].fillna(0), *lst_df_postUnroll],
    axis=1,
)

In [22]:
# Temporarily raise the column display limit so every predictor column is rendered.
wide_display = pd.option_context('display.max_columns', 1000)
with wide_display:
    display(df_source)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CI,CO,SI,SO,CI=hasData_1.0,CI=rew_0.0,CI=rew_1.0,CI=wd_AA,CI=wd_AB,CI=wd_Aa,CI=wd_Ab,CI=wd_aA,CI=wd_aB,CI=wd_aa,CI=wd_ab,CO=hasData_1.0,CO=rew_0.0,CO=rew_1.0,CO=wd_AA,CO=wd_AB,CO=wd_Aa,CO=wd_Ab,CO=wd_aA,CO=wd_aB,CO=wd_aa,CO=wd_ab,SI=hasData_1.0,SI=rew_0.0,SI=rew_1.0,SI=wd_AA,SI=wd_AB,SI=wd_Aa,SI=wd_Ab,SI=wd_aA,SI=wd_aB,SI=wd_aa,SI=wd_ab,SO=hasData_1.0,SO=rew_0.0,SO=rew_1.0,SO=wd_AA,SO=wd_AB,SO=wd_Aa,SO=wd_Ab,SO=wd_aA,SO=wd_aB,SO=wd_aa,SO=wd_ab
session_id,nTrial,nEndTrial,timestamp,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
WT63_11102021,,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
WT63_11102021,,0.0,1,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
WT63_11102021,,0.0,2,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
WT63_11102021,,0.0,3,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
WT63_11102021,,0.0,4,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WT59_10152021,343.0,,44438,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
WT59_10152021,343.0,,44439,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
WT59_10152021,343.0,,44440,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
WT59_10152021,343.0,,44441,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
def _resolve_predictor_columns(predictor, columns, rename_map):
    """Return the column name(s) in ``columns`` that a configured predictor refers to.

    The configured name is first passed through ``rename_map`` (source name -> short name).
    If the resulting name is a column it is returned as is; otherwise it is treated as a
    one-hot prefix and every column named '<name>_<value>' is returned.

    Raises:
        KeyError: if no column matches the predictor.
    """
    column_name = rename_map.get(predictor, predictor)
    if column_name in columns:
        return [column_name]
    unrolled_columns = [col for col in columns if str(col).startswith(f'{column_name}_')]
    if not unrolled_columns:
        raise KeyError(f'Predictor {predictor!r} does not match any column of df_source')
    return unrolled_columns


shift_bounds_default = parameters_glm_fit['predictors_shift_bounds_default']
shift_bounds_overrides = parameters_glm_fit['predictors_shift_bounds']

# Pair every resolved column with its shift bounds. df_source holds the short names and the
# one-hot columns, not the names written in 'predictors', so they are resolved first.
# An override may be keyed by the resolved column or by the configured predictor name.
list_predictors_and_shifts = [
    (column, shift_bounds_overrides.get(column, shift_bounds_overrides.get(predictor, shift_bounds_default)))
    for predictor in parameters_glm_fit['predictors']
    for column in _resolve_predictor_columns(
        predictor, df_source.columns, parameters_glm_fit['predictors_toRename']
    )
]

list_predictors_shifted = []
for predictor, predictor_shift_bounds in list_predictors_and_shifts:
    # NOTE(review): presumably returns one column per shift within the bounds, computed
    # separately for each 'session_id' -- confirm against sglm.data_helpers.
    predictor_shifted = sglm.data_helpers.shift_series_range(
        df_source[predictor],
        predictor_shift_bounds,
        shift_bounding_column=['session_id']
    )
    list_predictors_shifted.append(predictor_shifted)
df_predictors_shifted = pd.concat(list_predictors_shifted, axis=1)

# df_source only keeps predictor columns, so the response is read from the raw table
# (same row index) whenever it is not present in df_source.
response_name = parameters_glm_fit['response']
response_table = df_source if response_name in df_source.columns else df_source_raw
srs_response = response_table[response_name]

In [None]:
# A row is usable for fitting when none of its shifted predictors is missing
# and its response value is not NaN.
predictors_complete = df_predictors_shifted.notna().all(axis=1)
response_present = ~np.isnan(srs_response)
non_nans = predictors_complete & response_present

# Copies, so later changes do not write back into the unfiltered data.
df_predictors_fit = df_predictors_shifted.loc[non_nans].copy()
srs_response_fit = srs_response.loc[non_nans].copy()

In [None]:
# TODO: JZ: Implement Train/Validation/Test Splitting

# Fit GLM Model

In [None]:
import sglm.fit_helpers

# Every GLM setting, including 'model_name', comes from the parameters dictionary.
# Passing model_name explicitly as well would supply the same keyword twice and raise
# "TypeError: ... got multiple values for keyword argument 'model_name'".
glm_model = sglm.fit_helpers.fit_GLM(
    df_predictors_fit,
    srs_response_fit,
    **parameters_glm_fit['glm_keyword_arguments'],
)

In [None]:
# Take the first row of coefficients. np.atleast_2d leaves a 2-D coef_ unchanged and also
# handles a 1-D coef_, where coef_[0] would be a single scalar repeated for every predictor.
# NOTE(review): whether coef_ is 1-D or 2-D depends on the fitted sglm model -- confirm.
coefficients = np.atleast_2d(glm_model.coef_)[0]

# One coefficient per shifted-predictor column; unstack(0) moves the first column level
# into the table's columns. Assumes the columns are a two-level index whose second level
# is an integer-like shift -- TODO confirm.
model_fit_results = pd.Series(coefficients, index=df_predictors_fit.columns, name='coef').unstack(0)
model_fit_results.index = model_fit_results.index.astype(int)

axes = model_fit_results.sort_index().plot()
# Trailing semicolon keeps the Text object returned by set_title out of the cell output.
axes.set_title(f'GLM Coefficients Fit Results — Response: {parameters_glm_fit["response"]}');