In [1]:
# import glm_utils
import glm_utils.preprocessing, glm_utils.bases
import matplotlib.pyplot as plt
import numpy as np
import sklearn.model_selection, sklearn.metrics, sklearn.linear_model
from scipy.stats import zscore, binned_statistic, pearsonr
import os
import pickle
import pandas as pd
from DN_tools import load_into_pandas, bases_dict, load_recording, get_xy, lp, chunked_test_train_split
from tqdm import tqdm

# Parameters

In [6]:
# --- Locations ---------------------------------------------------------------
raw_data_dir_path = 'W:/apalaci/code/janache'
results_dir_path = 'W:/apalaci/code/janache/res'
# Folder that receives one pickle per (recording, target variable).
datefolder = f'{results_dir_path}/2025_glms_saving_idxs'

# --- Run control -------------------------------------------------------------
# When False, the analysis cell below is skipped entirely.
redo_analysis = True
# Written into the output file names as "isplit<N>".
isplit = 0

# --- Signal / binning --------------------------------------------------------
# sample_frequency / bin_width is used as the rate of the binned signal when
# the stored recording duration is computed (presumably Hz - TODO confirm).
sample_frequency = 20000
bin_width = 100
# NOTE(review): not referenced by the cells visible here.
decimating_values = [10, 10]
# Only referenced by a commented-out low-pass call in the analysis cell.
cutoff = 20

# --- Model targets and basis -------------------------------------------------
# Names of the target variables handed to get_xy; one model is fit per name.
y_names = ['v_fwd', 'abs_v_fwd']
# Number of time-delay samples used for the embedding and the basis.
window = bases_dict['window']

# --- Train/test split and nonlinearity estimation ----------------------------
block_size = 5_000
n_block_min = 5
test_size = 0.35
random_state = 42
# Initial number of quantile bins tried when estimating the nonlinearity.
starting_nbins = 32

# Load dataframe

In [7]:
# Load the recording index and keep only the rows whose `to_ignore` flag is
# False; the index is renumbered so it is contiguous after filtering.
df = (
    load_into_pandas(dir_path=raw_data_dir_path)
    .loc[lambda frame: frame['to_ignore'].eq(False)]
    .reset_index(drop=True)
)
df

Unnamed: 0,filename,#Fly,#Trial,#Cell,side,DN,to_ignore,abs_file_path
0,2022_06_08_0004,1.0,1.0,1.0,right,imposter,False,W:/apalaci/code/janache/DN_SPEEDO/2022_06_08_0...
1,2022_06_08_0005,1.0,2.0,1.0,right,imposter,False,W:/apalaci/code/janache/DN_SPEEDO/2022_06_08_0...
2,2022_07_04_0001,5.0,1.0,1.0,undefined,imposter,False,W:/apalaci/code/janache/DN_SPEEDO/2022_07_04_0...
3,2022_07_04_0014,5.0,1.0,2.0,left,imposter,False,W:/apalaci/code/janache/DN_SPEEDO/2022_07_04_0...
4,2022_07_04_0021,5.0,2.0,2.0,undefined,imposter,False,W:/apalaci/code/janache/DN_SPEEDO/2022_07_04_0...
...,...,...,...,...,...,...,...,...
81,2024_10_14_0002,,,,undefined,Roadrunner,False,W:/apalaci/code/janache/Roadrunner/2024_10_14_...
82,2024_11_04_0010,,,,undefined,Roadrunner,False,W:/apalaci/code/janache/Roadrunner/2024_11_04_...
83,2024_12_05_0004,,,,undefined,Roadrunner,False,W:/apalaci/code/janache/Roadrunner/2024_12_05_...
84,2024_12_10_0000,,,,undefined,Roadrunner,False,W:/apalaci/code/janache/Roadrunner/2024_12_10_...


# Analysis

In [8]:
def chunked_test_train_split(
    X_b,
    y_m,
    block_size=5_000,
    n_block_min=5,
    test_size=0.35,
    random_state=42
):
    """
    Split samples into train and test sets, block-wise for long inputs.

    If there are more than ``n_block_min * block_size`` samples, the data are
    cut into consecutive blocks of ``block_size`` samples and whole blocks are
    assigned to either train or test, so the samples of one block are never
    separated. Trailing samples that do not fill a complete block are dropped.
    Otherwise the samples are shuffled and split individually.

    Rows are selected through the returned index arrays, so the outputs keep
    the dimensionality of the inputs (1-D targets stay 1-D) and always satisfy
    ``X_train == X_b[idx_train]`` (likewise for the other outputs).

    Parameters:
        X_b: array of features, samples along axis 0.
        y_m: array of targets with the same number of samples as ``X_b``.
        block_size: number of consecutive samples per block (must be > 0).
        n_block_min: block-wise splitting is used only when the number of
            samples exceeds ``n_block_min * block_size``.
        test_size: forwarded to ``sklearn.model_selection.train_test_split``.
        random_state: forwarded to ``sklearn.model_selection.train_test_split``.

    Returns:
        X_train, X_test, y_train, y_test, idx_train, idx_test

    Raises:
        ValueError: if ``block_size`` is not positive or if ``X_b`` and
            ``y_m`` have a different number of samples.

    NOTE(review): this definition shadows the ``chunked_test_train_split``
    imported from ``DN_tools`` in the import cell - keep only one of them.
    """
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")

    n_samples = X_b.shape[0]
    if len(y_m) != n_samples:
        raise ValueError(
            f"X_b and y_m must have the same number of samples "
            f"({n_samples} != {len(y_m)})"
        )
    indices = np.arange(n_samples)

    if n_samples > n_block_min * block_size:
        num_blocks = n_samples // block_size

        # truncate to full blocks
        n_used = num_blocks * block_size
        idx_chunked = indices[:n_used].reshape(num_blocks, block_size)

        # Shuffle and split the block numbers, not the individual samples.
        blocks_train, blocks_test = sklearn.model_selection.train_test_split(
            np.arange(num_blocks),
            random_state=random_state,
            shuffle=True,
            test_size=test_size
        )

        # Flatten the chosen blocks back to sample indices (block order as
        # returned by the split, samples in their original order inside each
        # block) and pick the rows directly; this avoids reshaping the data.
        idx_train = idx_chunked[blocks_train].ravel()
        idx_test = idx_chunked[blocks_test].ravel()

        X_train, X_test = X_b[idx_train], X_b[idx_test]
        y_train, y_test = y_m[idx_train], y_m[idx_test]

    else:
        X_train, X_test, y_train, y_test, idx_train, idx_test = (
            sklearn.model_selection.train_test_split(
                X_b,
                y_m,
                indices,
                random_state=random_state,
                shuffle=True,
                test_size=test_size
            )
        )

    return X_train, X_test, y_train, y_test, idx_train, idx_test

In [13]:
def _estimate_output_nonlinearity(y_pred, y_true, starting_nbins, min_nbins=5):
    """
    Estimate a static nonlinearity as the mean of y_true in quantile bins of y_pred.

    Starts with ``starting_nbins`` quantile bins of ``y_pred`` and reduces the
    number of bins by one every time ``binned_statistic`` rejects the edges
    (typically because many identical predictions produce duplicate quantile
    edges). Gives up once ``min_nbins`` is reached and then falls back to a
    single point (mean prediction, mean target).

    Returns:
        bin_centers, statistic, nbins, constant_input
        ``constant_input`` is True when the single-point fallback was used.
    """
    expected_message = 'The smallest edge difference is numerically 0.'
    nbins = starting_nbins
    # `>` (rather than `!=`) guarantees termination for any starting_nbins.
    while nbins > min_nbins:
        bin_edges_quantilebased = np.quantile(y_pred, np.linspace(0, 1, nbins + 1))
        try:
            statistic, bin_edges, _ = binned_statistic(
                y_pred, y_true, statistic='mean', bins=bin_edges_quantilebased
            )
        except ValueError as e:
            # Report unexpected errors, but always retry with fewer bins so the
            # loop cannot spin forever on an error that is not the expected one.
            if str(e) != expected_message:
                print("ValueError: ", e)
            nbins -= 1
            continue
        # Quantile bins have unequal widths: use the true midpoint of each bin.
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        return bin_centers, statistic, nbins, False

    return [np.nanmean(y_pred)], [np.nanmean(y_true)], nbins, True


if redo_analysis:
    # Basis for the filters, restricted to its last `window` rows.
    B = glm_utils.bases.raised_cosine(neye = bases_dict['neye'], ncos = bases_dict['ncos'], kpeaks = bases_dict['kpeaks'], b = bases_dict['b'], nbasis = bases_dict['nbasis'])
    B = B[-window:]
    basis_projection = glm_utils.preprocessing.BasisProjection(B)

    # Create the output folder (and any missing parents) once, up front.
    os.makedirs(datefolder, exist_ok=True)

    for index, row in tqdm(df.iterrows(), total=len(df)):
        filename = row['filename']
        csv_path = row['abs_file_path']

        singleDN_df = load_recording(csv_path=csv_path)

        x, ys = get_xy(singleDN_df, y_names)

        # One value per target variable; used below to skip unusable targets.
        y_stds = np.std(ys, axis=0)

        # The targets are the raw `ys` (not z-scored).
        X, y_m = glm_utils.preprocessing.time_delay_embedding(x, ys, window_size=window, flatten_inside_window=True, exclude_t0=True)
        X_b = basis_projection.transform(X)
        # Optional low-pass of the targets (disabled):
        # y_m = lp(y_m, cutoff=cutoff, fs=int(sample_frequency/bin_width))

        X_train, X_test, y_train, y_test, idx_train, idx_test = chunked_test_train_split(X_b,y_m,block_size=block_size,n_block_min=n_block_min,test_size=test_size,random_state=random_state)

        # Sanity check: the saved indices must reproduce the split exactly.
        # equal_nan=True so that NaNs in the data do not trip the check.
        assert np.array_equal(X_train, X_b[idx_train], equal_nan=True)
        assert np.array_equal(X_test, X_b[idx_test], equal_nan=True)
        assert np.array_equal(y_train, y_m[idx_train], equal_nan=True)
        assert np.array_equal(y_test, y_m[idx_test], equal_nan=True)

        for iv, varname in enumerate(y_names):
            # A zero std means a constant target; a non-finite std means the
            # target contains NaN/inf. Neither can be fit meaningfully.
            if y_stds[iv] == 0 or not np.isfinite(y_stds[iv]):
                print(f"{filename}, {varname} - y has constant or NaN values")
                continue

            y_train_iv = y_train[:,iv]
            y_test_iv = y_test[:,iv]

            # Fit
            lr = sklearn.linear_model.LassoCV(max_iter=20000)
            lr.fit(X_train, y_train_iv)

            # Predictions
            y_pred = lr.predict(X_train)
            y_pred_test = lr.predict(X_test)

            # Attempt to estimate nonlinearity (last record: ~12% constant value)
            bin_centers, statistic, nbins, constant_input = _estimate_output_nonlinearity(
                y_pred, y_train_iv, starting_nbins
            )

            # Interpolate through the bins that have a defined mean. Arrays are
            # used for masking because the fallback returns plain lists.
            centers_arr = np.asarray(bin_centers)
            statistic_arr = np.asarray(statistic)
            valid_bins = ~np.isnan(statistic_arr)
            y_pred_test_nl = np.interp(y_pred_test, centers_arr[valid_bins], statistic_arr[valid_bins])

            # Scores (not stored; kept for interactive inspection / the debug print below)
            r2_train = lr.score(X_train, y_train_iv)
            r2_test = lr.score(X_test, y_test_iv) # equivalent to sklearn.metrics.r2_score(y_test,y_pred_test)
            pearsonr_score_linear = pearsonr(y_pred_test, y_test_iv).statistic
            if constant_input:
                r2_nl = 0
                pearsonr_score = pearsonr_score_linear
            else:
                r2_nl = sklearn.metrics.r2_score(y_test_iv,y_pred_test_nl)
                pearsonr_score = pearsonr(y_pred_test_nl, y_test_iv).statistic
            # print(f"{filename} {varname} - score (linear): {pearsonr_score_linear:1.2f}, score: {pearsonr_score:1.2f}, train: {r2_train:1.2f}, test: {r2_test:1.2f}, nl: {r2_nl:1.2f} - constant {constant_input}, {nbins} bins")

            # Filter: map the fitted basis weights back to the time domain.
            basis_weights = lr.coef_
            estimated_filters = basis_projection.inverse_transform(basis_weights)
            estimated_filters = estimated_filters.reshape((-1,window))

            # Store
            with open(f'{datefolder}/{filename}_{varname}_isplit{isplit}.pkl', 'wb') as handle:
                pickle.dump({'estimated_filters':estimated_filters,'y_pred_test':y_pred_test,'y_pred_test_nl':y_pred_test_nl,'bin_centers':bin_centers,'statistic':statistic,'nsamples':len(x), 'duration':len(x)/int(sample_frequency/bin_width), 'constant_input':constant_input, 'nbins':nbins,'y_train':y_train_iv,'y_test':y_test_iv,'y_pred_train':y_pred, 'idx_train':idx_train, 'idx_test': idx_test, 'y_m':y_m}, handle, protocol=pickle.HIGHEST_PROTOCOL)

  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).statistic
  pearsonr_score_linear = pearsonr(y_pred_test, y_test[:,iv]).st