In [None]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.linear_model import ElasticNet
from joblib import dump
from matplotlib import pyplot as plt
from tqdm import tqdm

# Pin every random source used below to one seed so repeated runs are comparable.
# NOTE(review): assigning PYTHONHASHSEED at this point presumably cannot change
# string hashing in the already-running kernel (it would only reach child
# processes) — confirm this is the intent.
seed = 0
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

# Notebook-wide matplotlib defaults: 6 x 6 figures and font size 10.
plt.rcParams.update({'figure.figsize': (6, 6), 'font.size': 10})

In [None]:
# Electrophysiology table read from the Allen CSV, with rows keyed by 'specimen_id'.
# NOTE(review): the following cell assigns df_meta and ephys_list again, so on a
# top-to-bottom run this selection is replaced — confirm which dataset is intended.
df_meta = pd.read_csv(
    'allen brain patchseq data of 385 cells human/human_mouse_ephys_all_0127_sorted283humanOnly.csv',
    index_col='specimen_id',
)
ephys_list = [
    'latency_rheo',
    'upstroke_downstroke_ratio_long_square',
    'width_long_square',
]
# Keep only the target columns, then only the rows where all of them are present.
df_meta = df_meta.loc[:, ephys_list].dropna()
df_meta

In [None]:
# Electrophysiology table read from the Excel sheet, with rows keyed by the '#' column.
df_meta = pd.read_excel('all ephys list.xlsx', index_col='#')
# Rewrite every 'R0' inside a row label to 'R'.
# NOTE(review): presumably this makes the ids match the 'individual' labels of the
# embedding CSVs read later — confirm.
df_meta.index = df_meta.index.map(lambda cell_id: cell_id.replace('R0', 'R'))
ephys_list = [
    'Input resistance (MOhm)',
    'AP amplitude (mV)',
    'Max number of APs',
]
# Keep only the target columns, then only the rows where all of them are present.
df_meta = df_meta.loc[:, ephys_list].dropna()
df_meta

In [None]:
# Load one score table (rows keyed by 'individual') and drop its last two columns.
# NOTE(review): an earlier, commented-out variant dropped 'Unnamed: 0' and 'group'
# by name, so those are presumably the two trailing columns — confirm.
# The name `df` is reassigned inside the merge loop further down.
df = pd.read_csv(
    'allen_preds/preds_by_primary_gbm_2000perCellType_num_classes_8_scores.csv',
    sep=',',
    index_col='individual',
).iloc[:, :-2]
df

In [None]:
embs_directory = 'combined_patchseq_all_preds/'
# Index 1 selects 'scores'; use index 0 to work with the 'preds' files instead.
emb_layer = ['preds', 'scores'][1]
# Sorted names of the files in the directory that end with '<emb_layer>.csv'.
embs_files = sorted(
    name for name in os.listdir(embs_directory) if name.endswith(f'{emb_layer}.csv')
)
embs_files

In [None]:
# Results folder, placed inside the embeddings directory and named after the
# selected ephys targets and the embedding layer.
output_directory = f'{embs_directory}{" ".join(ephys_list)} ElasticNet_emb_layer_{emb_layer}/'
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the folder exists from an earlier run.
os.makedirs(output_directory, exist_ok=True)

In [None]:
# Build one wide feature table: each embedding CSV contributes its columns, and
# only rows whose index is in df_meta and in every file survive the inner merges.
if not embs_files:
    # Without this check df_merged would stay None and the shuffle below would
    # fail with an unhelpful AttributeError.
    raise FileNotFoundError(
        f'no file ending with {emb_layer}.csv found in {embs_directory}'
    )

df_merged = None
for file_name in embs_files:
    df = pd.read_csv(embs_directory + file_name, sep=',', index_col='individual')
    df = df[df.index.isin(df_meta.index)]

    # The first two branches cannot be reached with the 'preds' / 'scores'
    # choices offered where emb_layer is set; they are kept for other settings.
    if emb_layer in [-1, 0]:
        df = df.drop(columns=['Unnamed: 0', 'group'])
        assert df.shape[1] == 256, f'{file_name}: expected 256 columns, got {df.shape[1]}'
    elif emb_layer == 'features':
        df = df.iloc[:, :-2]
        assert df.shape[1] == 32, f'{file_name}: expected 32 columns, got {df.shape[1]}'
    else:
        if emb_layer == 'scores':
            # Score files carry two trailing columns that are not features.
            df = df.iloc[:, :-2]
        # The second-to-last '_'-separated token of the file name is the
        # expected number of columns.
        n_expected = int(file_name.split('_')[-2])
        assert df.shape[1] == n_expected, (
            f'{file_name}: expected {n_expected} columns, got {df.shape[1]}'
        )

    # Prefix every column with its source file so names stay unique after merging.
    df.columns = [f"{file_name.replace('.csv', '')}_dim_{x}" for x in df.columns.tolist()]

    if df_merged is None:
        df_merged = df.copy()
    else:
        df_merged = pd.merge(df_merged, df, how='inner', left_index=True, right_index=True)

# Shuffle the rows ahead of the positional train/test split done in later cells.
# A fixed random_state makes re-running this cell give the same order instead of
# depending on how much of the global NumPy RNG stream has been consumed.
# NOTE(review): with np.random.seed(0) set in the first cell and nothing drawing
# from the global RNG before this point, this should reproduce the order a fresh
# top-to-bottom run gave previously — confirm.
df_merged = df_merged.sample(frac=1.0, random_state=0)
df_merged

In [None]:
embs = df_merged.values
n_train = 70  # the first 70 rows are used for training, the remainder is held out

embs_train, embs_test = embs[:n_train], embs[n_train:]

# Fit the scaler and the 10-component PCA on the training rows only, then apply
# the fitted transforms to the held-out rows.
scaler = StandardScaler()
pca = PCA(10)
X_train = pca.fit_transform(scaler.fit_transform(embs_train))
X_test = pca.transform(scaler.transform(embs_test))

# Persist the fitted transforms next to the model outputs.
dump(scaler, output_directory + 'scaler.joblib')
dump(pca, output_directory + 'pca.joblib')

# Two feature sets to compare: the raw (unscaled) embeddings and their PCA scores.
X_dict = {
    'embs': [embs_train, embs_test],
    'pcs': [X_train, X_test],
}
for feature_set, (train_part, test_part) in X_dict.items():
    print(feature_set, train_part.shape, test_part.shape)

In [None]:
# Put the target rows in the same order as df_merged so that positional slicing
# with n_train lines features and targets up.
merged_ids = df_merged.index.tolist()
df_meta = df_meta.loc[merged_ids, :]
df_meta

In [None]:
# Map each target name to [train values, test values], split at n_train.
y_dict = {}
for ephy in ephys_list:
    target_values = df_meta[ephy].values
    y_train, y_test = target_values[:n_train], target_values[n_train:]
    y_dict[ephy] = [y_train, y_test]  # untransformed train and test targets
    if y_train.min() >= 0:
        # Extra variant trained on log(y + 1); the test values stay on the original
        # scale because predictions are mapped back before scoring.
        y_dict[f'{ephy}_log'] = [np.log(y_train + 1), y_test]

for target_name, (train_part, test_part) in y_dict.items():
    print(target_name, train_part.shape, test_part.shape)

In [None]:
# Fit one ElasticNet per (target, feature set) pair: tune alpha / l1_ratio by
# repeated 10-fold CV on the training rows, score on the held-out rows with MAE,
# then save the model and a predicted-vs-true scatter plot.

# The grid and the CV scheme are the same for every pair, so build them once.
# NOTE(review): the alpha grid starts at 0; scikit-learn advises against alpha=0
# for ElasticNet (it degenerates to least squares and may emit convergence
# warnings) — confirm it should stay in the grid.
grid = {'alpha': np.arange(0, 1, 0.05), 'l1_ratio': np.arange(0, 1, 0.05), 'positive': [True]}
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=0)

for y_name, (y_train, y_test) in tqdm(y_dict.items()):
    for X_name, (X_train, X_test) in X_dict.items():
        search = GridSearchCV(ElasticNet(), grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
        search.fit(X_train, y_train)
        alpha = search.best_params_['alpha']
        l1_ratio = search.best_params_['l1_ratio']

        # GridSearchCV refits the best configuration on all of X_train by default
        # (refit=True), so a second manual fit with the same parameters is redundant.
        model = search.best_estimator_
        y_predict = model.predict(X_test)

        if y_name.endswith('_log'):
            # Undo the log(y + 1) applied to the training targets.
            y_predict = np.exp(y_predict) - 1

        mae = np.mean(np.abs(y_predict - y_test))
        # Three decimals normally; keep eight when the error would otherwise round to ~0.
        mae = np.round(mae, 3) if np.round(mae, 3) > 0.01 else np.round(mae, 8)

        output_prefix = f'prediction of {y_name} by {X_name} alpha {alpha} l1_ratio {l1_ratio} MAE {mae}'
        dump(model, output_directory + f'{output_prefix}.joblib')

        fig, ax = plt.subplots()
        ax.scatter(y_test, y_predict)
        ax.set_title(output_prefix)
        ax.set_xlabel(f'{y_name} for test')
        ax.set_ylabel(f'{y_name} by prediction')
        fig.patch.set_facecolor('white')
        for ex in ['pdf']:
            fig.savefig(output_directory + f'{output_prefix}.{ex}', bbox_inches='tight', dpi=300)
        # Close only this figure so figures do not accumulate across iterations.
        plt.close(fig)
