In [21]:
import scvelo as scv
import scanpy as sc
import numpy as np
import pandas as pd
import random 
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error



In [22]:
# Load dataset with velocity values
vdata = sc.read_h5ad("velocity_adata.h5ad")

In [23]:
# Scale Ms and velocity layer with zero mean and unit variance 
    # sc.pp.scale adds most recent mean and std as variables to var

sc.pp.scale(vdata, layer='Ms')
sc.pp.scale(vdata, layer='velocity')

In [24]:
#Split data into train and test
    # watch out for time-ordering, randomly choose cells

test, train = train_test_split(vdata, test_size=0.5)

In [25]:
##### Velocity genes
# No Ccl5 included, might be nice to check..

velocity_genes = vdata.var.index[vdata.var['velocity_genes'] == True].tolist()

In [26]:
# independent variables 
    #transcription factors
    
predictors = ['Klf2', 'Mcm3', 'Mcm5', 'Hmgb2', 'Cdk4', 'Hif1a', 'Mcm6', 'Tox']

In [49]:
# target
target = ['Rasgrp2']
#target = velocity_genes

In [50]:
# train and test set
X_train, y_train= train[:, predictors].layers['Ms'], train[:, target].layers['velocity']
X_test, y_test = test[:, predictors].layers['Ms'], test[:, target].layers['velocity']

In [51]:
# simple linear regression
lin = LinearRegression()
# prepare cross validation, data shuffled before split into batches
cv = KFold(n_splits=10, shuffle=True, random_state=42)

scores = cross_validate(lin, X_train, y_train, cv=10, return_train_score=True, return_estimator=True)

In [76]:
scores['estimator'][0].coef_[0]

-0.6617897

In [72]:
x

TypeError: 'LinearRegression' object is not subscriptable

In [78]:
d = pd.DataFrame()
for x in scores['estimator']:
    d['predictors'], d['coeffecients'], d['target']= predictors, x.coef_[0], target*len(predictors)
d

Unnamed: 0,predictors,coeffecients,target
0,Klf2,-0.675841,Rasgrp2
1,Mcm3,0.061999,Rasgrp2
2,Mcm5,0.368303,Rasgrp2
3,Hmgb2,-0.106418,Rasgrp2
4,Cdk4,0.02183,Rasgrp2
5,Hif1a,0.141057,Rasgrp2
6,Mcm6,-0.273151,Rasgrp2
7,Tox,-0.128748,Rasgrp2


In [44]:
df = pd.DataFrame.from_dict(scores)
df

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,0.011511,0.001817,LinearRegression(),0.463953,0.465921
1,0.011618,0.001018,LinearRegression(),0.474662,0.464682
2,0.008431,0.000993,LinearRegression(),0.483006,0.463743
3,0.003038,0.000957,LinearRegression(),0.429439,0.4699
4,0.00297,0.000837,LinearRegression(),0.443382,0.468335
5,0.002816,0.000867,LinearRegression(),0.454684,0.467109
6,0.002733,0.00078,LinearRegression(),0.498337,0.462133
7,0.002645,0.000755,LinearRegression(),0.456556,0.466699
8,0.002493,0.000733,LinearRegression(),0.488514,0.463152
9,0.002509,0.000731,LinearRegression(),0.443097,0.468058
