In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt

In [None]:
# The CSV holds cortical expression levels of 77 proteins.
# Keep only the expression columns ('DYRK1A_N' through 'CaNA_N') and replace
# missing measurements with the mean of the corresponding column.
df = pd.read_csv("data/Data_Cortex_Nuclear.csv")
proteins = df.loc[:, 'DYRK1A_N':'CaNA_N'].pipe(
    lambda expression: expression.fillna(expression.mean())
)

In [None]:
# Regression setup: BAD_N expression is the target and every other protein
# column is a feature. Rows are shuffled with a fixed seed so the ordering is
# reproducible between runs.
X, y = shuffle(
    proteins.drop(columns='BAD_N'),
    proteins['BAD_N'],
    random_state=42,
)

In [None]:

# Tune the Lasso regularisation strength with a 5-fold cross-validated grid search.
model = Lasso()
alpha = np.linspace(start=1e-7, stop=1e-5, num=10)  # ten evenly spaced candidates
params = dict(alpha=alpha)
cv = GridSearchCV(estimator=model, param_grid=params, cv=5)
cv.fit(X, y)
best_model = cv.best_estimator_
# Leave the selected estimator as the last expression so the notebook displays it.
best_model

In [None]:
# Plot the coefficients of the best Lasso model found by the grid search and
# list the proteins whose coefficient is close to zero.
coefs = cv.best_estimator_.coef_
n = len(coefs)

# Cut-off on |coefficient| below which a protein is reported as uninformative.
COEF_THRESHOLD = 0.005

# One bar per feature, labelled with the protein name so the figure can be
# read without referring back to the code.
fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(range(n), coefs)
ax.set_xticks(np.arange(n))
ax.set_xticklabels(X.columns.values, rotation=90)
ax.set_xlabel("Protein")
ax.set_ylabel("Lasso coefficient")
ax.set_title("Lasso coefficients for predicting BAD_N expression")
plt.show()

print("The following proteins are not very informative for predicting BAD expression levels:")
# coef_ is ordered like the columns of X, so the two can be walked in parallel.
for protein_name, coef in zip(X.columns.values, coefs):
    if abs(coef) < COEF_THRESHOLD:
        print(protein_name)
