In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
from scipy.stats import randint
import matplotlib.pyplot as plt

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay

# 10000Features

## 1. Load & Cleaning Dataset

#### 1.1 Loading in training and prediction datasets

In [None]:
def _read_tsv(url):
    """Read one tab-separated CMI-PB table from `url` into a DataFrame."""
    return pd.read_csv(url, sep='\t')


# Training cohorts (2020 and 2021): gene expression, specimen and subject tables
df_2020_gene = _read_tsv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2020LD_pbmc_gene_expression.tsv")
df_2020_specimen = _read_tsv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2020LD_specimen.tsv")
df_2020_subject = _read_tsv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2020LD_subject.tsv")
df_2021_gene = _read_tsv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2021LD_pbmc_gene_expression.tsv")
df_2021_specimen = _read_tsv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2021LD_specimen.tsv")
df_2021_subject = _read_tsv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/training_data/2021LD_subject.tsv")

# Prediction cohort (2022): same three tables
df_2022_gene = _read_tsv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/prediction_data/2022BD_pbmc_gene_expression.tsv")
df_2022_specimen = _read_tsv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/prediction_data/2022BD_specimen.tsv")
df_2022_subject = _read_tsv("https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/prediction_data/2022BD_subject.tsv")

#### 1.2 Cleaning subject and gene expression datasets

In [None]:
## cleaning the subject/specimen dataset to get an age column
def clean_df_subject(df):
    """Return a copy of `df` with year columns made numeric and an `age` column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged subject/specimen table. `year_of_birth` and `date_of_boost` must be
        string columns whose first four characters are the year.

    Returns
    -------
    pandas.DataFrame
        A new frame in which `year_of_birth` and `date_of_boost` hold the numeric
        year and `age` is the difference of the two years (month and day are ignored).
    """
    # Work on a copy: the caller's frame keeps its original string dates, so the
    # function can safely be called again on the same input (e.g. when a cell is re-run).
    cleaned = df.copy()

    ## Get age column
    cleaned['year_of_birth'] = pd.to_numeric(cleaned['year_of_birth'].str[:4])
    cleaned['date_of_boost'] = pd.to_numeric(cleaned['date_of_boost'].str[:4])
    cleaned['age'] = cleaned['date_of_boost'] - cleaned['year_of_birth']
    return cleaned

In [None]:
## cleaning the gene data to strip the version suffix from the ensembl gene id
def clean_df_gene(df):
    """Return a copy of `df` whose `versioned_ensembl_gene_id` has no version suffix.

    Everything from the first '.' onwards is removed, e.g. 'ENSG00000277632.4'
    becomes 'ENSG00000277632'. Ids without a '.' are unchanged and missing ids
    stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format gene expression table with a string column
        `versioned_ensembl_gene_id`.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    cleaned = df.copy()
    # Vectorised split: a one-character pattern is treated as a literal '.', not a regex.
    cleaned['versioned_ensembl_gene_id'] = (
        cleaned['versioned_ensembl_gene_id'].str.split('.', n=1).str[0]
    )
    # Mapping ids to gene symbols / gene types (previously sketched with mygene) is not done here.
    return cleaned

In [None]:
#all_genes = genes['versioned_ensembl_gene_id'].unique()
#symbols = []
#for i in all_genes:
#    symbols.append(mg.getgene(i)['symbol'])

#### 1.3 EDA: finding which genes closely relate to CCL3 gene

In [None]:
## stack the cleaned 2020 and 2021 long-format gene tables
genes = pd.concat(
    [clean_df_gene(cohort) for cohort in (df_2020_gene, df_2021_gene)]
).reset_index()

## wide matrix: one row per specimen, one column per gene id, cells hold tpm
## (pivot_table averages duplicate specimen/gene pairs by default)
genes_cols = genes.pivot_table(
    values='tpm',
    index=['specimen_id'],
    columns=['versioned_ensembl_gene_id'],
)

In [None]:
# Target: the CCL3 column (ENSG00000277632); predictors: every other gene column
y = genes_cols['ENSG00000277632']
X = genes_cols.drop(columns='ENSG00000277632')

In [None]:
# Score every gene against CCL3 with a univariate F-test; `scores_` holds one score per column of X
bestfeatures = SelectKBest(score_func=f_regression, k=10)
fit = bestfeatures.fit(X, y)
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)

# Side-by-side table: gene id ('Specs') next to its score ('Score')
featureScores = pd.concat([dfcolumns, dfscores], axis=1).set_axis(['Specs', 'Score'], axis=1)
print(featureScores.nlargest(10, 'Score'))  # the 10 highest-scoring genes

In [None]:
# NOTE: despite its name, `top1000` holds up to 10,000 of the best-scoring gene ids, plus CCL3 itself
top1000 = featureScores.nlargest(10000, 'Score')['Specs'].tolist()
top1000.append('ENSG00000277632')

# Keep only the selected gene columns and expose the target under its gene symbol
final_gene = (
    genes_cols[genes_cols.columns.intersection(top1000)]
    .rename(columns={'ENSG00000277632': 'CCL3'})
    .reset_index()
)
final_gene

#### 1.4 Merging datasets

In [None]:
## join each year's specimen table to its subject table, stack both years, then derive age
subject = clean_df_subject(
    pd.concat(
        [
            specimens.merge(subjects, on='subject_id')
            for specimens, subjects in (
                (df_2020_specimen, df_2020_subject),
                (df_2021_specimen, df_2021_subject),
            )
        ],
        ignore_index=True,
    )
)

In [None]:
## keep the subject/specimen columns used downstream, then attach the selected gene
## expression columns to each specimen
subject = subject.loc[:, ['subject_id', 'specimen_id', 'planned_day_relative_to_boost',
                          'infancy_vac', 'biological_sex', 'age', 'year_of_birth',
                          'ethnicity', 'race']]
df = pd.merge(subject, final_gene, on='specimen_id')

## 2. Data Feature Selection & Transformation

In [None]:
# Day-0 rows supply the features; their CCL3 value is kept as the baseline `CCL3_d0`
df_d0 = df.loc[df['planned_day_relative_to_boost'] == 0.0].rename(columns={'CCL3': 'CCL3_d0'})

# Pair each subject's day-3 CCL3 with their day-0 row (the default inner join keeps
# only subjects that have both time points)
df_d3 = (
    df.loc[df['planned_day_relative_to_boost'] == 3.0, ['subject_id', 'CCL3']]
    .merge(df_d0, on='subject_id')
)

# Target: log2 fold change of CCL3 from day 0 to day 3
# NOTE(review): a zero tpm on either day yields inf/NaN here - confirm whether that occurs
df_d3['CCL3-FC'] = np.log2(df_d3['CCL3'] / df_d3['CCL3_d0'])
df_d3 = df_d3.drop(columns=['subject_id', 'specimen_id', 'planned_day_relative_to_boost'])

#### 2.1 Data Feature Transformation
- mapping string data into numerical type for Regression training

In [None]:
# Encode the infancy vaccine type as 0 (wP) / 1 (aP); any other value becomes NaN.
# Not idempotent: re-running this cell on the already-encoded column maps everything to NaN.
df_d3['infancy_vac'] = df_d3['infancy_vac'].map({'wP':0, 'aP':1})

In [None]:
# Encode biological sex as 0 (Female) / 1 (Male); any other value becomes NaN.
# Not idempotent: re-running this cell on the already-encoded column maps everything to NaN.
df_d3['biological_sex'] = df_d3['biological_sex'].map({'Female':0, 'Male':1})

In [None]:
# Encode ethnicity as integer codes; values missing from the dict become NaN.
# Not idempotent: re-running this cell on the already-encoded column maps everything to NaN.
df_d3['ethnicity'] = df_d3['ethnicity'].map({'Not Hispanic or Latino':0, 
                                             'Hispanic or Latino':1,
                                             'Unknown':2})

In [None]:
# Encode race as integer codes; values missing from the dict become NaN.
# NOTE(review): 'Unknown or Not Reported' and 'More Than One Race' share code 2 -
# confirm this grouping is intentional and not a numbering slip.
# Not idempotent: re-running this cell on the already-encoded column maps everything to NaN.
df_d3['race'] = df_d3['race'].map({'White':0, 
                                   'Asian':1,
                                   'Unknown or Not Reported': 2,
                                   'More Than One Race': 2,
                                   'Black or African American': 3,
                                   'Native Hawaiian or Other Pacific Islander': 4,
                                   'American Indian/Alaska Native':5})
# Preview the encoded frame
df_d3.head()

## 3. Training and Evaluating Models

#### 3.1 Creating training and testing datasets

In [None]:
# Target is the CCL3 log2 fold change; the features exclude it and the raw `CCL3`
# column it was computed from
y = df_d3['CCL3-FC']
X = df_d3.drop(columns=['CCL3', 'CCL3-FC'])


In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so the scores quoted in
# the notes below can be reproduced after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Normalisation is currently switched off: `scaler` is created but never applied, and the
# "*_scaled" names are plain aliases of the unscaled splits.
scaler = Normalizer()
X_train_scaled = X_train  # disabled: pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = X_test  # disabled: pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

#### 3.2 Linear Regression 

First, we test a simple, naive model, Linear Regression, to establish a baseline that any other model should at least match.

In [None]:
# In-sample baseline: fit on every row, then predict those same rows
reg = LinearRegression()
reg.fit(X, y)
Y_pred = reg.predict(X)

In [None]:
# Error metrics of `Y_pred` against the full target vector `y`
mse, mae, r2 = (
    mean_squared_error(y, Y_pred),
    mean_absolute_error(y, Y_pred),
    r2_score(y, Y_pred),
)
mse, mae, r2

The MSE, MAE, and R2 scores were very good, indicating that the model predicts its own training data well. The real evaluation, however, comes from fitting the model on the training subset and predicting values for the held-out testing subset. This lets us judge whether the model is actually good at predicting new data points.

In [None]:
# Fit on the training split only, then predict the held-out rows
reg_sub = LinearRegression()
reg_sub.fit(X_train, y_train)
Y_pred = reg_sub.predict(X_test)

In [None]:
# Error metrics of `Y_pred` against the held-out targets `y_test`
mse, mae, r2 = (
    mean_squared_error(y_test, Y_pred),
    mean_absolute_error(y_test, Y_pred),
    r2_score(y_test, Y_pred),
)
mse, mae, r2

While the MSE and MAE scores performed better, the R2 indicates that the model's predictions are highly inaccurate and could definitely be improved upon. We will therefore test more advanced models and look for improvements.

#### 3.2 Elastic Net

In [None]:
# Elastic Net with default hyper-parameters, fitted and evaluated on every row
en = ElasticNet().fit(X, y)
Y_pred = en.predict(X)

In [None]:
# Error metrics of `Y_pred` against the full target vector `y`
mse, mae, r2 = (
    mean_squared_error(y, Y_pred),
    mean_absolute_error(y, Y_pred),
    r2_score(y, Y_pred),
)
mse, mae, r2

In [None]:
# Elastic Net fitted on the training split only, predicting the held-out rows
en_sub = ElasticNet().fit(X_train, y_train)
Y_pred = en_sub.predict(X_test)

In [None]:
# Error metrics of `Y_pred` against the held-out targets `y_test`
mse, mae, r2 = (
    mean_squared_error(y_test, Y_pred),
    mean_absolute_error(y_test, Y_pred),
    r2_score(y_test, Y_pred),
)
mse, mae, r2

#### 3.3 KNeighborsRegressor

Since the training data we are working with is fairly limited in size, we will look at how KNeighborsRegressor performs.

In [None]:
# k-nearest-neighbours regressor with default settings, fitted and evaluated on every row
kn = KNeighborsRegressor().fit(X, y)
Y_pred = kn.predict(X)

In [None]:
# Error metrics of `Y_pred` against the full target vector `y`
mse, mae, r2 = (
    mean_squared_error(y, Y_pred),
    mean_absolute_error(y, Y_pred),
    r2_score(y, Y_pred),
)
mse, mae, r2

In [None]:
# k-nearest-neighbours regressor fitted on the training split, predicting the held-out rows
kn_sub = KNeighborsRegressor().fit(X_train, y_train)
Y_pred = kn_sub.predict(X_test)

In [None]:
# Error metrics of `Y_pred` against the held-out targets `y_test`
mse, mae, r2 = (
    mean_squared_error(y_test, Y_pred),
    mean_absolute_error(y_test, Y_pred),
    r2_score(y_test, Y_pred),
)
mse, mae, r2

The results are promising, with an R2 score of 0.57 on a scale from 0 to 1, a lot better than our Linear Regression baseline of -4.15.

#### 3.4 DecisionTreeRegressor

In [None]:
# Three trees of increasing depth (5, 10, 15), each fitted and evaluated on every row
regr_1, regr_2, regr_3 = (
    DecisionTreeRegressor(max_depth=depth).fit(X, y) for depth in (5, 10, 15)
)
y_1, y_2, y_3 = (tree.predict(X) for tree in (regr_1, regr_2, regr_3))

In [None]:
# Error metrics of `y_1` (max_depth=5 tree) against the full target vector `y`
mse, mae, r2 = (
    mean_squared_error(y, y_1),
    mean_absolute_error(y, y_1),
    r2_score(y, y_1),
)
mse, mae, r2

In [None]:
# Error metrics of `y_2` (max_depth=10 tree) against the full target vector `y`
mse, mae, r2 = (
    mean_squared_error(y, y_2),
    mean_absolute_error(y, y_2),
    r2_score(y, y_2),
)
mse, mae, r2

In [None]:
# Error metrics of `y_3` (max_depth=15 tree) against the full target vector `y`
mse, mae, r2 = (
    mean_squared_error(y, y_3),
    mean_absolute_error(y, y_3),
    r2_score(y, y_3),
)
mse, mae, r2

In [None]:
# Refit the three tree depths on the training split only
regr_1_sub = DecisionTreeRegressor(max_depth=5)
regr_2_sub = DecisionTreeRegressor(max_depth=10)
regr_3_sub = DecisionTreeRegressor(max_depth=15)
regr_1_sub.fit(X_train, y_train)
regr_2_sub.fit(X_train, y_train)
regr_3_sub.fit(X_train, y_train)

# Predict the held-out rows with the *_sub trees. regr_1..3 were fitted on every row,
# including X_test, so scoring them here would leak the test targets into the evaluation.
y_1 = regr_1_sub.predict(X_test)
y_2 = regr_2_sub.predict(X_test)
y_3 = regr_3_sub.predict(X_test)

In [None]:
# Error metrics of `y_1` (max_depth=5 tree) against the held-out targets `y_test`
mse, mae, r2 = (
    mean_squared_error(y_test, y_1),
    mean_absolute_error(y_test, y_1),
    r2_score(y_test, y_1),
)
mse, mae, r2

In [None]:
# Error metrics of `y_2` (max_depth=10 tree) against the held-out targets `y_test`
mse, mae, r2 = (
    mean_squared_error(y_test, y_2),
    mean_absolute_error(y_test, y_2),
    r2_score(y_test, y_2),
)
mse, mae, r2

In [None]:
# Error metrics of `y_3` (max_depth=15 tree) against the held-out targets `y_test`
mse, mae, r2 = (
    mean_squared_error(y_test, y_3),
    mean_absolute_error(y_test, y_3),
    r2_score(y_test, y_3),
)
mse, mae, r2

#### 3.5 RandomForestRegressor

In [None]:
# Random forest with default settings, fitted and evaluated on every row
rf = RandomForestRegressor().fit(X, y)
Y_pred = rf.predict(X)

In [None]:
# Error metrics of `Y_pred` against the full target vector `y`
mse, mae, r2 = (
    mean_squared_error(y, Y_pred),
    mean_absolute_error(y, Y_pred),
    r2_score(y, Y_pred),
)
mse, mae, r2

In [None]:
# Random forest fitted on the training split only
rf_sub = RandomForestRegressor()
rf_sub.fit(X_train, y_train)
# Predict with rf_sub, not rf: rf was fitted on every row (including X_test), so using
# it here would report an in-sample score as if it were a held-out one.
Y_pred = rf_sub.predict(X_test)

In [None]:
# Error metrics of `Y_pred` against the held-out targets `y_test`
mse, mae, r2 = (
    mean_squared_error(y_test, Y_pred),
    mean_absolute_error(y_test, Y_pred),
    r2_score(y_test, Y_pred),
)
mse, mae, r2

#### 3.5 Feature Evaluation with Lasso and Random Forest Regression

#### 3.6 Finding the best model

In [None]:
# Hold out 20% of the rows so that the "Test Set" scores computed below are measured on
# data the models never saw while fitting or tuning. Using X/y for both sides would turn
# every reported score into an in-sample score and always favour the most over-fitted model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalisation is currently switched off: `scaler` is created but never applied, and the
# "*_scaled" names are plain aliases of the unscaled splits.
scaler = Normalizer()
X_train_scaled = X_train  # disabled: pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = X_test  # disabled: pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# No additional feature selection is applied at this stage
X_train_scaled_selected = X_train_scaled
X_test_scaled_selected = X_test_scaled

In [None]:
# Model comparison: every tunable regressor goes through the same 5-fold randomized search
# (scored by R-squared), is refit with its best parameters on the training rows and is then
# scored on X_test_scaled_selected. The model with the highest R-squared is kept as `best_model`.
# NOTE(review): the "Test Set" scores are only held-out estimates if the test rows are
# disjoint from the training rows - confirm against the cell that builds the splits.

SEARCH_SEED = 42  # one seed for parameter sampling and for the stochastic estimators


def _tune_and_score(label, estimator, param_distributions, n_iter):
    """Tune `estimator` with RandomizedSearchCV and report its test-set R-squared.

    Parameters
    ----------
    label : str
        Model name used as the prefix of the printed report lines.
    estimator : scikit-learn regressor
        Unfitted estimator to tune.
    param_distributions : dict
        Candidate values per hyper-parameter.
    n_iter : int
        Number of parameter settings sampled by the search.

    Returns
    -------
    tuple
        (fitted RandomizedSearchCV, predictions for X_test_scaled_selected,
        R-squared of those predictions against y_test).
    """
    search = RandomizedSearchCV(estimator,
                                param_distributions=param_distributions,
                                n_iter=n_iter,
                                scoring='r2',
                                cv=5,
                                random_state=SEARCH_SEED)
    search.fit(X_train_scaled_selected, y_train)
    print(f"{label} - Best Parameters:", search.best_params_)

    predictions = search.predict(X_test_scaled_selected)
    score = r2_score(y_test, predictions)
    print(f"{label} - R-squared on Test Set:", score)
    return search, predictions, score


# Random Forest Regression (seeded so the comparison is repeatable between runs)
rf_model = RandomForestRegressor(random_state=SEARCH_SEED)
param_dist_rf = {
    'n_estimators': [10,30,40,50,100,200, 300, 400,500,600,700],
    'max_depth': [2,3, 4,5 ,6 ,7, 8, 9, 10,13,15,17,20],
    'min_samples_split': [5, 10, 15,17,18,19,20],
    'min_samples_leaf': [1, 2, 3]
}
random_search_rf, y_pred_rf, r2_rf = _tune_and_score(
    "Random Forest", rf_model, param_dist_rf, n_iter=10)

# Linear Regression (no hyper-parameters to tune, so it is fitted directly)
lr_model = LinearRegression()
lr_model.fit(X_train_scaled_selected, y_train)
y_pred_lr = lr_model.predict(X_test_scaled_selected)
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression - R-squared on Test Set:", r2_lr)

# Support Vector Regression: the estimator and grid are defined, but the search is disabled.
# To enable it, run
#   random_search_svr, y_pred_svr, r2_svr = _tune_and_score("SVR", svr_model, param_dist_svr, n_iter=5)
# and add an 'SVR' entry to both `r2_dict` and `fitted_models` below.
svr_model = SVR()
param_dist_svr = {
    'C': [5, 10, 15],
    'kernel': ['linear', 'rbf']
}

# Gradient Boosting Regression (seeded)
gb_model = GradientBoostingRegressor(random_state=SEARCH_SEED)
param_dist_gb = {
    'n_estimators': [35,36,37,38,39,40,41,42,43],
    'learning_rate': [0.001,0.01,0.02,0.03, 0.05, 0.08,0.1,0.2,0.5,1],
    'max_depth': [2, 3, 4],
    'min_samples_split': [2, 3, 4,6,8,10,15,20],
    'min_samples_leaf': [1, 2, 3]
}
random_search_gb, y_pred_gb, r2_gb = _tune_and_score(
    "Gradient Boosting", gb_model, param_dist_gb, n_iter=10)

# Lasso Regression
lasso_model = Lasso()
param_dist_lasso = {
    'alpha': [0.001, 0.01, 0.1, 1, 10]
}
random_search_lasso, y_pred_lasso, r2_lasso = _tune_and_score(
    "Lasso", lasso_model, param_dist_lasso, n_iter=5)

# Ridge Regression
ridge_model = Ridge()
param_dist_ridge = {
    'alpha': [0.001, 0.01, 0.1, 1, 10]
}
random_search_ridge, y_pred_ridge, r2_ridge = _tune_and_score(
    "Ridge", ridge_model, param_dist_ridge, n_iter=5)

# ElasticNet Regression
elasticnet_model = ElasticNet()
param_dist_elasticnet = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}
random_search_elasticnet, y_pred_elasticnet, r2_elasticnet = _tune_and_score(
    "ElasticNet", elasticnet_model, param_dist_elasticnet, n_iter=10)

# Decision Tree Regression (seeded)
dt_model = DecisionTreeRegressor(random_state=SEARCH_SEED)
param_dist_dt = {
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10,15,20],
    'min_samples_leaf': [1, 2, 4,6,8,10,15,20,25,30,35]
}
random_search_dt, y_pred_dt, r2_dt = _tune_and_score(
    "Decision Tree", dt_model, param_dist_dt, n_iter=5)

# K-Nearest Neighbors Regression
knn_model = KNeighborsRegressor()
param_dist_knn = {
    'n_neighbors': np.arange(1, 20),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}
random_search_knn, y_pred_knn, r2_knn = _tune_and_score(
    "KNN", knn_model, param_dist_knn, n_iter=10)

# Compare r2 of different models (key order is also the bar order in the plot further down)
r2_dict = {'Random Forest': r2_rf, 'Linear Regression': r2_lr, 'Gradient Boosting': r2_gb,
           'Lasso': r2_lasso, 'Ridge': r2_ridge, 'ElasticNet': r2_elasticnet,
           'Decision Tree': r2_dt, 'KNN': r2_knn}

# Fitted object behind each score, keyed exactly like `r2_dict`
fitted_models = {'Random Forest': random_search_rf, 'Linear Regression': lr_model,
                 'Gradient Boosting': random_search_gb, 'Lasso': random_search_lasso,
                 'Ridge': random_search_ridge, 'ElasticNet': random_search_elasticnet,
                 'Decision Tree': random_search_dt, 'KNN': random_search_knn}

# Pick the model with the highest r2
best_model_name = max(r2_dict, key=r2_dict.get)
best_model = fitted_models[best_model_name]

print(f"\nBest Model: {best_model_name} with R-squared: {r2_dict[best_model_name]}")

In [None]:
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error

# Score the selected model once on the evaluation rows
y_pred_best_model = best_model.predict(X_test_scaled_selected)
r2_best_model = r2_score(y_test, y_pred_best_model)
explained_variance_best_model = explained_variance_score(y_test, y_pred_best_model)
mse_best_model = mean_squared_error(y_test, y_pred_best_model)
# RMSE is taken as the square root of the MSE instead of passing `squared=False`,
# which newer scikit-learn releases deprecate and then remove.
rmse_best_model = np.sqrt(mse_best_model)
mae_best_model = mean_absolute_error(y_test, y_pred_best_model)

print(f"Best Model: {best_model_name}")
print(f"MSE: {mse_best_model}")
print(f"MAE: {mae_best_model}")
print(f"R-squared: {r2_best_model}")
print(f"Explained Variance: {explained_variance_best_model}")
print(f"RMSE: {rmse_best_model}")

In [None]:
# Display the R-squared score recorded for each model name
r2_dict

In [None]:
# Split the score table into parallel lists for plotting: model names and their R-squared values
mods, og = list(r2_dict), list(r2_dict.values())

In [None]:
import plotly.graph_objects as px  # NOTE: `px` is plotly.graph_objects here, not plotly.express

# Both axes share the same framed style with light grid lines
_axis_style = dict(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
)

# One bar per model, bar height = R-squared
plot = px.Figure(data=[px.Bar(name='Original Model', x=mods, y=og)])
plot.update_layout(
    title='Comparison of R-squared Values Across Different Models',
    title_x=0.5,
    xaxis_title='Model',
    yaxis_title='R-squared value',
    width=800,
    height=600,
    plot_bgcolor='rgba(0, 0, 0, 0)',
)
plot.update_traces(marker_color='LightGreen')
plot.update_xaxes(**_axis_style)
plot.update_yaxes(**_axis_style)
plot.show()



## 4. 2022 Validation Predictions

#### 4.1 Loading in 2022 prediction datasets: gene expression, specimen, and subject

In [None]:
# Apply the same gene-table cleaning used for the training cohorts to the 2022 expression table
gene_pred = clean_df_gene(df_2022_gene)

In [None]:
# Wide matrix for 2022: one row per specimen, one column per gene id, cells hold tpm
genes_cols_pred = gene_pred.pivot_table(index=['specimen_id'], columns=['versioned_ensembl_gene_id'], values= 'tpm')

In [None]:
# Keep the gene columns that were selected for training. The selection list is `top1000`
# (`top10` is never defined in this notebook and raised a NameError on a fresh kernel).
# `intersection` silently skips any selected gene that is absent from the 2022 matrix.
final_gene_pred = (
    genes_cols_pred[genes_cols_pred.columns.intersection(top1000)]
    .rename(columns={'ENSG00000277632': 'CCL3'})
    .reset_index()
)
final_gene_pred

In [None]:
# Join the 2022 specimen and subject tables on subject_id, then derive the numeric year and age columns
subject_pred = clean_df_subject(pd.merge(df_2022_specimen,df_2022_subject, on= 'subject_id'))

In [None]:
# Attach subject/specimen metadata to every specimen that has gene expression (right join keeps
# all rows of final_gene_pred). The result is only displayed here, not stored in a variable.
subject_pred.merge(final_gene_pred, on='specimen_id', how = 'right')