### Try scikit-learn on chr22 with the real phenotype
Test how well the chr22 SNPs alone predict the phenotype, i.e. how strongly the predictions correlate with the observed values.

In [12]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pandas as pd
np.random.seed(1234)

In [13]:
# Load the chr22 dosage table; the file is space-delimited and has no header row
chr_22 = pd.read_csv(
    "/home/mcb/users/yzhu439/RA_Project/data/DREAM_RA_Responders_DosageData/Training_chr22.dos",
    sep=" ",
    header=None,
)
# Discard columns 0 and 2-5, then transpose so the remaining columns become rows
# (presumably one row of SNP identifiers followed by one row per patient -- TODO confirm)
chr22_array = chr_22.drop(columns=[0, 2, 3, 4, 5]).T

In [14]:
# Promote the first row of the transposed frame to column labels
# (presumably the SNP identifiers -- TODO confirm against the .dos layout)
chr22_array.columns = chr22_array.iloc[0]
# Every remaining row is dosage data; .iloc makes the positional row slice explicit
dos_chr22 = chr22_array.iloc[1:]
# Convert to a numpy array with an explicit float dtype -> X data.
# NOTE(review): the frame was transposed together with its label row, so it is
# presumably object-dtype; a plain .values would then pass an object array on to
# numpy and scikit-learn instead of a numeric one.
dos_chr22_array = dos_chr22.to_numpy(dtype=float)

In [15]:
# Read the clinical phenotype/covariate text file (space-delimited)
pheno_df = pd.read_csv(
    "/home/mcb/users/yzhu439/RA_Project/data/DREAM_RA_Responders_PhenoCov_Full.txt",
    sep=" ",
)
# The Response.deltaDAS column is the regression target -> Y data
pheno = pheno_df.loc[:, 'Response.deltaDAS'].values

### First, fit Lasso without covariates as additional features


In [16]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    dos_chr22_array,
    pheno,
    test_size=0.2,
    random_state=10,
)

# Build the Lasso regressor (alpha=0.1) and fit it on the training split
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)

# Predict on the held-out split and score the predictions with MSE and R-squared
y_pred = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")




Mean Squared Error: 2.4119915003760917
R-squared: -0.005537425180190292


### Next, fit Lasso with covariates added as additional features


In [17]:
# Load the tab-separated covariates table
cov_df = pd.read_csv(
    "/home/mcb/users/yzhu439/RA_Project/data/Dosage_for_PCA/chr22_covariates.csv",
    sep="\t",
)
# Keep only the three covariates used as extra features, in this column order
cov_df_filter = cov_df.loc[:, ['baselineDAS', 'Age', 'Gender']]
cov = cov_df_filter.values


In [18]:
# Some rows are missing at least one of the selected covariates.
# Boolean mask over rows: True wherever any covariate value is NaN
nan_row_indices = np.isnan(cov).any(axis=1)
# Integer positions of those incomplete rows
ambiguous_pt = np.flatnonzero(nan_row_indices)

# Drop the incomplete rows from the covariates, the dosage matrix and the phenotype
# (assumes all three arrays share the same row order -- TODO confirm)
cov_filtered, chr22_array_filtered, pheno_filtered = (
    np.delete(rows, ambiguous_pt, axis=0)
    for rows in (cov, dos_chr22_array, pheno)
)

# Put the covariate columns in front of the SNP columns so they act as additional features
chr22_withCOV = np.hstack((cov_filtered, chr22_array_filtered))

In [19]:
# Show the shapes of the combined feature matrix and the filtered target, one per line
print(chr22_withCOV.shape, pheno_filtered.shape, sep="\n")

(2417, 33748)
(2417,)


In [20]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    chr22_withCOV,
    pheno_filtered,
    test_size=0.2,
    random_state=10,
)

# Build the Lasso regressor (alpha=0.1) and fit it on covariates + SNP features
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)

# Predict on the held-out split and score the predictions with MSE and R-squared
y_pred = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")


Mean Squared Error: 1.9234603878062995
R-squared: 0.19066598405817292


### Recreate the covariates file without a header

In [21]:
# Re-read the original tab-separated covariates file
cov_df = pd.read_csv("/home/mcb/users/yzhu439/RA_Project/data/Dosage_for_PCA/chr22_covariates.csv", sep="\t")

# Write it back out tab-separated with header=False and index=False, i.e. without
# the column-name row and without the row index
cov_df.to_csv('/home/mcb/users/yzhu439/RA_Project/data/Dosage_for_PCA/chr22_covNoHeader.csv', header=False, index=False, sep='\t')