In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer

import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt

In [2]:
gene_effect_df = pd.read_csv("CRISPRGeneEffect.csv").rename(columns = {'Unnamed: 0': 'ModelID'})
model_df = pd.read_csv("Model.csv")

merged_df = pd.merge(gene_effect_df, model_df[['ModelID','CellLineName','OncotreeLineage']], on='ModelID', how='inner')

new_cols = gene_effect_df.columns.tolist()
new_cols.insert(1, "OncotreeLineage")
# new_cols.insert(2, "CellLineName")

gene_df = merged_df[new_cols]

In [3]:
categorical_mapping = {item: idx for idx, item in enumerate(list(gene_df['OncotreeLineage'].unique()))}
gene_df['Lineage as Number'] = gene_df['OncotreeLineage'].map(categorical_mapping)

gene_df = gene_df.fillna(0)

gene_df = gene_df.set_index('ModelID')

# gene_df.head()

In [4]:
X = gene_df.drop(columns = ['Lineage as Number', 'OncotreeLineage'])
y = gene_df['Lineage as Number']

# Linear Regression

### Normal

In [5]:
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(X, y, test_size=0.1)

model_linear = LinearRegression()
model_linear.fit(X_train_linear, y_train_linear)

y_pred_linear = model_linear.predict(X_test_linear)

mse = mean_squared_error(y_test_linear, y_pred_linear)
r2 = r2_score(y_test_linear, y_pred_linear)

print(r2)

0.45580046640399585


### Cross Validation

In [6]:
cv = KFold(n_splits = 10, random_state = 1, shuffle=True)

cv_scores_linear = cross_val_score(model_linear, X, y, cv=cv)
avg_score_linear = np.mean(cv_scores_linear)

print(avg_score_linear)

0.3090769693580074


# Logistic Regression

### Normal

In [7]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y, test_size=0.1)

model_log = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
model_log.fit(X_train_log, y_train_log)

y_pred_log = model_log.predict(X_test_log)

accuracy_log = accuracy_score(y_test_log, y_pred_log)
print(accuracy_log)

0.6909090909090909


### Cross Validation

In [9]:
kf = KFold(n_splits = 10, random_state = 1, shuffle=True)

cv_scores_log = cross_val_score(model_log, X, y, cv=kf)
avg_score_log = np.mean(cv_scores_log)
print(cv_scores_log)
print(avg_score_log)

[0.74545455 0.60909091 0.58181818 0.65454545 0.69090909 0.69090909
 0.69090909 0.66363636 0.69090909 0.74545455]
0.6763636363636363


#  KNN

### Normal

In [10]:
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X, y, test_size=0.1)

model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train_knn, y_train_knn)

y_pred_knn = model_knn.predict(X_test_knn)

accuracy_knn = accuracy_score(y_test_knn, y_pred_knn)
print(accuracy_knn)

0.2636363636363636


### Cross Validation

In [11]:
kf_knn = KFold(n_splits = 10, random_state = 1, shuffle=True)

cv_scores_knn = cross_val_score(model_knn, X, y, cv = kf_knn)
avg_score_knn = np.mean(cv_scores_knn)
print(cv_scores_knn)
print(avg_score_knn)

[0.41818182 0.29090909 0.19090909 0.36363636 0.33636364 0.29090909
 0.30909091 0.31818182 0.32727273 0.29090909]
0.3136363636363636


# Random Forest

### Normal

In [12]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.1)

model_rf = RandomForestClassifier(n_estimators=100)
model_rf.fit(X_train_rf, y_train_rf)

y_pred_rf = model_rf.predict(X_test_rf)

accuracy_rf = accuracy_score(y_test_rf, y_pred_rf)
print(accuracy_rf)

0.37272727272727274


### Cross Validation

In [13]:
kf_rf = KFold(n_splits = 10, random_state = 1, shuffle = True)

cv_scores_rf = cross_val_score(model_rf, X, y, cv = kf_rf)
avg_score_rf = np.mean(cv_scores_rf)
print(cv_scores_rf)
print(avg_score_rf)

[0.52727273 0.32727273 0.34545455 0.36363636 0.43636364 0.43636364
 0.4        0.42727273 0.4        0.47272727]
0.4136363636363637


# Gradient Boosting

### Normal

In [14]:
# X_train_gb, X_test_gb, y_train_gb, y_test_gb = train_test_split(X, y, test_size=0.1)

# model_gb = GradientBoostingClassifier(n_estimators=100)
# model_gb.fit(X_train_gb, y_train_gb)

# y_pred_gb = model_gb.predict(X_test_gb)

# accuracy_gb = accuracy_score(y_test_gb, y_pred_gb)
# print(accuracy_gb)

### Cross Validation

In [15]:
# kf_gb = KFold(n_splits = 10, random_state = 1, shuffle = True)

# cv_scores_gb = cross_val_score(model_gb, X, y, cv = kf_gb)
# avg_score_gb = np.mean(cv_scores_gb)
# print(cv_scores_gb)
# print(avg_score_gb)

# Support Vector Machine

### Normal

In [16]:
X_train_sv, X_test_sv, y_train_sv, y_test_sv = train_test_split(X, y, test_size=0.1)

model_sv = SVC(kernel = 'rbf', gamma = 'auto')
model_sv.fit(X_train_sv, y_train_sv)

y_pred_sv = model_sv.predict(X_test_sv)

accuracy_sv = accuracy_score(y_test_sv, y_pred_sv)
print(accuracy_sv)

0.16363636363636364


### Cross Validation

In [17]:
kf_sv = KFold(n_splits = 10, random_state = 1, shuffle = True)
    
cv_scores_sv = cross_val_score(model_sv, X, y, cv = kf_sv)
avg_score_sv = np.mean(cv_scores_sv)
print(cv_scores_sv)
print(avg_score_sv)

[0.10909091 0.11818182 0.09090909 0.07272727 0.09090909 0.10909091
 0.1        0.13636364 0.17272727 0.08181818]
0.10818181818181818


# Neural Network

### Normal

In [18]:
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X, y, test_size=0.1)


model = keras.models.Sequential([
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(28, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=["accuracy"])

history = model.fit(X_train_nn, y_train_nn,
          batch_size=128, epochs=30,
          verbose=1,
          validation_data=(X_test_nn, y_test_nn))

model.evaluate(X_test_nn, y_test_nn)

Epoch 1/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 88ms/step - accuracy: 0.0806 - loss: 3.2637 - val_accuracy: 0.0909 - val_loss: 3.1636
Epoch 2/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.1166 - loss: 3.0931 - val_accuracy: 0.1545 - val_loss: 3.0632
Epoch 3/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.1591 - loss: 3.0131 - val_accuracy: 0.1000 - val_loss: 3.0038
Epoch 4/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.1290 - loss: 2.9708 - val_accuracy: 0.2455 - val_loss: 2.9270
Epoch 5/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1976 - loss: 2.8886 - val_accuracy: 0.2364 - val_loss: 2.9205
Epoch 6/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.2175 - loss: 2.8411 - val_accuracy: 0.2182 - val_loss: 2.8220
Epoch 7/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━

[2.1976165771484375, 0.4727272689342499]

In [19]:
# historysgd = model.fit(X_train_nn, y_train_nn, epochs=30,
#                     validation_data=(X_test_nn, y_test_nn))

### Cross Validation

In [20]:
X_cv = np.array(X)
y_cv = np.array(y)

# Define the number of folds
kf = KFold(n_splits=5, shuffle=True)

# for train_index, val_index in kf.split(X_cv):
#     X_train, X_val = X_cv[train_index], X_cv[val_index]
#     y_train, y_val = y_cv[train_index], y_cv[val_index]
    
# print(X_train.shape)
# print(X_val.shape)

model = keras.models.Sequential()

model = keras.models.Sequential([
#     keras.layers.Flatten(input_shape=[128, 128]),
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(28, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=["accuracy"])

# Perform k-fold cross-validation
fold_accuracies = []
for train_index, val_index in kf.split(X_cv):
    X_train, X_val = X_cv[train_index], X_cv[val_index]
    y_train, y_val = y_cv[train_index], y_cv[val_index]
    
    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)
    
    # Evaluate the model on validation data
    _, accuracy = model.evaluate(X_val, y_val, verbose=1)
    fold_accuracies.append(accuracy)

# Calculate average accuracy
avg_accuracy = np.mean(fold_accuracies)
print("Average Accuracy:", avg_accuracy)

Epoch 1/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.1009 - loss: 3.2225
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.1478 - loss: 3.0534
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.1524 - loss: 2.9696
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.2213 - loss: 2.7934
Epoch 5/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1867 - loss: 2.7579
Epoch 6/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.2386 - loss: 2.6308
Epoch 7/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.2659 - loss: 2.5256
Epoch 8/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.2702 - loss: 2.4514
Epoch 9/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━

# PCA