In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer

import tensorflow as tf
from tensorflow import keras


import matplotlib.pyplot as plt

In [2]:
gene_effect_df = pd.read_csv("CRISPRGeneEffect.csv").rename(columns = {'Unnamed: 0': 'ModelID'})
model_df = pd.read_csv("Model.csv")

merged_df = pd.merge(gene_effect_df, model_df[['ModelID', 'OncotreePrimaryDisease']], on='ModelID', how='inner')

new_cols = gene_effect_df.columns.tolist()
new_cols.insert(1, "OncotreePrimaryDisease")

gene_df = merged_df[new_cols]

In [3]:
categorical_mapping = {item: idx for idx, item in enumerate(list(gene_df['OncotreePrimaryDisease'].unique()))}
gene_df['Disease as Number'] = gene_df['OncotreePrimaryDisease'].map(categorical_mapping)

gene_df = gene_df.fillna(0)

gene_df = gene_df.set_index('ModelID')

# gene_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df['Disease as Number'] = gene_df['OncotreePrimaryDisease'].map(categorical_mapping)


In [4]:
X = gene_df.drop(columns = ['Disease as Number', 'OncotreePrimaryDisease'])
y = gene_df['Disease as Number']

# Linear Regression

### Normal

In [5]:
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(X, y, test_size=0.1)

model_linear = LinearRegression()
model_linear.fit(X_train_linear, y_train_linear)

y_pred_linear = model_linear.predict(X_test_linear)

mse = mean_squared_error(y_test_linear, y_pred_linear)
r2 = r2_score(y_test_linear, y_pred_linear)

print(r2)

0.12431875133444503


### Cross Validation

In [6]:
cv = KFold(n_splits = 10, random_state = 1, shuffle=True)

cv_scores_linear = cross_val_score(model_linear, X, y, cv=cv)
avg_score_linear = np.mean(cv_scores_linear)

print(avg_score_linear)

0.24886373681398802


# Logistic Regression

### Normal

In [7]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y, test_size=0.1)

model_log = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
model_log.fit(X_train_log, y_train_log)

y_pred_log = model_log.predict(X_test_log)

accuracy_log = accuracy_score(y_test_log, y_pred_log)
print(accuracy_log)

0.6


### Cross Validation

In [8]:
kf = KFold(n_splits = 10, random_state = 1, shuffle=True)

cv_scores_log = cross_val_score(model_log, X, y, cv=kf)
avg_score_log = np.mean(cv_scores_log)
print(cv_scores_log)
print(avg_score_log)

[0.70909091 0.6        0.55454545 0.60909091 0.64545455 0.64545455
 0.63636364 0.61818182 0.69090909 0.64545455]
0.6354545454545455


#  KNN

### Normal

In [9]:
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X, y, test_size=0.1)

model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train_knn, y_train_knn)

y_pred_knn = model_knn.predict(X_test_knn)

accuracy_knn = accuracy_score(y_test_knn, y_pred_knn)
print(accuracy_knn)

0.2636363636363636


### Cross Validation

In [10]:
kf_knn = KFold(n_splits = 10, random_state = 1, shuffle=True)

cv_scores_knn = cross_val_score(model_knn, X, y, cv = kf_knn)
avg_score_knn = np.mean(cv_scores_knn)
print(cv_scores_knn)
print(avg_score_knn)

[0.39090909 0.23636364 0.14545455 0.3        0.30909091 0.26363636
 0.3        0.26363636 0.3        0.27272727]
0.27818181818181814


# Random Forest

### Normal

In [11]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.1)

model_rf = RandomForestClassifier(n_estimators=100)
model_rf.fit(X_train_rf, y_train_rf)

y_pred_rf = model_rf.predict(X_test_rf)

accuracy_rf = accuracy_score(y_test_rf, y_pred_rf)
print(accuracy_rf)

0.41818181818181815


### Cross Validation

In [12]:
kf_rf = KFold(n_splits = 10, random_state = 1, shuffle = True)

cv_scores_rf = cross_val_score(model_rf, X, y, cv = kf_rf)
avg_score_rf = np.mean(cv_scores_rf)
print(cv_scores_rf)
print(avg_score_rf)

[0.4        0.36363636 0.36363636 0.39090909 0.36363636 0.47272727
 0.41818182 0.37272727 0.39090909 0.43636364]
0.39727272727272733


# Gradient Boosting

### Normal

In [13]:
# X_train_gb, X_test_gb, y_train_gb, y_test_gb = train_test_split(X, y, test_size=0.1)

# model_gb = GradientBoostingClassifier(n_estimators=100)
# # model_gb.fit(X_train_gb, y_train_gb)

# y_pred_gb = model_gb.predict(X_test_gb)

# accuracy_gb = accuracy_score(y_test_gb, y_pred_gb)
# print(accuracy_gb)

### Cross Validation

In [14]:
# kf_gb = KFold(n_splits = 10, random_state = 1, shuffle = True)

# cv_scores_gb = cross_val_score(model_gb, X, y, cv = kf_gb)
# avg_score_gb = np.mean(cv_scores_gb)
# print(cv_scores_gb)
# print(avg_score_gb)

# Support Vector Machine

### Normal

In [15]:
X_train_sv, X_test_sv, y_train_sv, y_test_sv = train_test_split(X, y, test_size=0.1)

model_sv = SVC(kernel = 'rbf', gamma = 'auto')
model_sv.fit(X_train_sv, y_train_sv)

y_pred_sv = model_sv.predict(X_test_sv)

accuracy_sv = accuracy_score(y_test_sv, y_pred_sv)
print(accuracy_sv)

0.07272727272727272


### Cross Validation

In [16]:
kf_sv = KFold(n_splits = 10, random_state = 1, shuffle = True)

cv_scores_sv = cross_val_score(model_sv, X, y, cv = kf_sv)
avg_score_sv = np.mean(cv_scores_sv)
print(cv_scores_sv)
print(avg_score_sv)

[0.09090909 0.11818182 0.08181818 0.07272727 0.05454545 0.1
 0.08181818 0.07272727 0.13636364 0.05454545]
0.08636363636363636


# Neural Network

In [17]:
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X, y, test_size=0.1)


model = keras.models.Sequential([
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(73, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=["accuracy"])

history = model.fit(X_train_nn, y_train_nn,
          batch_size=128, epochs=30,
          verbose=1,
          validation_data=(X_test_nn, y_test_nn))

model.evaluate(X_test_nn, y_test_nn)

Epoch 1/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 86ms/step - accuracy: 0.0312 - loss: 4.1791 - val_accuracy: 0.1091 - val_loss: 3.9579
Epoch 2/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.0922 - loss: 3.8971 - val_accuracy: 0.1273 - val_loss: 3.9061
Epoch 3/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1348 - loss: 3.7935 - val_accuracy: 0.1000 - val_loss: 3.8986
Epoch 4/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1161 - loss: 3.6948 - val_accuracy: 0.1273 - val_loss: 3.8283
Epoch 5/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1606 - loss: 3.6742 - val_accuracy: 0.1636 - val_loss: 3.7813
Epoch 6/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.1756 - loss: 3.5289 - val_accuracy: 0.0818 - val_loss: 3.7560
Epoch 7/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━

[2.8289783000946045, 0.20909090340137482]

In [18]:
# model.evaluate(X_test_nn, y_test_nn)

X_cv = np.array(X)
y_cv = np.array(y)

# Define the number of folds
kf = KFold(n_splits=5, shuffle=True)

# for train_index, val_index in kf.split(X_cv):
#     X_train, X_val = X_cv[train_index], X_cv[val_index]
#     y_train, y_val = y_cv[train_index], y_cv[val_index]
    
# print(X_train.shape)
# print(X_val.shape)

model = keras.models.Sequential()

model = keras.models.Sequential([
#     keras.layers.Flatten(input_shape=[128, 128]),
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(73, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=["accuracy"])

# Perform k-fold cross-validation
fold_accuracies = []
for train_index, val_index in kf.split(X_cv):
    X_train, X_val = X_cv[train_index], X_cv[val_index]
    y_train, y_val = y_cv[train_index], y_cv[val_index]
    
    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)
    
    # Evaluate the model on validation data
    _, accuracy = model.evaluate(X_val, y_val, verbose=1)
    fold_accuracies.append(accuracy)

# Calculate average accuracy
avg_accuracy = np.mean(fold_accuracies)
print("Average Accuracy:", avg_accuracy)

Epoch 1/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.0769 - loss: 4.1345
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0770 - loss: 3.7440
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.0908 - loss: 3.6062
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1384 - loss: 3.5366
Epoch 5/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.1356 - loss: 3.4438
Epoch 6/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1808 - loss: 3.3226
Epoch 7/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.2178 - loss: 3.2837
Epoch 8/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.2145 - loss: 3.1282
Epoch 9/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━

# PCA