In [141]:
from sklearn import preprocessing, linear_model, metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [142]:
# Load the three tab-separated tables in one pass: the pre-split train and
# test sets, followed by the full regression table.
train_data, test_data, data = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "../../data/train_data.tsv",
        "../../data/test_data.tsv",
        "../../data/regression_data.tsv",
    )
)

# Show every column name from the fourth column onward (the printed names
# are the per-drug "*_AUC" response columns).
print(np.asarray(data.columns[3:]))

['zebularine_AUC' 'azacitidine_AUC' 'nelarabine_AUC' 'myricetin_AUC'
 'BRD-K64610608_AUC' 'ML334 diastereomer_AUC' 'BRD-K09344309_AUC'
 'isonicotinohydroxamic acid_AUC' 'QS-11_AUC' 'brivanib_AUC' 'BRD8958_AUC'
 'BRD-K34099515_AUC' 'A-804598_AUC' 'erismodegib_AUC' 'abiraterone_AUC'
 'ifosfamide_AUC' 'temozolomide_AUC' 'BRD-A05715709_AUC'
 'BRD-K48477130_AUC' 'CAY10594_AUC' 'WP1130_AUC' 'tamoxifen_AUC'
 'importazole_AUC' 'ML006_AUC' 'AM-580_AUC' 'CD-1530_AUC'
 'silmitasertib_AUC' 'PRL-3 inhibitor I_AUC' 'NPC-26_AUC'
 'betulinic acid_AUC' 'salermide_AUC' 'BRD-M00053801_AUC' 'AA-COCF3_AUC'
 'CI-976_AUC' 'pifithrin-alpha_AUC' 'purmorphamine_AUC' 'BIBR-1532_AUC'
 'niclosamide_AUC' 'bardoxolone methyl_AUC' 'hyperforin_AUC' 'BRD4132_AUC'
 'regorafenib_AUC' 'sorafenib_AUC' 'BRD-K51490254_AUC' 'EX-527_AUC'
 'bendamustine_AUC' 'PYR-41_AUC' 'B02_AUC' 'ML031_AUC' 'tipifarnib-P1_AUC'
 'BRD9876_AUC' 'parthenolide_AUC' 'tandutinib_AUC' 'CID-5951923_AUC'
 'DBeQ_AUC' 'LE-135_AUC' 'BRD-K24690302_AUC' 'ML

In [143]:
# One-hot encoder for the categorical predictor columns. Categories that were
# not seen while fitting are handled with the 'infrequent_if_exist' policy.
encoder = preprocessing.OneHotEncoder(handle_unknown='infrequent_if_exist')

# Research notes
# --------------
# Rotate `response` between the different drug sensitivities. A bunch of them
# give an R^2 close to 0, which means that discriminating by site is about as
# effective as guessing the overall mean AUC for that drug regardless of site.
# In other words, site is not a good predictor of AUC: even at the sites where
# a drug / cell line combination has a low AUC, the high-AUC cell lines get in
# the way.
#
# Possible alternative: take the 30 lowest AUCs in each set and then verify
#   1) that the mean of this sample is substantially different from the mean
#      of the overall set (or from the mean of the rest), and
#   2) that certain sites or histologies are "enriched" relative to the
#      majority.
# A t-test, followed by a chi-squared test?
#
# Caveat: the data being fitted goes up and down sharply -- the AUC is really
# low (indicating sensitivity) only for a few drugs, and only for a few cell
# lines within those. Open question: is a "good", i.e. high-R^2, model supposed
# to find the sites or histologies where a particular ... (the original note
# was left unfinished here).
response = 'myricetin_AUC'
predictors = ['site']  # alternative: ['site', 'histology']

# Restrict each split to the predictor and response columns and drop rows that
# have a missing value in any of them. Only the predictors get encoded below;
# the response stays in the frame untouched.
train_raw = train_data[predictors + [response]].dropna()
test_raw = test_data[predictors + [response]].dropna()
print(train_raw.shape, test_raw.shape)

# Plain arrays holding just the predictor columns.
train_X_raw = train_raw[predictors].to_numpy()
test_X_raw = test_raw[predictors].to_numpy()

# Learn the categories from the training split only, then expand both splits
# into dense dummy-variable matrices using those same categories.
train_X_enc = encoder.fit_transform(train_X_raw).toarray()
test_X_enc = encoder.transform(test_X_raw).toarray()
print(train_X_enc.shape)
print(encoder.categories_)

(468, 2) (153, 2)
(468, 18)
[array(['Ewings_sarcoma_peripheral_primitive_neuroectodermal_tumor',
       'carcinoid_endocrine_tumor', 'carcinoma', 'chondrosarcoma',
       'fibrosarcoma', 'glioma', 'haematopoietic_neoplasm',
       'leiomyosarcoma', 'lymphoid_neoplasm', 'malignant_melanoma',
       'mesothelioma', 'neuroblastoma', 'osteosarcoma',
       'primitive_neuroectodermal_tumor_medulloblastoma',
       'rhabdoid_tumor', 'rhabdomyosarcoma', 'sarcoma',
       'sex_cord_stromal_tumor'], dtype=object)]


In [144]:
#example regression: use one of the drug AUCs as the response
train_Y = train_raw[response]

model = linear_model.LinearRegression()
model.fit(train_X_enc, train_Y)

Y_predict = model.predict(test_X_enc)
print(Y_predict)

# The coefficients
#print("Coefficients: \n", regr.coef_)
print(model.coef_)
# The mean squared error
print("Mean squared error: %.2f" % metrics.mean_squared_error(test_raw[response], Y_predict))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % metrics.r2_score(test_raw[response], Y_predict))
print("Explained variance: %.2f" % metrics.explained_variance_score(test_raw[response], Y_predict))

def total_sum_of_squares
#print("Total sum of squares: %.2f" % (test_raw[response], Y_predict))

[15.14329577 15.14329577 15.14329577 15.64       15.14329577 15.14329577
 15.14329577 15.14329577 15.0947619  15.0947619  15.0947619  15.0947619
 15.14329577 15.0947619  15.0947619  15.14329577 15.14329577 15.14329577
 15.14329577 15.0947619  15.14329577 15.14329577 15.14329577 15.14329577
 15.14329577 15.0947619  15.14329577 15.0947619  15.0947619  15.14329577
 15.0947619  15.14329577 14.75       15.0947619  15.14329577 15.14329577
 15.14329577 15.14329577 15.14329577 15.14329577 15.14329577 15.14329577
 15.14329577 15.0947619  14.95441176 15.14329577 15.14329577 15.14329577
 14.95441176 14.95441176 15.14329577 15.14329577 14.95441176 14.95441176
 15.14329577 15.14329577 15.14329577 15.14329577 14.95441176 15.14329577
 15.14329577 14.77       15.076      14.95441176 15.14329577 15.14329577
 15.14329577 15.14329577 15.14329577 15.14329577 15.14329577 14.95441176
 14.95441176 15.14329577 15.14329577 15.0947619  15.14329577 15.14329577
 15.14329577 14.95441176 15.14329577 15.14329577 15.