In [71]:
from sklearn import preprocessing, linear_model, metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [72]:
#train_data = pd.read_csv("../../data/train_data.tsv", sep="\t")
#test_data = pd.read_csv("../../data/test_data.tsv", sep="\t")
data = pd.read_csv("../../data/regression_data.tsv", sep="\t")

In [73]:
encoder = preprocessing.OneHotEncoder(handle_unknown='infrequent_if_exist')

#rotate this between the different drug sensitivities... do i have to do all 400+??? 
#how large are these models to store? could i still deploy them?
response = 'zebularine_AUC'

#encoding the features: leave the response variable out, just preprocessing the features
relevant_data = data[['site', 'histology', response]].dropna()
print(relevant_data.shape)

#separate (raw) predictors and response variables
X = np.array(relevant_data[['site', 'histology']])
y = np.array(relevant_data[response])

#split data into train and test
train_X_raw, test_X_raw, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)  

#fit encoder to categories of the train dataset only
encoder.fit(train_X_raw)

#transform train and test data sets, introducing dummy features and new values
train_X_enc = encoder.transform(train_X_raw).toarray()
test_X_enc = encoder.transform(test_X_raw).toarray()
print(train_X_enc.shape)
print(encoder.categories_)

(631, 3)
(504, 41)
[array(['autonomic_ganglia', 'biliary_tract', 'bone', 'breast',
       'central_nervous_system', 'endometrium',
       'haematopoietic_and_lymphoid_tissue', 'kidney', 'large_intestine',
       'liver', 'lung', 'oesophagus', 'ovary', 'pancreas', 'pleura',
       'prostate', 'salivary_gland', 'skin', 'soft_tissue', 'stomach',
       'thyroid', 'upper_aerodigestive_tract', 'urinary_tract'],
      dtype=object), array(['Ewings_sarcoma_peripheral_primitive_neuroectodermal_tumor',
       'carcinoid_endocrine_tumor', 'carcinoma', 'chondrosarcoma',
       'glioma', 'haematopoietic_neoplasm', 'leiomyosarcoma',
       'lymphoid_neoplasm',
       'malignant_fibrous_histiocytoma_pleomorphic_sarcoma',
       'malignant_melanoma', 'mesothelioma', 'neuroblastoma',
       'osteosarcoma', 'primitive_neuroectodermal_tumor_medulloblastoma',
       'rhabdoid_tumor', 'rhabdomyosarcoma', 'sarcoma',
       'sex_cord_stromal_tumor'], dtype=object)]


In [75]:
#example regression: use one of the drug AUCs as the response
model = linear_model.LinearRegression()
model.fit(train_X_enc, train_y)

y_predict = model.predict(test_X_enc)
print(y_predict)
# The coefficients
#print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % metrics.mean_squared_error(test_y, y_predict))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % metrics.r2_score(test_y, y_predict))

[ 1.45156250e+01  1.45156250e+01  1.44843750e+01  1.47500000e+01
  1.43984375e+01  1.43203125e+01  1.48437500e+01  1.47500000e+01
  1.43203125e+01  1.44375000e+01  1.41796875e+01  1.43203125e+01
  1.44843750e+01  1.45156250e+01  1.43984375e+01  1.41406250e+01
  1.41796875e+01  1.45156250e+01  1.42089844e+01  1.43984375e+01
  1.46718750e+01  1.44375000e+01  1.44375000e+01  1.45312500e+01
  1.43203125e+01  1.46347656e+01  1.46718750e+01  1.46718750e+01
  1.41796875e+01  1.43984375e+01  1.44375000e+01  1.40937500e+01
  1.46347656e+01  1.43984375e+01  1.43984375e+01  1.46347656e+01
  1.43203125e+01  1.48515625e+01  1.45312500e+01  1.43203125e+01
  1.43203125e+01  1.44375000e+01  1.43203125e+01  1.43984375e+01
  1.43203125e+01  1.43203125e+01  1.46347656e+01  1.40937500e+01
  1.44375000e+01  1.44062500e+01  1.45312500e+01  1.44375000e+01
  1.33828125e+01  1.41406250e+01  1.43203125e+01  1.44062500e+01
  1.45312500e+01  1.46718750e+01  1.44062500e+01  1.27500000e+01
  1.43203125e+01  1.44375