In [18]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master('local[*]') \
    .config("spark.driver.memory", "16g") \
    .appName('my-cool-app') \
    .getOrCreate()

In [1]:
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, udf, mean as _mean, stddev as _stddev, log, log10
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import lit
sc = SparkContext()
spark = SparkSession(sc)

In [2]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler, VectorSlicer, PCA
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.functions import vector_to_array
from pyspark.ml.clustering import KMeans
import re
from functools import reduce

In [3]:
# Functions for Feature Engineering
def encode_cat_features(df, cat_features):

  indexed_cols = [''.join([col_name, '_indexed']) for col_name in cat_features]
  encoded_cols = [''.join([col_name, '_encoded']) for col_name in cat_features]
  string_indexers = [StringIndexer(inputCol=cat_features[i], outputCol=indexed_cols[i]) for i in range(len(cat_features))]
    
  encoder = OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols)
  
  pipline = Pipeline(stages=string_indexers + [encoder])
  
  encoded_df = pipline.fit(df).transform(df)
  encoded_df = encoded_df.drop(*indexed_cols + cat_features)

  return encoded_df

def normalize_features(df, cols, normalizer, output_cols, if_drop=True):
  """
  """
  normalizer_lst = []
  vectorized_cols = []
  vector_assembers = []
  
  if isinstance(cols, list):
    cols = {'cols': cols}
  
  if isinstance(output_cols, str):
    output_cols = {'cols': output_cols}
  
  for k, v in cols.items():
    
    temp_normalizer = normalizer.copy()
    vectorized_col = ''.join([output_cols[k], '_v'])
    vector_assember = VectorAssembler(inputCols=v, outputCol=vectorized_col)
    
    temp_normalizer.setInputCol(vectorized_col)
    temp_normalizer.setOutputCol(output_cols[k])
    
    normalizer_lst.append(temp_normalizer)
    vectorized_cols.append(vectorized_col)
    vector_assembers.append(vector_assember)
  
  pipline = Pipeline(stages=vector_assembers + normalizer_lst)
  normalized_df = pipline.fit(df).transform(df).drop(*vectorized_cols)
  
  if if_drop:
    
    for k, v in cols.items():
      
      normalized_df = normalized_df.drop(v)
  
  return normalized_df

def add_pca_features(df, g_cols, c_cols, k=40):
  
  ## normalize g-col and c-col
  std_scaler = StandardScaler(withMean=True)
  
  input_cols = {
    'g_cols': g_cols, 
    'c_cols': c_cols}
  
  output_cols = {
    'g_cols': 'g_normalized', 
    'c_cols': 'c_normalized'}
  
  normalized_df = normalize_features(df, input_cols, std_scaler, output_cols, if_drop=False)
  
  ## perform PCA on g-cols and c-cols
  g_col_pca = PCA(k=k, inputCol='g_normalized', outputCol='g_col_pca')
  c_col_pca = PCA(k=k, inputCol='c_normalized', outputCol='c_col_pca')
  
  pipeline = Pipeline(stages=[g_col_pca, c_col_pca])
  pca_df = pipeline.fit(normalized_df).transform(normalized_df)
  
  return pca_df
  
def add_stats_features(df, g_cols, c_cols):
  
  @udf('double')
  def cols_sum(*lst):

    return sum(lst)

  @udf('double')
  def cols_mean(*lst):

    n = len(lst)
    s = sum(lst)

    return s / n

  @udf('double')
  def cols_var(*lst):

    n = len(lst)
    s = sum(lst) / n
    total = 0

    for x in lst:

      total += (x - s)**2 

    return total / n
  
  @udf('double')
  def cols_min(*lst):
    
    return min(lst)
  
  @udf('double')
  def cols_max(*lst):
    
    return max(lst)
  
  stats_dict = {
    'min_stats': cols_min,
    'max_stats': cols_max,
    'var_stats': cols_var,
    'mean_stats': cols_mean,
    'sum_stats': cols_sum
  }
  
  for name, func in stats_dict.items():
    
    df = df.withColumn(''.join(['g_cols_', name]), func(*[col(g_col) for g_col in g_cols]))
    df = df.withColumn(''.join(['c_cols_', name]), func(*[col(c_col) for c_col in c_cols]))
  
  return df

def add_kmeans_features(df, g_cols, c_cols, k=2, num_iter=10):
  
  kmeans_g = KMeans(k=k, featuresCol=g_cols, predictionCol='g_col_k_mean', seed=16)
  kmeans_c = KMeans(k=k, featuresCol=c_cols, predictionCol='c_col_k_mean', seed=16)
  
  kmeans_df = kmeans_g.fit(df).transform(df)
  kmeans_df = kmeans_c.fit(kmeans_df).transform(kmeans_df)
  
  return kmeans_df

def feature_engineering(df, num_cluster=2, num_comp=40, num_iter=10):
  
  ## get g-col and c-col
  g_cols = list(filter(lambda v: re.match('g-.+', v), df.columns))
  c_cols = list(filter(lambda v: re.match('c-.+', v), df.columns))
  
  ## PCA
  pca_df = add_pca_features(df, g_cols, c_cols, num_comp)

  ## stats features on g and c cols
  stats_df = add_stats_features(pca_df, g_cols, c_cols)
  
  ## add k-means features
  kmeans_df = add_kmeans_features(stats_df, g_cols='g_normalized', c_cols='c_normalized', k=num_cluster, num_iter=num_iter)
  
  return kmeans_df

In [4]:
from tqdm import tqdm
# Multilabel Classifier
class MultiLabelClassifier:
    
    def __init__(self, clf, labels, feature_col,  
                 hyperparameters={}, 
                 predict_col=['probability','prediction'],
                 method=lambda prob_col, pred_col: float(pred_col if len(prob_col) == 1 else prob_col[1])):
        '''
        Initialize a multilabelclassifier
        clf: the model to use
        labels: a list of labels to predict
        feature_col: the feature column
        predict_col: the prediction column where the prediction is located
        hyperparameters: all optional hyperparameters that can tune
        method: a method of how to get the final prediction for one class
        '''
        self.clf = clf
        self.labels = labels
        self.feature_col = feature_col
        self.predict_col = predict_col
        self.hyperparameters = hyperparameters
        self.method = method
        self._trained_clfs = []

    def fit(self, train):
        self._trained_clfs = [self.clf(labelCol=label, featuresCol=self.feature_col, **self.hyperparameters)
                              .fit(train) 
                              for label in tqdm(self.labels)]
        return self

    def transform(self, x_test):
        get_predict = udf(self.method,FloatType())
        select_cols = [self.feature_col, 'sig_id'] # need to change if have time
        res = x_test.select(*select_cols)
        for i, clf in tqdm(enumerate(self._trained_clfs)):
            res = clf.transform(res)
            res = res.withColumn(self.labels[i], get_predict(*self.predict_col))
            select_cols.append(self.labels[i])
            res = res.select(*select_cols)
        res = res.select(*select_cols[1:])
        return res

In [5]:
# Evaluation
def score(y, y_hat, join_id):
    import math
    target_cols = y.drop(join_id).columns
    @udf('double')
    def loss(t, p):
        cut = 10**(-15)
        p = max(min(p, 1-cut),cut)
        return t * math.log(p) + (1 - t) * math.log(1 - p)
    #rename columns
    for c in target_cols:
        y_hat = y_hat.withColumnRenamed(c, c+'_y_hat')
    #inner join table on join_id
    df = y_hat.join(y, join_id, 'inner')
    name = target_cols[0]
    df = df.withColumn('log_loss', loss(name, name+'_y_hat'))
    for name in target_cols[1:]:
        df = df.withColumn('log_loss', col('log_loss')+loss(name, name+'_y_hat'))
    df = df.withColumn('log_loss', col('log_loss')/len(target_cols))
    res = df.select((-_mean(col('log_loss'))).alias('score'))
    return res

In [6]:
#file_path = '/FileStore/tables/'
file_path = './'
sample_id = 'sig_id'
train_df = spark.read.csv(file_path+'train_features.csv', header=True, inferSchema=True)
target_df = spark.read.csv(file_path+'train_targets_scored.csv', header=True, inferSchema=True)
train_drug_df = spark.read.csv(file_path+'train_drug.csv', header=True, inferSchema=True)
target_nonscored_df = spark.read.csv(file_path+'train_targets_nonscored.csv', header=True, inferSchema=True)
test_df = spark.read.csv(file_path+'test_features.csv', header=True, inferSchema=True)


In [7]:
train_df.cache()
target_df.cache()
test_df.cache()

DataFrame[sig_id: string, cp_type: string, cp_time: int, cp_dose: string, g-0: double, g-1: double, g-2: double, g-3: double, g-4: double, g-5: double, g-6: double, g-7: double, g-8: double, g-9: double, g-10: double, g-11: double, g-12: double, g-13: double, g-14: double, g-15: double, g-16: double, g-17: double, g-18: double, g-19: double, g-20: double, g-21: double, g-22: double, g-23: double, g-24: double, g-25: double, g-26: double, g-27: double, g-28: double, g-29: double, g-30: double, g-31: double, g-32: double, g-33: double, g-34: double, g-35: double, g-36: double, g-37: double, g-38: double, g-39: double, g-40: double, g-41: double, g-42: double, g-43: double, g-44: double, g-45: double, g-46: double, g-47: double, g-48: double, g-49: double, g-50: double, g-51: double, g-52: double, g-53: double, g-54: double, g-55: double, g-56: double, g-57: double, g-58: double, g-59: double, g-60: double, g-61: double, g-62: double, g-63: double, g-64: double, g-65: double, g-66: double

In [8]:
## add indicator column to both train and test so we can combine them later
train_df = train_df.withColumn('is_test', lit(0))
test_df = test_df.withColumn('is_test', lit(1))

## Combine train and test df
full_df = train_df.union(test_df)

In [9]:
## encode features
target_cols = ['cp_type', 'cp_dose']
encoded_df = encode_cat_features(full_df, target_cols)

In [10]:
## feature engineering
fe_df = feature_engineering(encoded_df, num_comp=20, num_iter=5)

## select all the feature columns

pca_cols = list(filter(lambda v: re.match('.+_pca', v), fe_df.columns))
stats_cols = list(filter(lambda v: re.match('.+_stats', v), fe_df.columns))
k_means_cols = list(filter(lambda v: re.match('.+_k_mean', v), fe_df.columns))
cat_cols = list(filter(lambda v: re.match('.+_encoded', v), fe_df.columns)) + ['cp_time']

## stack them to a single feature vector
vector_assember_train = VectorAssembler(inputCols=pca_cols + stats_cols + k_means_cols + cat_cols, outputCol='all_features')
fe_df = vector_assember_train.transform(fe_df)

## normalize all the features
normalizer = StandardScaler(withMean=True)
cols = ['all_features']
output_cols = 'features'
fe_df = normalize_features(fe_df, cols, normalizer, output_cols, if_drop=False)

## split train, test df
fe_train = fe_df.filter(fe_df['is_test'] == 0)
final_test = fe_df.filter(fe_df['is_test'] == 1).select(['sig_id', 'features'])

## join training target with training features
labels = target_df.drop('sig_id').columns
final_train = fe_train.join(target_df, ['sig_id']).select(*(['sig_id','features']+labels))

#### 

In [11]:
"""
This is just for test. you can delete this after optimize feature engineering
"""

# preprocess data
train_feature_df = train_df.drop(*['cp_type', 'cp_time','cp_dose'])

# assemble features
assemble_cols = train_feature_df.drop(sample_id).columns
vector_assember_train = VectorAssembler(inputCols=assemble_cols, outputCol='features')
train_feature_df = vector_assember_train.transform(train_feature_df).select(sample_id, 'features')
train_feature_df = train_feature_df.select(sample_id, 'features')

# join features with all targets
final_train = train_feature_df.join(target_df, sample_id, 'inner')


In [12]:
## train test split
(train, validation) = final_train.randomSplit([0.8, 0.2], 16)

train.cache()
validation.cache()

DataFrame[sig_id: string, features: vector, 5-alpha_reductase_inhibitor: int, 11-beta-hsd1_inhibitor: int, acat_inhibitor: int, acetylcholine_receptor_agonist: int, acetylcholine_receptor_antagonist: int, acetylcholinesterase_inhibitor: int, adenosine_receptor_agonist: int, adenosine_receptor_antagonist: int, adenylyl_cyclase_activator: int, adrenergic_receptor_agonist: int, adrenergic_receptor_antagonist: int, akt_inhibitor: int, aldehyde_dehydrogenase_inhibitor: int, alk_inhibitor: int, ampk_activator: int, analgesic: int, androgen_receptor_agonist: int, androgen_receptor_antagonist: int, anesthetic_-_local: int, angiogenesis_inhibitor: int, angiotensin_receptor_antagonist: int, anti-inflammatory: int, antiarrhythmic: int, antibiotic: int, anticonvulsant: int, antifungal: int, antihistamine: int, antimalarial: int, antioxidant: int, antiprotozoal: int, antiviral: int, apoptosis_stimulant: int, aromatase_inhibitor: int, atm_kinase_inhibitor: int, atp-sensitive_potassium_channel_antago

In [13]:
# Train NaiveBayes
from pyspark.ml.classification import DecisionTreeClassifier, NaiveBayes
import time
labels = target_df.drop(sample_id).columns

print('Start training')
start = time.time()
clf = MultiLabelClassifier(NaiveBayes, labels, 'features', hyperparameters={'modelType':'gaussian'}).fit(train)
print('Train finished with time:', time.time() - start)


  0%|          | 0/206 [00:00<?, ?it/s]

Start training


100%|██████████| 206/206 [05:39<00:00,  1.65s/it]

Train finished with time: 339.50728964805603





In [14]:
# Evaluate the NaiveBayes model
print('Start training prediction')
start = time.time()
train_pred = clf.transform(train)
print('Training prediction finished! time:', time.time() - start)


print('Start validation prediction')
start = time.time()
validation_pred = clf.transform(validation)
print('Validation prediction finished! time:', time.time() - start)

2it [00:00, 17.01it/s]

Start training prediction


206it [01:20,  2.56it/s]
3it [00:00, 21.33it/s]

Training prediction finished! time: 80.90630388259888
Start validation prediction


206it [01:23,  2.46it/s]


Validation prediction finished! time: 84.10104489326477


In [15]:
y = target_df.select(*[sample_id]+labels)
score(y, train_pred, sample_id).show()

+------------------+
|             score|
+------------------+
|15.863481433201864|
+------------------+



In [20]:
# Train Decision Tree
import time
labels = target_df.drop(sample_id).columns

print('Start training')
start = time.time()
clf = MultiLabelClassifier(DecisionTreeClassifier, labels, 'features').fit(train)
print('Train finished with time:', time.time() - start)





  0%|          | 0/206 [00:00<?, ?it/s][A[A[A

Start training





  0%|          | 1/206 [00:12<41:35, 12.17s/it][A[A[A


  1%|          | 2/206 [00:22<39:32, 11.63s/it][A[A[A


  1%|▏         | 3/206 [00:33<38:15, 11.31s/it][A[A[A


  2%|▏         | 4/206 [00:43<37:35, 11.17s/it][A[A[A


  2%|▏         | 5/206 [00:55<37:21, 11.15s/it][A[A[A


  3%|▎         | 6/206 [01:05<36:45, 11.03s/it][A[A[A


  3%|▎         | 7/206 [01:16<36:39, 11.05s/it][A[A[A


  4%|▍         | 8/206 [01:27<35:58, 10.90s/it][A[A[A


  4%|▍         | 9/206 [01:38<35:30, 10.82s/it][A[A[A


  9%|▉         | 19/206 [18:52:16<185:43:54, 3575.59s/it]
  5%|▌         | 11/206 [17:52:37<316:54:46, 5850.70s/it]



  5%|▌         | 11/206 [01:59<34:41, 10.67s/it][A[A[A


  6%|▌         | 12/206 [02:08<33:21, 10.32s/it][A[A[A


  6%|▋         | 13/206 [02:18<32:44, 10.18s/it][A[A[A


  7%|▋         | 14/206 [02:28<32:19, 10.10s/it][A[A[A


  7%|▋         | 15/206 [02:38<31:42,  9.96s/it][A[A[A


  8%|▊         | 16/206 [02:48<31:40, 10.00s/it

Train finished with time: 2130.5782680511475





In [21]:
# Evaluate the decision tree model
print('Start training prediction')
start = time.time()
train_pred = clf.transform(train)
print('Training prediction finished! time:', time.time() - start)


print('Start validation prediction')
start = time.time()
validation_pred = clf.transform(validation)
print('Validation prediction finished! time:', time.time() - start)

2it [00:00, 13.59it/s]

Start training prediction


206it [01:19,  2.58it/s]
2it [00:00, 19.01it/s]

Training prediction finished! time: 80.20718717575073
Start validation prediction


206it [01:16,  2.69it/s]


Validation prediction finished! time: 76.89807796478271


In [22]:
y = target_df.select(*[sample_id]+labels)
score(y, train_pred, sample_id).show()

+--------------------+
|               score|
+--------------------+
|0.013922339833575869|
+--------------------+

