In [1]:
# allocate memory
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master('local[*]') \
    .config("spark.driver.memory", "16g") \
    .appName('my-cool-app') \
    .getOrCreate()

In [5]:
import findspark
findspark.init('/Users/jungleshen/spark/spark-3.0.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession

In [6]:
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, udf, mean as _mean, stddev as _stddev, log, log10
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import lit

from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler, VectorSlicer, PCA
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier
from pyspark.ml.functions import vector_to_array
from pyspark.ml.clustering import KMeans
import re
from functools import reduce

from tqdm import tqdm
from pyspark.ml import Pipeline

In [8]:
train_df= spark.read.csv('train_features.csv', header=True, inferSchema=True)
target_df = spark.read.csv('train_targets_scored.csv', header=True, inferSchema=True)
test_df = spark.read.csv('test_features.csv', header=True, inferSchema=True)

In [9]:
train_df.cache()
target_df.cache()
test_df.cache()

DataFrame[sig_id: string, cp_type: string, cp_time: int, cp_dose: string, g-0: double, g-1: double, g-2: double, g-3: double, g-4: double, g-5: double, g-6: double, g-7: double, g-8: double, g-9: double, g-10: double, g-11: double, g-12: double, g-13: double, g-14: double, g-15: double, g-16: double, g-17: double, g-18: double, g-19: double, g-20: double, g-21: double, g-22: double, g-23: double, g-24: double, g-25: double, g-26: double, g-27: double, g-28: double, g-29: double, g-30: double, g-31: double, g-32: double, g-33: double, g-34: double, g-35: double, g-36: double, g-37: double, g-38: double, g-39: double, g-40: double, g-41: double, g-42: double, g-43: double, g-44: double, g-45: double, g-46: double, g-47: double, g-48: double, g-49: double, g-50: double, g-51: double, g-52: double, g-53: double, g-54: double, g-55: double, g-56: double, g-57: double, g-58: double, g-59: double, g-60: double, g-61: double, g-62: double, g-63: double, g-64: double, g-65: double, g-66: double

In [10]:
def encode_cat_features(df, cat_features):

  indexed_cols = [''.join([col_name, '_indexed']) for col_name in cat_features]
  encoded_cols = [''.join([col_name, '_encoded']) for col_name in cat_features]
  string_indexers = [StringIndexer(inputCol=cat_features[i], outputCol=indexed_cols[i]) for i in range(len(cat_features))]
    
  encoder = OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols)
  
  pipline = Pipeline(stages=string_indexers + [encoder])
  
  encoded_df = pipline.fit(df).transform(df)
  encoded_df = encoded_df.drop(*indexed_cols + cat_features)

  return encoded_df

def normalize_features(df, cols, normalizer, output_cols, if_drop=True):
  """
  """
  normalizer_lst = []
  vectorized_cols = []
  vector_assembers = []
  
  if isinstance(cols, list):
    cols = {'cols': cols}
  
  if isinstance(output_cols, str):
    output_cols = {'cols': output_cols}
  
  for k, v in cols.items():
    
    temp_normalizer = normalizer.copy()
    vectorized_col = ''.join([output_cols[k], '_v'])
    vector_assember = VectorAssembler(inputCols=v, outputCol=vectorized_col)
    
    temp_normalizer.setInputCol(vectorized_col)
    temp_normalizer.setOutputCol(output_cols[k])
    
    normalizer_lst.append(temp_normalizer)
    vectorized_cols.append(vectorized_col)
    vector_assembers.append(vector_assember)
  
  pipline = Pipeline(stages=vector_assembers + normalizer_lst)
  normalized_df = pipline.fit(df).transform(df).drop(*vectorized_cols)
  
  if if_drop:
    
    for k, v in cols.items():
      
      normalized_df = normalized_df.drop(v)
  
  return normalized_df

def add_pca_features(df, g_cols, c_cols, k=40):
  
  ## normalize g-col and c-col
  std_scaler = StandardScaler(withMean=True)
  
  input_cols = {
    'g_cols': g_cols, 
    'c_cols': c_cols}
  
  output_cols = {
    'g_cols': 'g_normalized', 
    'c_cols': 'c_normalized'}
  
  normalized_df = normalize_features(df, input_cols, std_scaler, output_cols, if_drop=False)
  
  ## perform PCA on g-cols and c-cols
  g_col_pca = PCA(k=k, inputCol='g_normalized', outputCol='g_col_pca')
  c_col_pca = PCA(k=k, inputCol='c_normalized', outputCol='c_col_pca')
  
  pipeline = Pipeline(stages=[g_col_pca, c_col_pca])
  pca_df = pipeline.fit(normalized_df).transform(normalized_df)
  
  return pca_df
  
def add_stats_features(df, g_cols, c_cols):
  
  @udf('double')
  def cols_sum(*lst):

    return sum(lst)

  @udf('double')
  def cols_mean(*lst):

    n = len(lst)
    s = sum(lst)

    return s / n

  @udf('double')
  def cols_var(*lst):

    n = len(lst)
    s = sum(lst) / n
    total = 0

    for x in lst:

      total += (x - s)**2 

    return total / n
  
  @udf('double')
  def cols_min(*lst):
    
    return min(lst)
  
  @udf('double')
  def cols_max(*lst):
    
    return max(lst)
  
  stats_dict = {
    'min_stats': cols_min,
    'max_stats': cols_max,
    'var_stats': cols_var,
    'mean_stats': cols_mean,
    'sum_stats': cols_sum
  }
  
  for name, func in stats_dict.items():
    
    df = df.withColumn(''.join(['g_cols_', name]), func(*[col(g_col) for g_col in g_cols]))
    df = df.withColumn(''.join(['c_cols_', name]), func(*[col(c_col) for c_col in c_cols]))
  
  return df

def add_kmeans_features(df, g_cols, c_cols, k=2, num_iter=10):
  
  kmeans_g = KMeans(k=k, featuresCol=g_cols, predictionCol='g_col_k_mean', seed=16)
  kmeans_c = KMeans(k=k, featuresCol=c_cols, predictionCol='c_col_k_mean', seed=16)
  
  kmeans_df = kmeans_g.fit(df).transform(df)
  kmeans_df = kmeans_c.fit(kmeans_df).transform(kmeans_df)
  
  return kmeans_df

def feature_engineering(df, num_cluster=2, num_comp=40, num_iter=10):
  
  ## get g-col and c-col
  g_cols = list(filter(lambda v: re.match('g-.+', v), df.columns))
  c_cols = list(filter(lambda v: re.match('c-.+', v), df.columns))
  
  ## PCA
  pca_df = add_pca_features(df, g_cols, c_cols, num_comp)

  ## stats features on g and c cols
  stats_df = add_stats_features(pca_df, g_cols, c_cols)
  
  ## add k-means features
  kmeans_df = add_kmeans_features(stats_df, g_cols='g_normalized', c_cols='c_normalized', k=num_cluster, num_iter=num_iter)
  
  return kmeans_df

In [11]:
## add indicator column to both train and test so we can combine them later
train_df = train_df.withColumn('is_test', lit(0))
test_df = test_df.withColumn('is_test', lit(1))

## Combine train and test df
full_df = train_df.union(test_df)

In [12]:
## encode features
target_cols = ['cp_type', 'cp_dose']
encoded_df = encode_cat_features(full_df, target_cols)

In [13]:
## feature engineering
fe_df = feature_engineering(encoded_df, num_comp=20, num_iter=5)

## select all the feature columns

pca_cols = list(filter(lambda v: re.match('.+_pca', v), fe_df.columns))
stats_cols = list(filter(lambda v: re.match('.+_stats', v), fe_df.columns))
k_means_cols = list(filter(lambda v: re.match('.+_k_mean', v), fe_df.columns))
cat_cols = list(filter(lambda v: re.match('.+_encoded', v), fe_df.columns)) + ['cp_time']

## stack them to a single feature vector
vector_assember_train = VectorAssembler(inputCols=pca_cols + stats_cols + k_means_cols + cat_cols, outputCol='all_features')
fe_df = vector_assember_train.transform(fe_df)

## normalize all the features
normalizer = StandardScaler(withMean=True)
cols = ['all_features']
output_cols = 'features'
fe_df = normalize_features(fe_df, cols, normalizer, output_cols, if_drop=False)

## split train, test df
fe_train = fe_df.filter(fe_df['is_test'] == 0)
final_test = fe_df.filter(fe_df['is_test'] == 1).select(['sig_id', 'features'])

## join training target with training features
labels = target_df.drop('sig_id').columns
final_train = fe_train.join(target_df, ['sig_id']).select(*(['sig_id','features']+labels))

In [14]:
from pyspark.ml.feature import MinMaxScaler

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(final_train)
scaledData = scalerModel.transform(final_train)

In [15]:
# train validation split
(train, validation) = scaledData.randomSplit([0.8, 0.2], 16)

train.cache()
validation.cache()
train_df.unpersist()
test_df.unpersist()
target_df.unpersist()
fe_train.unpersist()
fe_df.unpersist()
final_train.unpersist()
encoded_df.unpersist()

DataFrame[sig_id: string, cp_time: int, g-0: double, g-1: double, g-2: double, g-3: double, g-4: double, g-5: double, g-6: double, g-7: double, g-8: double, g-9: double, g-10: double, g-11: double, g-12: double, g-13: double, g-14: double, g-15: double, g-16: double, g-17: double, g-18: double, g-19: double, g-20: double, g-21: double, g-22: double, g-23: double, g-24: double, g-25: double, g-26: double, g-27: double, g-28: double, g-29: double, g-30: double, g-31: double, g-32: double, g-33: double, g-34: double, g-35: double, g-36: double, g-37: double, g-38: double, g-39: double, g-40: double, g-41: double, g-42: double, g-43: double, g-44: double, g-45: double, g-46: double, g-47: double, g-48: double, g-49: double, g-50: double, g-51: double, g-52: double, g-53: double, g-54: double, g-55: double, g-56: double, g-57: double, g-58: double, g-59: double, g-60: double, g-61: double, g-62: double, g-63: double, g-64: double, g-65: double, g-66: double, g-67: double, g-68: double, g-69

In [16]:
# Multilabel Classifier
class MultiLabelClassifier:
    
    def __init__(self, clf, labels, feature_col,  
                 hyperparameters={}, 
                 predict_col=['probability','prediction'],
                 method=lambda prob_col, pred_col: float(pred_col if len(prob_col) == 1 else prob_col[1])):
        '''
        Initialize a multilabelclassifier
        clf: the model to use
        labels: a list of labels to predict
        feature_col: the feature column
        predict_col: the prediction column where the prediction is located
        hyperparameters: all optional hyperparameters that can tune
        method: a method of how to get the final prediction for one class
        '''
        self.clf = clf
        self.labels = labels
        self.feature_col = feature_col
        self.predict_col = predict_col
        self.hyperparameters = hyperparameters
        self.method = method
        self._trained_clfs = []

    def fit(self, train):
        self._trained_clfs = [self.clf(labelCol=label, featuresCol=self.feature_col, **self.hyperparameters)
                              .fit(train) 
                              for label in tqdm(self.labels)]
        return self

    def transform(self, x_test):
        get_predict = udf(self.method,FloatType())
        select_cols = [self.feature_col, 'sig_id'] # need to change if have time
        res = x_test.select(*select_cols)
        for i, clf in tqdm(enumerate(self._trained_clfs)):
            res = clf.transform(res)
            res = res.withColumn(self.labels[i], get_predict(*self.predict_col))
            select_cols.append(self.labels[i])
            res = res.select(*select_cols)
        res = res.select(*select_cols[1:])
        return res
    
# Evaluation
def score(y, y_hat, join_id):
    import math
    target_cols = y.drop(join_id).columns
    @udf('double')
    def loss(t, p):
        cut = 10**(-15)
        p = max(min(p, 1-cut),cut)
        return t * math.log(p) + (1 - t) * math.log(1 - p)
    #rename columns
    for c in target_cols:
        y_hat = y_hat.withColumnRenamed(c, c+'_y_hat')
    #inner join table on join_id
    df = y_hat.join(y, join_id, 'inner')
    name = target_cols[0]
    df = df.withColumn('log_loss', loss(name, name+'_y_hat'))
    for name in target_cols[1:]:
        df = df.withColumn('log_loss', col('log_loss')+loss(name, name+'_y_hat'))
    df = df.withColumn('log_loss', col('log_loss')/len(target_cols))
    res = df.select((-_mean(col('log_loss'))).alias('score'))
    return res    

In [17]:
def convert_to_array(prob_col, pred_col):
            ## solve na problem, if len(prob_col) == 1, we use the prediction col
            converted_prob_col = prob_col.toArray().tolist()
            
            if len(converted_prob_col) == 1:
                
                return pred_col
            
            else:
                
                return converted_prob_col[1]

In [19]:
#training NB
hyperparameters = {'smoothing': 1,
                  'modelType':"multinomial"}
#method = convert_to_array
labels = target_df.drop('sig_id').columns
clf = MultiLabelClassifier(NaiveBayes, labels, 'scaledFeatures', hyperparameters = hyperparameters).fit(train)


  0%|          | 0/206 [00:00<?, ?it/s][A
  0%|          | 1/206 [00:02<07:25,  2.17s/it][A
  1%|          | 2/206 [00:04<07:15,  2.14s/it][A
  1%|▏         | 3/206 [00:06<07:07,  2.11s/it][A
  2%|▏         | 4/206 [00:08<07:05,  2.11s/it][A
  2%|▏         | 5/206 [00:10<07:03,  2.11s/it][A
  3%|▎         | 6/206 [00:12<07:06,  2.13s/it][A
  3%|▎         | 7/206 [00:14<07:05,  2.14s/it][A
  4%|▍         | 8/206 [00:17<07:08,  2.17s/it][A
  4%|▍         | 9/206 [00:19<07:14,  2.20s/it][A
  5%|▍         | 10/206 [00:21<07:23,  2.26s/it][A
  5%|▌         | 11/206 [00:23<07:21,  2.26s/it][A
  6%|▌         | 12/206 [00:26<07:15,  2.25s/it][A
  6%|▋         | 13/206 [00:28<07:22,  2.29s/it][A
  7%|▋         | 14/206 [00:30<07:17,  2.28s/it][A
  7%|▋         | 15/206 [00:33<07:21,  2.31s/it][A
  8%|▊         | 16/206 [00:35<07:22,  2.33s/it][A
  8%|▊         | 17/206 [00:38<07:23,  2.35s/it][A
  9%|▊         | 18/206 [00:40<07:19,  2.34s/it][A
  9%|▉         | 19/206 [00:4

In [20]:
#predict NB
res_train = clf.transform(train)


0it [00:00, ?it/s][A
1it [00:00,  3.56it/s][A
2it [00:00,  3.74it/s][A
3it [00:00,  3.93it/s][A
4it [00:00,  3.98it/s][A
5it [00:01,  4.08it/s][A
6it [00:01,  4.14it/s][A
7it [00:01,  4.23it/s][A
8it [00:01,  4.23it/s][A
9it [00:02,  4.31it/s][A
10it [00:02,  4.38it/s][A
11it [00:02,  4.50it/s][A
12it [00:02,  4.56it/s][A
13it [00:02,  4.56it/s][A
14it [00:03,  4.63it/s][A
15it [00:03,  4.67it/s][A
16it [00:03,  4.64it/s][A
17it [00:03,  4.66it/s][A
18it [00:04,  4.59it/s][A
19it [00:04,  4.61it/s][A
20it [00:04,  4.59it/s][A
21it [00:04,  4.53it/s][A
22it [00:04,  4.54it/s][A
23it [00:05,  4.53it/s][A
24it [00:05,  4.47it/s][A
25it [00:05,  4.48it/s][A
26it [00:05,  4.41it/s][A
27it [00:06,  4.38it/s][A
28it [00:06,  4.33it/s][A
29it [00:06,  4.26it/s][A
30it [00:06,  4.27it/s][A
31it [00:07,  4.21it/s][A
32it [00:07,  4.16it/s][A
33it [00:07,  4.15it/s][A
34it [00:07,  4.14it/s][A
35it [00:08,  4.05it/s][A
36it [00:08,  4.02it/s][A
37it [00:08,  

In [21]:
#evaluate NB
import time
print('Start calculating')
sample_id = "sig_id"
start = time.time()
y = target_df.select(*[sample_id]+labels)
score(y, res_train, sample_id).show()
print('Calculation finished with time:', time.time() - start)

Start calculating
+-------------------+
|              score|
+-------------------+
|0.02061951275572008|
+-------------------+

Calculation finished with time: 143.513854265213


In [None]:
start = time.time()
res_valid = clf.transform(validation)
score(y, res_valid, sample_id).show()
print('Calculation finished with time:', time.time() - start)

In [22]:
#training DT
clf2 = MultiLabelClassifier(DecisionTreeClassifier, labels, 'features').fit(train)


  0%|          | 0/206 [00:00<?, ?it/s][A
  0%|          | 1/206 [00:06<21:45,  6.37s/it][A
  1%|          | 2/206 [00:14<23:47,  7.00s/it][A
  1%|▏         | 3/206 [00:17<19:14,  5.69s/it][A
  2%|▏         | 4/206 [00:19<15:24,  4.58s/it][A
  2%|▏         | 5/206 [00:21<12:42,  3.79s/it][A
  3%|▎         | 6/206 [00:23<10:43,  3.22s/it][A
  3%|▎         | 7/206 [00:25<09:23,  2.83s/it][A
  4%|▍         | 8/206 [00:27<08:28,  2.57s/it][A
  4%|▍         | 9/206 [00:29<07:53,  2.40s/it][A
  5%|▍         | 10/206 [00:31<07:21,  2.25s/it][A
  5%|▌         | 11/206 [00:32<06:58,  2.15s/it][A
  6%|▌         | 12/206 [00:34<06:41,  2.07s/it][A
  6%|▋         | 13/206 [00:36<06:31,  2.03s/it][A
  7%|▋         | 14/206 [00:38<06:21,  1.99s/it][A
  7%|▋         | 15/206 [00:40<06:15,  1.97s/it][A
  8%|▊         | 16/206 [00:42<06:09,  1.94s/it][A
  8%|▊         | 17/206 [00:44<06:08,  1.95s/it][A
  9%|▊         | 18/206 [00:46<06:04,  1.94s/it][A
  9%|▉         | 19/206 [00:4

In [23]:
#predict DT
res2 = clf2.transform(train)


0it [00:00, ?it/s][A
1it [00:00,  3.79it/s][A
2it [00:00,  4.17it/s][A
3it [00:00,  4.46it/s][A
4it [00:00,  4.70it/s][A
5it [00:01,  4.78it/s][A
6it [00:01,  4.87it/s][A
7it [00:01,  4.93it/s][A
8it [00:01,  5.03it/s][A
9it [00:01,  5.07it/s][A
10it [00:01,  5.10it/s][A
11it [00:02,  5.10it/s][A
12it [00:02,  5.07it/s][A
13it [00:02,  5.10it/s][A
14it [00:02,  5.08it/s][A
15it [00:02,  5.02it/s][A
16it [00:03,  4.94it/s][A
17it [00:03,  4.93it/s][A
18it [00:03,  4.87it/s][A
19it [00:03,  4.82it/s][A
20it [00:04,  4.82it/s][A
21it [00:04,  4.78it/s][A
22it [00:04,  4.71it/s][A
23it [00:04,  4.67it/s][A
24it [00:04,  4.59it/s][A
25it [00:05,  4.53it/s][A
26it [00:05,  4.42it/s][A
27it [00:05,  4.33it/s][A
28it [00:05,  4.32it/s][A
29it [00:06,  4.25it/s][A
30it [00:06,  4.23it/s][A
31it [00:06,  4.23it/s][A
32it [00:06,  4.20it/s][A
33it [00:07,  4.19it/s][A
34it [00:07,  4.05it/s][A
35it [00:07,  4.04it/s][A
36it [00:07,  4.05it/s][A
37it [00:08,  

In [24]:
#evaluate DT
print('Start calculating')
start = time.time()
score(y, res2, sample_id).show()
print('Calculation finished with time:', time.time() - start)

Start calculating
+--------------------+
|               score|
+--------------------+
|0.015250109597208187|
+--------------------+

Calculation finished with time: 122.47417306900024
