In [6]:
import os    
import findspark
import re
from functools import reduce
from tqdm import tqdm

os.environ['SPARK_HOME'] = 'C:\\Users\\msi\\Desktop\\spark\\spark-3.0.1-bin-hadoop3.2'
findspark.init()
exec(open(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.0.1
      /_/

Using Python version 3.8.3 (default, Jul  2 2020 17:30:36)
SparkSession available as 'spark'.


In [128]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler, VectorSlicer, PCA
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col, lit, udf, mean as _mean
from pyspark.ml.clustering import KMeans

In [8]:
train_df= spark.read.csv('C:\\Users\\msi\\Onedrive\\MOA\\train_features.csv', header=True, inferSchema=True)
target_df = spark.read.csv('C:\\Users\\msi\\Onedrive\\MOA\\train_targets_scored.csv', header=True, inferSchema=True)
test_df = spark.read.csv('C:\\Users\\msi\\Onedrive\\MOA\\test_features.csv', header=True, inferSchema=True)

In [9]:
train_df.cache()
target_df.cache()
test_df.cache()

DataFrame[sig_id: string, cp_type: string, cp_time: int, cp_dose: string, g-0: double, g-1: double, g-2: double, g-3: double, g-4: double, g-5: double, g-6: double, g-7: double, g-8: double, g-9: double, g-10: double, g-11: double, g-12: double, g-13: double, g-14: double, g-15: double, g-16: double, g-17: double, g-18: double, g-19: double, g-20: double, g-21: double, g-22: double, g-23: double, g-24: double, g-25: double, g-26: double, g-27: double, g-28: double, g-29: double, g-30: double, g-31: double, g-32: double, g-33: double, g-34: double, g-35: double, g-36: double, g-37: double, g-38: double, g-39: double, g-40: double, g-41: double, g-42: double, g-43: double, g-44: double, g-45: double, g-46: double, g-47: double, g-48: double, g-49: double, g-50: double, g-51: double, g-52: double, g-53: double, g-54: double, g-55: double, g-56: double, g-57: double, g-58: double, g-59: double, g-60: double, g-61: double, g-62: double, g-63: double, g-64: double, g-65: double, g-66: double

In [10]:
def encode_cat_features(df, cat_features):

  indexed_cols = [''.join([col_name, '_indexed']) for col_name in cat_features]
  encoded_cols = [''.join([col_name, '_encoded']) for col_name in cat_features]
  string_indexers = [StringIndexer(inputCol=cat_features[i], outputCol=indexed_cols[i]) for i in range(len(cat_features))]
    
  encoder = OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols)
  
  pipline = Pipeline(stages=string_indexers + [encoder])
  
  encoded_df = pipline.fit(df).transform(df)
  encoded_df = encoded_df.drop(*indexed_cols + cat_features)

  return encoded_df

def normalize_features(df, cols, normalizer, output_cols, if_drop=True):
  """
  """
  normalizer_lst = []
  vectorized_cols = []
  vector_assembers = []
  
  if isinstance(cols, list):
    cols = {'cols': cols}
  
  if isinstance(output_cols, str):
    output_cols = {'cols': output_cols}
  
  for k, v in cols.items():
    
    temp_normalizer = normalizer.copy()
    vectorized_col = ''.join([output_cols[k], '_v'])
    vector_assember = VectorAssembler(inputCols=v, outputCol=vectorized_col)
    
    temp_normalizer.setInputCol(vectorized_col)
    temp_normalizer.setOutputCol(output_cols[k])
    
    normalizer_lst.append(temp_normalizer)
    vectorized_cols.append(vectorized_col)
    vector_assembers.append(vector_assember)
  
  pipline = Pipeline(stages=vector_assembers + normalizer_lst)
  normalized_df = pipline.fit(df).transform(df).drop(*vectorized_cols)
  
  if if_drop:
    
    for k, v in cols.items():
      
      normalized_df = normalized_df.drop(v)
  
  return normalized_df

def add_pca_features(df, g_cols, c_cols, k=40):
  
  ## normalize g-col and c-col
  std_scaler = StandardScaler(withMean=True)
  
  input_cols = {
    'g_cols': g_cols, 
    'c_cols': c_cols}
  
  output_cols = {
    'g_cols': 'g_normalized', 
    'c_cols': 'c_normalized'}
  
  normalized_df = normalize_features(df, input_cols, std_scaler, output_cols, if_drop=False)
  
  ## perform PCA on g-cols and c-cols
  g_col_pca = PCA(k=k, inputCol='g_normalized', outputCol='g_col_pca')
  c_col_pca = PCA(k=k, inputCol='c_normalized', outputCol='c_col_pca')
  
  pipeline = Pipeline(stages=[g_col_pca, c_col_pca])
  pca_df = pipeline.fit(normalized_df).transform(normalized_df)
  
  return pca_df
  
def add_stats_features(df, g_cols, c_cols):
  
  @udf('double')
  def cols_sum(*lst):

    return sum(lst)

  @udf('double')
  def cols_mean(*lst):

    n = len(lst)
    s = sum(lst)

    return s / n

  @udf('double')
  def cols_var(*lst):

    n = len(lst)
    s = sum(lst) / n
    total = 0

    for x in lst:

      total += (x - s)**2 

    return total / n
  
  @udf('double')
  def cols_min(*lst):
    
    return min(lst)
  
  @udf('double')
  def cols_max(*lst):
    
    return max(lst)
  
  stats_dict = {
    'min_stats': cols_min,
    'max_stats': cols_max,
    'var_stats': cols_var,
    'mean_stats': cols_mean,
    'sum_stats': cols_sum
  }
  
  for name, func in stats_dict.items():
    
    df = df.withColumn(''.join(['g_cols_', name]), func(*[col(g_col) for g_col in g_cols]))
    df = df.withColumn(''.join(['c_cols_', name]), func(*[col(c_col) for c_col in c_cols]))
  
  return df

def add_kmeans_features(df, g_cols, c_cols, k=2, num_iter=10):
  
  kmeans_g = KMeans(k=k, featuresCol=g_cols, predictionCol='g_col_k_mean', seed=16)
  kmeans_c = KMeans(k=k, featuresCol=c_cols, predictionCol='c_col_k_mean', seed=16)
  
  kmeans_df = kmeans_g.fit(df).transform(df)
  kmeans_df = kmeans_c.fit(kmeans_df).transform(kmeans_df)
  
  return kmeans_df

def feature_engineering(df, num_cluster=2, num_comp=40, num_iter=10):
  
  ## get g-col and c-col
  g_cols = list(filter(lambda v: re.match('g-.+', v), df.columns))
  c_cols = list(filter(lambda v: re.match('c-.+', v), df.columns))
  
  ## PCA
  pca_df = add_pca_features(df, g_cols, c_cols, num_comp)

  ## stats features on g and c cols
  stats_df = add_stats_features(pca_df, g_cols, c_cols)
  
  ## add k-means features
  kmeans_df = add_kmeans_features(stats_df, g_cols='g_normalized', c_cols='c_normalized', k=num_cluster, num_iter=num_iter)
  
  return kmeans_df

In [11]:
## add indicator column to both train and test so we can combine them later
train_df = train_df.withColumn('is_test', lit(0))
test_df = test_df.withColumn('is_test', lit(1))

## Combine train and test df
full_df = train_df.union(test_df)

In [12]:

## encode features
target_cols = ['cp_type', 'cp_dose']
encoded_df = encode_cat_features(full_df, target_cols)

In [13]:
## feature engineering
fe_df = feature_engineering(encoded_df, num_comp=20, num_iter=5)

## select all the feature columns

pca_cols = list(filter(lambda v: re.match('.+_pca', v), fe_df.columns))
stats_cols = list(filter(lambda v: re.match('.+_stats', v), fe_df.columns))
k_means_cols = list(filter(lambda v: re.match('.+_k_mean', v), fe_df.columns))
cat_cols = list(filter(lambda v: re.match('.+_encoded', v), fe_df.columns)) + ['cp_time']

## stack them to a single feature vector
vector_assember_train = VectorAssembler(inputCols=pca_cols + stats_cols + k_means_cols + cat_cols, outputCol='all_features')
fe_df = vector_assember_train.transform(fe_df)

## normalize all the features
normalizer = StandardScaler(withMean=True)
cols = ['all_features']
output_cols = 'features'
fe_df = normalize_features(fe_df, cols, normalizer, output_cols, if_drop=False)

## split train, test df
fe_train = fe_df.filter(fe_df['is_test'] == 0)
final_test = fe_df.filter(fe_df['is_test'] == 1).select(['sig_id', 'features'])

## stack the training target
vector_assember_target = VectorAssembler(inputCols=[c for c in target_df.columns if c not in {'sig_id'}], outputCol='targets')
vectorized_target = vector_assember_target.transform(target_df).select(['sig_id', 'targets'])

## join training target with training features
final_train = fe_train.join(vectorized_target, ['sig_id']).select(['sig_id', 'features', 'targets'])

In [14]:
## train test split
(train, validation) = final_train.randomSplit([0.8, 0.2], 16)

train.cache()
validation.cache()

DataFrame[sig_id: string, features: vector, targets: vector]

In [156]:
class EnsembleBaseClfs:

    def __init__(self, base_clf, num_classes, feature_col, target_col, output_col='prediction'):
        
        self.base_clf = base_clf
        self.num_classes = num_classes
        self.feature_col = feature_col
        self.target_col = target_col
        self.output_col = output_col
        
        self._predict_cols = []
        self._trained_clfs = []

    def fit(self, train):
        
        for c in tqdm(range(num_classes)):
            
            predict_col_name = f'prob_{c}'
            temp_clf = self.base_clf.copy()
            temp_train = train.withColumn('new_target', vector_to_array(self.target_col)[c])
            
            temp_clf.setLabelCol('new_target')
            temp_clf.setFeaturesCol(self.feature_col)
            
            try:
                temp_clf.setProbabilityCol(predict_col_name)
            except e:
                print('classifier cannot output probability')
                temp_clf.setPredictionCol(predict_col_name)
                
            temp_clf = temp_clf.fit(temp_train)
            self._trained_clfs.append(temp_clf)
            self._predict_cols.append(predict_col_name)

        return self

    def predict(self, x_test, to_pandas=False):
        
        curr_df = x_test
        input_cols = x_test.columns
        
        for i, clf in tqdm(enumerate(self._trained_clfs)):
            
            ## select current probability column prob_i
            prob_col = self._predict_cols[i]
            input_cols.append(prob_col)
            curr_df = clf.transform(curr_df)
            curr_df = curr_df.withColumn(prob_col, vector_to_array(prob_col)[1])
            curr_df = curr_df.select(input_cols)
        
        ## transform columns prob_0, ..., prob_c to a vector with name self.output_col
        
        if to_pandas:
            
            return curr_df.toPandas()
        
        else:
            va = VectorAssembler(inputCols=self._predict_cols, outputCol=self.output_col)
            ## transform this vector self.output_col to an array
            output_df = va.transform(curr_df).withColumn(self.output_col, vector_to_array(self.output_col)).select(x_test.columns + [self.output_col])
        
        return output_df
    
    
    def evaluate(self, output_df):
        
        ## transform target vector to array
        output_df = output_df.withColumn(self.target_col, vector_to_array(self.target_col))
        
        def score(df, predictCol, targetsCol):
            import math
            @udf('double')
            def log_loss(y, y_hat):
                r = 0
                cut = 10**(-15)
                for t, p in zip(y, y_hat):
                    p = max(min(p, 1-cut),cut)
                    r += t * math.log(p) + (1 - t) * math.log(1 - p)
                return r/len(y)
            log_loss = df.select(log_loss(targetsCol, predictCol).alias('log_loss'))
            return log_loss.select((-_mean(col('log_loss'))).alias('score'))
        
        ## calcualte score
        return score(output_df, self.output_col, self.target_col)

        

In [157]:
base_clf = LogisticRegression(maxIter=2)
num_classes = 206
feature_col = 'features'
target_col = 'targets'
clf = EnsembleBaseClfs(base_clf, num_classes, feature_col, target_col).fit(train)

100%|████████████████████████████████████████████████████████████████████████████████| 206/206 [07:39<00:00,  2.23s/it]


In [169]:
output_df = clf.predict(validation)
clf.evaluate(output_df)

206it [01:25,  2.41it/s]


DataFrame[score: double]

In [170]:
clf.evaluate(output_df).show()

+-------------------+
|              score|
+-------------------+
|0.01776026006612206|
+-------------------+

