In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler, VectorSlicer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

In [0]:
# Load the feature table and the scored-target table from DBFS.
# Both CSVs have a header row; column types are inferred by Spark.
train_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/train_features.csv')
)
target_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/train_targets_scored.csv')
)

In [0]:
# Mark both input DataFrames as persisted so later cells can reuse them.
# NOTE(review): persist() is lazy in Spark -- presumably the data is only
# materialised by the first action that touches each frame; confirm if timing matters.
train_df.persist()
target_df.persist()

In [0]:
# Print the column names and data types of the feature table.
train_df.printSchema()

In [0]:
# Print the column names and data types of the target table.
target_df.printSchema()

In [0]:
def preproccess_features(df, cols, normalizer, excluded_col):
  """Encode categorical columns, scale numeric columns and assemble a feature vector.

  Args:
    df: input Spark DataFrame.
    cols: dict with two keys:
      'cat_features'  -- column names to string-index and then one-hot encode;
      'norm_features' -- column names to scale, each one independently.
    normalizer: estimator used as a template. It is copied once per column in
      'norm_features'; each copy gets that column's input/output column names.
    excluded_col: list of column names that are carried through unchanged and
      kept out of the assembled feature vector.

  Returns:
    A DataFrame holding the `excluded_col` columns plus a single 'features'
    vector column built from every other column left after preprocessing.
  """
  categorical = cols['cat_features']
  numeric = cols['norm_features']

  # Derived column names for every intermediate stage.
  indexed_cols = [name + '_indexed' for name in categorical]
  encoded_cols = [name + '_encoded' for name in categorical]
  vectorized_cols = [name + '_vectorized' for name in numeric]
  normalized_cols = [name + '_normalized' for name in numeric]

  indexers = [
      StringIndexer(inputCol=source, outputCol=target)
      for source, target in zip(categorical, indexed_cols)
  ]
  # Each numeric column is wrapped in its own one-element vector column, which
  # is what the per-column normalizer copy below is pointed at.
  wrappers = [
      VectorAssembler(inputCols=[source], outputCol=target)
      for source, target in zip(numeric, vectorized_cols)
  ]

  scalers = []
  for source, target in zip(vectorized_cols, normalized_cols):
    scaler = normalizer.copy()
    scaler.setInputCol(source)
    scaler.setOutputCol(target)
    scalers.append(scaler)

  one_hot = OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols)

  # Stage order: index -> wrap -> scale -> one-hot encode.
  stages = indexers + wrappers + scalers + [one_hot]
  fitted_pipeline = Pipeline(stages=stages).fit(df)
  transformed = fitted_pipeline.transform(df)

  # Drop raw and intermediate columns; only the encoded / normalized outputs
  # (and untouched columns) remain.
  obsolete = indexed_cols + categorical + vectorized_cols + numeric
  transformed = transformed.drop(*obsolete)

  feature_inputs = [c for c in transformed.columns if c not in excluded_col]
  assembler = VectorAssembler(inputCols=feature_inputs, outputCol='features')

  return assembler.transform(transformed).select(excluded_col + ['features'])

In [0]:
## Preprocess the training features: cp_type / cp_dose are treated as
## categorical, cp_time is scaled with StandardScaler(withMean=True).
target_cols = {
    'cat_features': ['cp_type', 'cp_dose'],
    'norm_features': ['cp_time'],
}
normalizer = StandardScaler(withMean=True)
excluded_col = ['sig_id']
cleaned_train = preproccess_features(
    df=train_df,
    cols=target_cols,
    normalizer=normalizer,
    excluded_col=excluded_col,
)

## Pack every target column (all columns except sig_id) into one vector column.
vectorizer = VectorAssembler(
    inputCols=[name for name in target_df.columns if name != 'sig_id'],
    outputCol='vectorized_target',
)
vectorized_target = vectorizer.transform(target_df).select('sig_id', 'vectorized_target')

## Join features and targets on sig_id.
final_train = cleaned_train.join(vectorized_target.alias('target'), on=['sig_id'])

In [0]:
## Split into train (weight 0.8) and validation (weight 0.2) with seed 16.
train, validation = final_train.randomSplit(weights=[0.8, 0.2], seed=16)

In [0]:
class EnsembleBaseClfs:
    """Fit one independent copy of a base classifier per target class.

    The target is expected in a single vector column; for class index ``c`` the
    ``c``-th element of that vector is extracted into a scalar label column
    named 'new_target' and a fresh copy of ``base_clf`` is fitted on it.

    Args:
        base_clf: estimator used as a template. It must provide ``copy()``,
            ``setLabelCol()``, ``setFeaturesCol()`` and ``fit()``.
        num_classes: number of leading elements of the target vector to train
            a classifier for.
        feature_col: name of the feature vector column.
        target_col: name of the target vector column.
    """

    def __init__(self, base_clf, num_classes, feature_col, target_col):
        self.base_clf = base_clf
        self._trained_clfs = []
        self.num_classes = num_classes
        self.feature_col = feature_col
        self.target_col = target_col

    def fit(self, train):
        """Train ``num_classes`` models on ``train`` and store them on the instance.

        Any models from a previous ``fit`` call are replaced, so calling ``fit``
        twice does not accumulate duplicates.

        Args:
            train: DataFrame containing ``feature_col`` and ``target_col``.

        Returns:
            self, to allow ``EnsembleBaseClfs(...).fit(train)`` chaining.
        """
        fitted_models = []

        # Use the instance's own class count rather than a notebook-level
        # global, so the class works regardless of cell execution order.
        for c in range(self.num_classes):
            print(c)

            estimator = self.base_clf.copy()
            estimator.setLabelCol('new_target')
            estimator.setFeaturesCol(self.feature_col)

            # Pull the c-th element of the target vector out as a scalar label.
            labelled = train.withColumn('new_target', vector_to_array(self.target_col)[c])

            fitted_models.append(estimator.fit(labelled))

        # Swap in the new models only after every class trained successfully.
        self._trained_clfs = fitted_models
        return self

    def predict(self, x_test):
        """Not implemented yet.

        Raises:
            NotImplementedError: always. The prediction output format has not
                been decided; failing loudly keeps the class definition valid
                and prevents callers from silently receiving ``None``.
        """
        ## TODO
        raise NotImplementedError('EnsembleBaseClfs.predict is not implemented yet')

In [0]:
## Example: logistic regression as the base classifier, trained on the
## first 5 target classes only.

base_clf = LogisticRegression(maxIter=2)
num_classes = 5
feature_col = 'features'
target_col = 'vectorized_target'
clf = EnsembleBaseClfs(
    base_clf=base_clf,
    num_classes=num_classes,
    feature_col=feature_col,
    target_col=target_col,
).fit(train)