In [1]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, OneHotEncoderEstimator, StringIndexer
from pyspark.sql import SparkSession, DataFrame, functions as F
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline, Transformer, PipelineModel
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable, DefaultParamsReader 
from pyspark.ml.param.shared import HasOutputCols, Param, Params, TypeConverters
from pyspark import keyword_only

In [2]:
# Register the storage-account access key in the Spark session configuration so
# that the wasbs:// paths read in the cells below can be accessed.
# NOTE(review): both <...> values look like placeholders that must be replaced
# before running; avoid committing a real account key to the notebook.
spark.conf.set(
  key="fs.azure.account.key.<storage-account-name>.blob.core.windows.net",
  value="<account-key>",
)

In [3]:
# Read the raw CSV (first row is the header, column types are inferred) ...
data = spark.read.csv(
  "wasbs://data@sparkmltrainig.blob.core.windows.net/train.csv",
  header=True,
  inferSchema=True,
)

# ... and split it 90% / 10% into train and test with a fixed seed.
train_df, test_df = data.randomSplit(weights=[0.9, 0.1], seed=42)

In [4]:
# Show, for each split, how many null values every column contains.
for split_df in (train_df, test_df):
  # `when` yields the (non-null) literal column name only for rows where the
  # column is null, so `count` counts exactly the null rows.
  null_counts = [F.count(F.when(F.isnull(c), c)).alias(c) for c in split_df.columns]
  split_df.select(null_counts).show()

In [5]:
# first way of data engineering - plain functions
def evaluate_initials(df: DataFrame) -> DataFrame:
  """Add an 'Initial' column holding the title extracted from 'Name'.

  The title is the first run of ASCII letters that is directly followed by a
  dot. Less common titles are then replaced according to the mapping below;
  titles that are not keys of the mapping are left unchanged.

  Args:
    df: DataFrame with a 'Name' column.

  Returns:
    A new DataFrame with the additional 'Initial' column.
  """
  title_replacements = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
    'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
  }
  # Raw string: '\.' inside a normal string literal is an invalid escape sequence.
  _df = df.withColumn('Initial', F.regexp_extract(df['Name'], r'([A-Za-z]+)\.', 1))
  # With a dict as `to_replace` the dict values are the replacements, so no
  # separate `value` argument is passed.
  _df = _df.replace(title_replacements, subset='Initial')
  return _df
  
def handle_missing_age(df: DataFrame) -> DataFrame:
  """Fill null 'Age' values with a fixed age chosen by the row's 'Initial'.

  Rows whose 'Age' is already set, or whose 'Initial' is none of the titles
  listed below, keep their current 'Age' value.

  Args:
    df: DataFrame with 'Age' and 'Initial' columns.

  Returns:
    A new DataFrame with the 'Age' column replaced by the filled version.
  """
  age_missing = F.isnull(df['Age'])
  initial = df['Initial']
  # One flat CASE WHEN chain instead of nesting a new `when` in every `otherwise`.
  filled_age = (
    F.when(age_missing & (initial == 'Mr'), 33)
     .when(age_missing & (initial == 'Mrs'), 36)
     .when(age_missing & (initial == 'Master'), 5)
     .when(age_missing & (initial == 'Miss'), 22)
     .when(age_missing & (initial == 'Other'), 46)
     .otherwise(df['Age'])
  )
  return df.withColumn('Age', filled_age)

def create_family_size(df: DataFrame) -> DataFrame:
  """Return ``df`` with a 'FamilySize' column equal to 'Parch' + 'SibSp' + 1."""
  family_size = df['Parch'] + df['SibSp'] + 1
  return df.withColumn('FamilySize', family_size)

def create_is_alone(df: DataFrame) -> DataFrame:
  """Return ``df`` with an 'IsAlone' flag: 0 when 'FamilySize' > 1, otherwise 1."""
  has_company = df['FamilySize'] > 1
  alone_flag = F.when(has_company, 0).otherwise(1)
  return df.withColumn('IsAlone', alone_flag)

def create_fare_per_person(df: DataFrame) -> DataFrame:
  """Return ``df`` with a 'FarePerPerson' column: 'Fare' divided by 'FamilySize'."""
  fare_share = df['Fare'] / df['FamilySize']
  return df.withColumn('FarePerPerson', fare_share)

def drop_rows_with_null(df: DataFrame, col) -> DataFrame:
  """Return only the rows of ``df`` whose value in column ``col`` is not null."""
  has_value = df[col].isNotNull()
  return df.filter(has_value)

def change_to_index(df: DataFrame, col) -> DataFrame:
  """Add a '<col>_indexed' column produced by a StringIndexer fitted on ``df``."""
  indexed_name = '{0}_indexed'.format(col)
  fitted_indexer = StringIndexer(inputCol=col, outputCol=indexed_name).fit(df)
  return fitted_indexer.transform(df)

def change_to_one_hot_encoded(df: DataFrame, cols) -> DataFrame:
  """Index every column in ``cols`` and one-hot encode the resulting indices.

  For each name in ``cols`` the result gains a '<name>_indexed' and a
  '<name>_encoded' column; indexer and encoder are both fitted on ``df``.
  """
  indexed_df = df
  for name in cols:
    indexed_df = change_to_index(indexed_df, name)
  indexed_names = list(map('{0}_indexed'.format, cols))
  encoded_names = list(map('{0}_encoded'.format, cols))
  fitted_encoder = OneHotEncoderEstimator(
    inputCols=indexed_names,
    outputCols=encoded_names,
  ).fit(indexed_df)
  return fitted_encoder.transform(indexed_df)

In [6]:
# data engineering with method 1
# Derived columns are computed row by row, so the same steps are applied to both splits.
for feature_step in (create_family_size, create_is_alone, create_fare_per_person,
                     evaluate_initials, handle_missing_age):
  train_df = feature_step(train_df)
  test_df = feature_step(test_df)
train_df = drop_rows_with_null(train_df, 'Embarked')

# The categorical encoding is fitted ONCE on the training data and then applied
# to both splits. Fitting separate indexers/encoders on the test split can
# assign different indices (and different one-hot vector sizes) to the same
# category, so the trained model would read misaligned features.
# handleInvalid='keep' sends null or unseen test categories to an extra bucket
# instead of failing the transform.
categorical_cols = ['Sex', 'Initial', 'Embarked']
index_stages = [
  StringIndexer(inputCol=c, outputCol='{0}_indexed'.format(c), handleInvalid='keep')
  for c in categorical_cols
]
encode_stage = OneHotEncoderEstimator(
  inputCols=['{0}_indexed'.format(c) for c in categorical_cols],
  outputCols=['{0}_encoded'.format(c) for c in categorical_cols],
)
encoding_model = Pipeline(stages=index_stages + [encode_stage]).fit(train_df)
train_df = encoding_model.transform(train_df)
test_df = encoding_model.transform(test_df)

cols_to_drop=['PassengerId', 'Cabin', 'Ticket', 'Name', 'Sex', 'Initial', 'Embarked']
train_df = train_df.drop(*cols_to_drop)
test_df = test_df.drop(*cols_to_drop)

In [7]:
# learning the model
# Every column except the label 'Survived' is assembled into the feature vector.
cols = list(filter(lambda name: name != 'Survived', train_df.columns))
vec_assembler = VectorAssembler(outputCol="features", inputCols=cols)
vec_train_DF = vec_assembler.transform(train_df)
lr = LinearRegression(labelCol="Survived", featuresCol="features")
lr_model = lr.fit(vec_train_DF)

In [8]:
# applying model to test data
# Assemble the test features with the same assembler, then predict.
vec_test_df = vec_assembler.transform(dataset=test_df)
pred_df = lr_model.transform(dataset=vec_test_df)

In [9]:
# evaluating the model
# Root-mean-squared error between the 'prediction' and 'Survived' columns.
regression_evaluator = RegressionEvaluator(
  metricName="rmse",
  labelCol="Survived",
  predictionCol="prediction",
)

rmse = regression_evaluator.evaluate(dataset=pred_df)
print(f"RMSE is {rmse}")

In [10]:
# second way of data engineering - custom Transformers
# hint: transform method should contain the same code as in the analogous method above

class InitialsTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that adds the 'Initial' (title) column derived from 'Name'.

  Applies the same logic as ``evaluate_initials``. The stage has no parameters;
  DefaultParamsReadable / DefaultParamsWritable make it saveable as part of a
  pipeline. (DefaultParamsReader is a reader class, not a mixin, so it is not
  used as a base here.)
  See https://akisiel1.github.io/introSparkML/#create-custom-pipeline-stage
  """

  def _transform(self, dataset: DataFrame) -> DataFrame:
    """Return ``dataset`` with the additional 'Initial' column."""
    title_replacements = {
      'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
      'Dr': 'Mr', 'Major': 'Mr',
      'Lady': 'Mrs', 'Countess': 'Mrs',
      'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
      'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    }
    # The title is the first run of letters directly followed by a dot.
    with_initial = dataset.withColumn(
      'Initial', F.regexp_extract(dataset['Name'], r'([A-Za-z]+)\.', 1))
    return with_initial.replace(title_replacements, subset='Initial')

class MissingAgeTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that fills null 'Age' values based on the 'Initial' column.

  Applies the same logic as ``handle_missing_age``; it therefore expects the
  'Initial' column to exist already.
  See https://akisiel1.github.io/introSparkML/#create-custom-pipeline-stage
  """

  def _transform(self, dataset: DataFrame) -> DataFrame:
    """Return ``dataset`` with null ages replaced by a fixed age per title."""
    age_missing = F.isnull(dataset['Age'])
    initial = dataset['Initial']
    filled_age = (
      F.when(age_missing & (initial == 'Mr'), 33)
       .when(age_missing & (initial == 'Mrs'), 36)
       .when(age_missing & (initial == 'Master'), 5)
       .when(age_missing & (initial == 'Miss'), 22)
       .when(age_missing & (initial == 'Other'), 46)
       .otherwise(dataset['Age'])  # age present, or title not listed: keep as is
    )
    return dataset.withColumn('Age', filled_age)
  
class FamilySizeTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that adds 'FamilySize' = 'Parch' + 'SibSp' + 1.

  Applies the same logic as ``create_family_size``.
  See https://akisiel1.github.io/introSparkML/#create-custom-pipeline-stage
  """

  def _transform(self, dataset: DataFrame) -> DataFrame:
    """Return ``dataset`` with the additional 'FamilySize' column."""
    return dataset.withColumn('FamilySize', dataset['Parch'] + dataset['SibSp'] + 1)
  
class IsAloneTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that adds the 'IsAlone' flag derived from 'FamilySize'.

  Applies the same logic as ``create_is_alone``: 0 when 'FamilySize' > 1,
  otherwise 1.
  See https://akisiel1.github.io/introSparkML/#create-custom-pipeline-stage
  """

  def _transform(self, dataset: DataFrame) -> DataFrame:
    """Return ``dataset`` with the additional 'IsAlone' column."""
    return dataset.withColumn('IsAlone', F.when(dataset['FamilySize'] > 1, 0).otherwise(1))
  
class FarePerPersonTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that adds 'FarePerPerson' = 'Fare' / 'FamilySize'.

  Applies the same logic as ``create_fare_per_person``.
  See https://akisiel1.github.io/introSparkML/#create-custom-pipeline-stage
  """

  def _transform(self, dataset: DataFrame) -> DataFrame:
    """Return ``dataset`` with the additional 'FarePerPerson' column."""
    return dataset.withColumn('FarePerPerson', dataset['Fare'] / dataset['FamilySize'])
  
class DropRowsWithNullTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that removes rows whose value in a configured column is null.

  Applies the same logic as ``drop_rows_with_null``. The column name is held in
  the ``col`` Param so that it is written and restored together with the stage.
  The constructor accepts keyword arguments only, e.g.
  ``DropRowsWithNullTransformer(col='Embarked')``.
  See https://akisiel1.github.io/introSparkML/#create-custom-pipeline-stage
  """

  col = Param(Params._dummy(), "col", "name of the column that must not be null",
              typeConverter=TypeConverters.toString)

  @keyword_only
  def __init__(self, col=None):
    super(DropRowsWithNullTransformer, self).__init__()
    # keyword_only stores the explicitly passed keyword arguments in _input_kwargs;
    # a no-argument call (as done when the stage is loaded) sets nothing.
    self._set(**self._input_kwargs)

  def _transform(self, dataset: DataFrame) -> DataFrame:
    """Return the rows of ``dataset`` whose ``col`` value is not null."""
    col = self.getOrDefault(self.col)
    return dataset.filter(dataset[col].isNotNull())
  
class CategoryToIndexTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that adds a '<col>_indexed' column via StringIndexer.

  Applies the same logic as ``change_to_index``. The column name is held in the
  ``col`` Param so that it is written and restored together with the stage.
  The constructor accepts keyword arguments only, e.g.
  ``CategoryToIndexTransformer(col='Sex')``.
  See https://akisiel1.github.io/introSparkML/#create-custom-pipeline-stage
  """

  col = Param(Params._dummy(), "col", "name of the categorical column to index",
              typeConverter=TypeConverters.toString)

  @keyword_only
  def __init__(self, col=None):
    super(CategoryToIndexTransformer, self).__init__()
    # keyword_only stores the explicitly passed keyword arguments in _input_kwargs.
    self._set(**self._input_kwargs)

  def _transform(self, dataset: DataFrame) -> DataFrame:
    """Return ``dataset`` with the additional '<col>_indexed' column."""
    col = self.getOrDefault(self.col)
    # NOTE(review): the indexer is fitted on whatever dataset is being
    # transformed, so train and test data may receive different index mappings.
    # Consider using StringIndexer directly as a pipeline stage instead.
    indexer = StringIndexer(inputCol=col, outputCol='{0}_indexed'.format(col))
    return indexer.fit(dataset).transform(dataset)
  
class CategoryToOneHotEncodedTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that adds '<col>_indexed' and '<col>_encoded' columns.

  Applies the same logic as ``change_to_one_hot_encoded``. A column whose
  '<col>_indexed' version already exists in the dataset is not indexed again.
  The column names are held in the ``cols`` Param so that they are written and
  restored together with the stage. The constructor accepts keyword arguments
  only, e.g. ``CategoryToOneHotEncodedTransformer(cols=['Sex', 'Embarked'])``.
  See https://akisiel1.github.io/introSparkML/#create-custom-pipeline-stage
  """

  cols = Param(Params._dummy(), "cols", "names of the categorical columns to encode",
               typeConverter=TypeConverters.toListString)

  @keyword_only
  def __init__(self, cols=None):
    super(CategoryToOneHotEncodedTransformer, self).__init__()
    # keyword_only stores the explicitly passed keyword arguments in _input_kwargs.
    self._set(**self._input_kwargs)

  def _transform(self, dataset: DataFrame) -> DataFrame:
    """Return ``dataset`` with indexed and one-hot encoded versions of ``cols``."""
    cols = self.getOrDefault(self.cols)
    # NOTE(review): indexers and encoder are fitted on whatever dataset is being
    # transformed, so train and test data may be encoded differently.
    indexed_df = dataset
    for name in cols:
      indexed_name = '{0}_indexed'.format(name)
      if indexed_name not in indexed_df.columns:
        indexer = StringIndexer(inputCol=name, outputCol=indexed_name)
        indexed_df = indexer.fit(indexed_df).transform(indexed_df)
    encoder = OneHotEncoderEstimator(
      inputCols=['{0}_indexed'.format(name) for name in cols],
      outputCols=['{0}_encoded'.format(name) for name in cols],
    )
    return encoder.fit(indexed_df).transform(indexed_df)
  
class DropStringColsTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that drops the configured columns from the dataset.

  Mirrors the ``drop(*cols_to_drop)`` step of the function-based approach. The
  column names are held in the ``cols`` Param so that they are written and
  restored together with the stage. The constructor accepts keyword arguments
  only, e.g. ``DropStringColsTransformer(cols=['Name', 'Ticket'])``.
  See https://akisiel1.github.io/introSparkML/#create-custom-pipeline-stage
  """

  cols = Param(Params._dummy(), "cols", "names of the columns to drop",
               typeConverter=TypeConverters.toListString)

  @keyword_only
  def __init__(self, cols=None):
    super(DropStringColsTransformer, self).__init__()
    # keyword_only stores the explicitly passed keyword arguments in _input_kwargs.
    self._set(**self._input_kwargs)

  def _transform(self, dataset: DataFrame) -> DataFrame:
    """Return ``dataset`` without the columns listed in ``cols``."""
    return dataset.drop(*self.getOrDefault(self.cols))

In [11]:
# Re-read the raw CSV and redo the 90% / 10% split with the same seed, so the
# pipeline-based approach starts again from unprocessed train/test frames.
data = spark.read.csv(
  "wasbs://data@sparkmltrainig.blob.core.windows.net/train.csv",
  header=True,
  inferSchema=True,
)

train_df, test_df = data.randomSplit(weights=[0.9, 0.1], seed=42)

In [12]:
# TODO: create the transformer objects and add them to `stages`
stages = []

# hint: input columns for the VectorAssembler stage
cols = [
  'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
  'FamilySize', 'IsAlone', 'FarePerPerson',
  'Sex_indexed', 'Initial_indexed', 'Embarked_indexed',
  'Sex_encoded', 'Initial_encoded', 'Embarked_encoded',
]

# NOTE(review): per the PySpark docs a Pipeline without stages acts as an
# identity transformer, so until `stages` is filled in no 'prediction' column
# is produced for the evaluation cell below.
pipeline = Pipeline().setStages(stages)

pipelineModel = pipeline.fit(dataset=train_df)

In [13]:
# Persist the fitted pipeline, replacing anything already stored at that path.
pipelinePath = "/tmp/lr_pipeline_model"
pipeline_writer = pipelineModel.write().overwrite()
pipeline_writer.save(pipelinePath)

In [14]:
# Load the persisted pipeline back and run it on the held-out test data.
savedPipelineModel = PipelineModel.load(path=pipelinePath)
pred_df = savedPipelineModel.transform(dataset=test_df)

In [15]:
# Root-mean-squared error between the 'prediction' and 'Survived' columns.
regression_evaluator = RegressionEvaluator(
  metricName="rmse",
  labelCol="Survived",
  predictionCol="prediction",
)

rmse = regression_evaluator.evaluate(dataset=pred_df)
print(f"RMSE is {rmse}")

In [16]:
# TODO: check whether the RMSE is the same for both approaches