### create spark session connecting to standalone cluster

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("spark://master0.ddoc.os.fyre.ibm.com:32701") \
    .appName("demo") \
    .config('spark.sql.codegen.wholeStage', 'false') \
    .getOrCreate()

### import spark sql and ml for data preparation

In [12]:
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer

### concat dataset

In [6]:
import os

parts = ["header"]
for dirpath, dirs, files in os.walk("/pfs/titanic"):
  for file in files:
    if file.startswith('part'):
      parts.append(file)

concat = ''.join([open(os.path.join(dirpath, f)).read() for f in parts])
f = open('/mnt/pfs/data.csv', 'w')
f.write(concat)
f.close()

part-1.csv
/pfs/titanic
[]
part-2.csv
/pfs/titanic
[]


### read in dataset

In [8]:
df = spark.read.csv('/mnt/pfs/data.csv', header=True)

In [13]:
df = df.select(col("Survived").cast("int"),col("PassengerId").cast("int"),col("Name"),col("Parch").cast("int"),col("Sex"),col("Embarked"),col("Pclass").cast("int"),col("Age").cast("double"),col("SibSp").cast("int"),col("Fare").cast("double"))

### handle rows with null value

Look up Name for clue of age to be assigned

In [64]:
df = df.withColumn("Initial",regexp_extract(col("Name"),"([A-Za-z]+)\.",1))

In [65]:
df = df.replace(['Mlle','Mme', 'Ms', 'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
               ['Miss','Miss','Miss','Mr','Mr',  'Mrs',  'Mrs',  'Other',  'Other','Other','Mr','Mr','Mr'])

In [66]:
mean_ages = df.groupby('Initial').avg('Age')

In [67]:
def get_mean_age(x):
    return round(mean_ages.filter(mean_ages.Initial == x).select('avg(Age)').collect()[0][0])

Now impute the age by initial

In [68]:
df = df.withColumn("Age",when(df["Age"].isNull(), get_mean_age(df["Initial"])).otherwise(df["Age"]))

### Embarked null value, impute with the majority value of 'S'

In [69]:
df = df.na.fill({"Embarked" : 'S'})

### create Family_size and Alone for more analyze

In [70]:
df = df.withColumn("Family_Size",col('SibSp')+col('Parch'))

In [71]:
df = df.withColumn('Alone',lit(0))

In [72]:
df = df.withColumn("Alone",when(df["Family_Size"] == 0, 1).otherwise(df["Alone"]))

### transform, encoding

In [73]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df) for column in ["Sex","Embarked","Initial"]]
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)

### drop columns not used for features

In [74]:
df = df.drop("PassengerId","Name","Embarked","Sex","Initial")

### create features vector

In [75]:
feature = VectorAssembler(inputCols=df.columns[1:],outputCol="features")
feature_vector= feature.transform(df)

### split data into train and test (80/20)

In [76]:
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2],seed = 11)

### write data back to disk

In [77]:
trainingData.write.format("parquet").mode("overwrite").save("/mnt/pfs/train.parquet")

In [78]:
testData.write.format("parquet").mode("overwrite").save("/mnt/pfs/test.parquet")

### read in data

In [79]:
trainingData = spark.read.parquet("/mnt/pfs/train.parquet")

### run training GBT (Gradient-boosted tree classifier)

In [81]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="Survived", featuresCol="features",maxIter=10, seed=20, maxDepth=4)
gbt_model = gbt.fit(trainingData)

In [82]:
gbt_model.write().overwrite().save("/mnt/pfs/gbt")

### run scoring

### read in model and data

In [83]:
from pyspark.ml.classification import GBTClassificationModel
gbt_model = GBTClassificationModel.load("/mnt/pfs/gbt")
testData = spark.read.parquet("/mnt/pfs/test.parquet")

In [84]:
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")
gbt_prediction = gbt_model.transform(testData)
gbt_accuracy = evaluator.evaluate(gbt_prediction)
print("Accuracy of Gradient-boosted tree classifie is = %g"% (gbt_accuracy))
print("Test Error of Gradient-boosted tree classifie %g"% (1.0 - gbt_accuracy))

Accuracy of Gradient-boosted tree classifie is = 0.802326
Test Error of Gradient-boosted tree classifie 0.197674


In [85]:
spark.stop()