In [100]:
import pyspark
sc = pyspark.SparkContext('local[*]')

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at <ipython-input-1-4bc56731f3c8>:2 

In [101]:
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

### Step 1
- Load the train and test sets
- Check the schema, the variables have their right types?
- If not, how to correctly load the datasets?

In [102]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

customSchema = StructType([StructField("PassengerId", IntegerType(), True),
                           StructField("Survived", DoubleType(), True),
                           StructField("Pclass", IntegerType(), True), 
                           StructField("Name", StringType(), True),
                           StructField("Sex", StringType(), True),
                           StructField("Age", DoubleType(), True),
                           StructField("SibSp", IntegerType(), True),
                           StructField("Parch", IntegerType(), True),
                           StructField("Ticket", StringType(), True),
                           StructField("Fare", DoubleType(), True),
                           StructField("Cabin", StringType(), True),
                           StructField("Embarked", StringType(), True)])

customSchema2 = StructType([StructField("PassengerId", IntegerType(), True),
                           StructField("Pclass", IntegerType(), True), 
                           StructField("Name", StringType(), True),
                           StructField("Sex", StringType(), True),
                           StructField("Age", DoubleType(), True),
                           StructField("SibSp", IntegerType(), True),
                           StructField("Parch", IntegerType(), True),
                           StructField("Ticket", StringType(), True),
                           StructField("Fare", DoubleType(), True),
                           StructField("Cabin", StringType(), True),
                           StructField("Embarked", StringType(), True)])

train = sqlc.read.csv("train.csv", header=True, schema=customSchema)
test = sqlc.read.csv("test.csv", header=True, schema=customSchema2)

### Step 2
- Explore the features of your dataset
- You can use DataFrame's ***describe*** method to get summary statistics
    - hint: ***toPandas*** may be useful to ease the manipulation of small dataframes
- Are there any ***NaN*** values in your dataset?
- If so, define value/values to fill these ***NaN*** values
    - hint: ***na*** property of DataFrames provide several methods of handling NA values

In [103]:
# Calculating summary statistics and turning it into Pandas DF
train_desc = train.describe().toPandas().set_index('summary')
train_desc

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
mean,446.0,0.3838383838383838,2.308641975308642,,,29.69911764705882,0.5230078563411896,0.3815937149270482,260318.54916792738,32.2042079685746,,
stddev,257.3538420152301,0.4865924542648575,0.8360712409770491,,,14.526497332334037,1.1027434322934315,0.8060572211299488,471609.26868834975,49.69342859718089,,
min,1.0,0.0,1.0,"""Andersson, Mr. August Edvard (""""Wennerstrom"""")""",female,0.42,0.0,0.0,110152,0.0,A10,C
max,891.0,1.0,3.0,"van Melkebeke, Mr. Philemon",male,80.0,8.0,6.0,WE/P 5735,512.3292,T,S


In [104]:
train.groupBy('Embarked').count().show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|    null|    2|
|       C|  168|
|       S|  644|
+--------+-----+



In [105]:
# Computing correlations between Survived and some features
print({col:train.stat.corr('Survived',col) 
       for col in ['Pclass','Age','SibSp','Parch','Fare']})

{'Pclass': -0.3384810359610151, 'Age': 0.010539215871285682, 'Fare': 0.2573065223849626, 'Parch': 0.08162940708348339, 'SibSp': -0.0353224988857356}


In [106]:
# Checking which columns have NULL values
print({col:train.where(train[col].isNull()).count() 
       for col in train.columns})

{'Pclass': 0, 'Ticket': 0, 'Fare': 0, 'SibSp': 0, 'Name': 0, 'Parch': 0, 'PassengerId': 0, 'Age': 177, 'Embarked': 2, 'Cabin': 687, 'Sex': 0, 'Survived': 0}


In [107]:
# Taking the mean age from the Pandas DF
ageMean = float(train_desc.loc['mean']['Age'])
print(ageMean)

29.69911764705882


In [108]:
# Filling the Age in both train and test datasets
trainFilled = train.na.fill({'Age': ageMean, 'Embarked': 'S'})
testFilled = test.na.fill({'Age': ageMean, 'Embarked': 'S'})

In [109]:
from pyspark.sql import functions as F
train.groupby('Sex','PClass').agg(F.mean('age')).show()

+------+------+------------------+
|   Sex|PClass|          avg(age)|
+------+------+------------------+
|  male|     3|26.507588932806325|
|female|     3|             21.75|
|female|     1| 34.61176470588235|
|female|     2|28.722972972972972|
|  male|     2| 30.74070707070707|
|  male|     1| 41.28138613861386|
+------+------+------------------+



### Step 3
- How to handle categorical features?
    - hint: check the Estimators and Transformers
- Assemble all desired features into a Vector using the VectorAssembler Transformer
- Make sure to end up with a DataFrame with two columns: ***Survived*** and ***vFeatures***

In [110]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.util import MLUtils

indexer1 = (StringIndexer()
           .setInputCol("Embarked")
           .setOutputCol("nEmbarked")
           .setHandleInvalid('skip'))

indexed1 = indexer1.fit(trainFilled).transform(trainFilled)

In [111]:
indexed1.select('Embarked','nEmbarked').show(20)

+--------+---------+
|Embarked|nEmbarked|
+--------+---------+
|       S|      0.0|
|       C|      1.0|
|       S|      0.0|
|       S|      0.0|
|       S|      0.0|
|       Q|      2.0|
|       S|      0.0|
|       S|      0.0|
|       S|      0.0|
|       C|      1.0|
|       S|      0.0|
|       S|      0.0|
|       S|      0.0|
|       S|      0.0|
|       S|      0.0|
|       S|      0.0|
|       Q|      2.0|
|       S|      0.0|
|       S|      0.0|
|       C|      1.0|
+--------+---------+
only showing top 20 rows



In [112]:
indexer2 = (StringIndexer()
           .setInputCol("Sex")
           .setOutputCol("nSex")
           .setHandleInvalid('skip'))

indexed2 = indexer2.fit(indexed1).transform(indexed1)

In [113]:
encoder1 = OneHotEncoder().setInputCol("nEmbarked").setOutputCol("vEmbarked")
encoded1 = encoder1.transform(indexed2)

In [114]:
encoded1.select('nEmbarked','vEmbarked').show(10)

+---------+-------------+
|nEmbarked|    vEmbarked|
+---------+-------------+
|      0.0|(2,[0],[1.0])|
|      1.0|(2,[1],[1.0])|
|      0.0|(2,[0],[1.0])|
|      0.0|(2,[0],[1.0])|
|      0.0|(2,[0],[1.0])|
|      2.0|    (2,[],[])|
|      0.0|(2,[0],[1.0])|
|      0.0|(2,[0],[1.0])|
|      0.0|(2,[0],[1.0])|
|      1.0|(2,[1],[1.0])|
+---------+-------------+
only showing top 10 rows



In [115]:
encoder2 = OneHotEncoder().setInputCol("nSex").setOutputCol("vSex").setDropLast(False)
encoded2 = encoder2.transform(encoded1)

In [116]:
encoded2.select('Sex', 'nSex','vSex').show(10)

+------+----+-------------+
|   Sex|nSex|         vSex|
+------+----+-------------+
|  male| 0.0|(2,[0],[1.0])|
|female| 1.0|(2,[1],[1.0])|
|female| 1.0|(2,[1],[1.0])|
|female| 1.0|(2,[1],[1.0])|
|  male| 0.0|(2,[0],[1.0])|
|  male| 0.0|(2,[0],[1.0])|
|  male| 0.0|(2,[0],[1.0])|
|  male| 0.0|(2,[0],[1.0])|
|female| 1.0|(2,[1],[1.0])|
|female| 1.0|(2,[1],[1.0])|
+------+----+-------------+
only showing top 10 rows



In [117]:
# Using a VectorAssembler to put together all feature columns
assembler = VectorAssembler(inputCols=['Pclass',
                                       'Age',
                                       'SibSp',
                                       'Parch',
                                       'Fare',
                                       'vSex',
                                       'vEmbarked'], 
                            outputCol='vFeatures')

assembled = assembler.transform(encoded2)

In [118]:
# Keeping only the features and label columns to 
assembled2 = assembled.select("Survived","vFeatures")

In [119]:
assembled2.show(10)

+--------+--------------------+
|Survived|           vFeatures|
+--------+--------------------+
|     0.0|[3.0,22.0,1.0,0.0...|
|     1.0|[1.0,38.0,1.0,0.0...|
|     1.0|[3.0,26.0,0.0,0.0...|
|     1.0|[1.0,35.0,1.0,0.0...|
|     0.0|[3.0,35.0,0.0,0.0...|
|     0.0|(9,[0,1,4,5],[3.0...|
|     0.0|[1.0,54.0,0.0,0.0...|
|     0.0|[3.0,2.0,3.0,1.0,...|
|     1.0|[3.0,27.0,0.0,2.0...|
|     1.0|[2.0,14.0,1.0,0.0...|
+--------+--------------------+
only showing top 10 rows



### Step 4
- Apply a normalization Estimator of your choice to the ***features*** vector obtained in Step 3

In [120]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler().setInputCol("vFeatures").setOutputCol("scaledFeat").setWithStd(True).setWithMean(True)
scalerModel = scaler.fit(assembled2)
scaled = scalerModel.transform(assembled2)

In [121]:
scaled.show(10)

+--------+--------------------+--------------------+
|Survived|           vFeatures|          scaledFeat|
+--------+--------------------+--------------------+
|     0.0|[3.0,22.0,1.0,0.0...|[0.82691281652436...|
|     1.0|[1.0,38.0,1.0,0.0...|[-1.5652278312782...|
|     1.0|[3.0,26.0,0.0,0.0...|[0.82691281652436...|
|     1.0|[1.0,35.0,1.0,0.0...|[-1.5652278312782...|
|     0.0|[3.0,35.0,0.0,0.0...|[0.82691281652436...|
|     0.0|(9,[0,1,4,5],[3.0...|[0.82691281652436...|
|     0.0|[1.0,54.0,0.0,0.0...|[-1.5652278312782...|
|     0.0|[3.0,2.0,3.0,1.0,...|[0.82691281652436...|
|     1.0|[3.0,27.0,0.0,2.0...|[0.82691281652436...|
|     1.0|[2.0,14.0,1.0,0.0...|[-0.3691575073769...|
+--------+--------------------+--------------------+
only showing top 10 rows



### Step 5
- Instead of doing transformations on separate steps, put everything together with a Pipeline

In [122]:
from pyspark.ml.pipeline import Pipeline

pipeline = Pipeline(stages=[indexer1,
                            indexer2,
                            encoder1, 
                            encoder2, 
                            assembler,
                            scaler])

In [123]:
model = pipeline.fit(trainFilled)
scaled = model.transform(trainFilled)

In [124]:
scaled.select('Survived', 'scaledFeat').show(10)

+--------+--------------------+
|Survived|          scaledFeat|
+--------+--------------------+
|     0.0|[0.82691281652436...|
|     1.0|[-1.5652278312782...|
|     1.0|[0.82691281652436...|
|     1.0|[-1.5652278312782...|
|     0.0|[0.82691281652436...|
|     0.0|[0.82691281652436...|
|     0.0|[-1.5652278312782...|
|     0.0|[0.82691281652436...|
|     1.0|[0.82691281652436...|
|     1.0|[-0.3691575073769...|
+--------+--------------------+
only showing top 10 rows



### Step 6
- Train a classifier of your choice (for instance, Random Forest) using your dataset of LabeledPoints
- Make predictions for the training data
- Use the evaluators to find the Area Under ROC and Accuracy of your model
- How is your model performing? Try to tune its parameters

In [125]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Trains a RF classifier and make predictions
rfC = RandomForestClassifier().setLabelCol("Survived") \
                                .setFeaturesCol("scaledFeat") \
                                .setNumTrees(50) \
        .setMaxDepth(10)

model = rfC.fit(scaled)

predictions = model.transform(scaled)

In [126]:
predictions.select('scaledFeat', 'probability', 'prediction', 'Survived').show(10)

+--------------------+--------------------+----------+--------+
|          scaledFeat|         probability|prediction|Survived|
+--------------------+--------------------+----------+--------+
|[0.82691281652436...|[0.88756433014797...|       0.0|     0.0|
|[-1.5652278312782...|[0.00328787878787...|       1.0|     1.0|
|[0.82691281652436...|[0.51233602722778...|       0.0|     1.0|
|[-1.5652278312782...|[0.00379713804713...|       1.0|     1.0|
|[0.82691281652436...|[0.88947540903175...|       0.0|     0.0|
|[0.82691281652436...|[0.88520457183290...|       0.0|     0.0|
|[-1.5652278312782...|[0.82292661054867...|       0.0|     0.0|
|[0.82691281652436...|[0.86234908472870...|       0.0|     0.0|
|[0.82691281652436...|[0.29421977887721...|       1.0|     1.0|
|[-0.3691575073769...|[0.01261111111111...|       1.0|     1.0|
+--------------------+--------------------+----------+--------+
only showing top 10 rows



In [127]:
model.featureImportances

SparseVector(9, {0: 0.1276, 1: 0.1625, 2: 0.0524, 3: 0.0415, 4: 0.1664, 5: 0.203, 6: 0.2058, 7: 0.022, 8: 0.0188})

In [128]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Defines an evaluator based on the metric areaUnderROC
evaluator = BinaryClassificationEvaluator().setLabelCol("Survived") \
                            .setRawPredictionCol("rawPrediction") \
                            .setMetricName("areaUnderROC")

# Evaluate the predictions
roc = evaluator.evaluate(predictions)

print(roc)

0.9627073147349255


In [129]:
ev2 = (MulticlassClassificationEvaluator()
       .setLabelCol('Survived')
       .setPredictionCol('prediction')
       .setMetricName('accuracy'))

acc = ev2.evaluate(predictions)
print(acc)

0.9034792368125701


### Step 7
- Take a look at the test data - use DataFrame's ***createOrReplaceTempView*** method to perform SQL queries over the data
    - hint: check if there are any NULL values in the dataset - if so, handle them
- Apply the transformations to the test data
    - hint: include the model to the pipeline
- Make predictions using the model previously trained and the transformed test data

In [130]:
# Make the test set a "table"
testFilled.createOrReplaceTempView('test')

# Runs a series of SQL queries to get the number of null values in the test set
print({col: sqlc.sql("select * from test where " + col + " is null").count() for col in testFilled.columns})

{'Pclass': 0, 'Ticket': 0, 'Age': 0, 'Sex': 0, 'SibSp': 0, 'PassengerId': 0, 'Cabin': 327, 'Name': 0, 'Fare': 1, 'Parch': 0, 'Embarked': 0}


In [131]:
# So, there is one null Fare, let's check it
sqlc.sql("select * from test where Fare is null").toPandas()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [132]:
# Since the Fare is highly dependent on the class, it makes more sense to use the average for the given class
# But we need to take the average from the TRAINING set
trainFilled.createOrReplaceTempView('train')
avgFare = sqlc.sql("select mean(Fare) from train where Pclass = 3").take(1)[0][0]
print(avgFare)

# Fill the missing value with the calculated average
testFilled = testFilled.na.fill({'Fare': avgFare})

13.675550101832997


In [133]:
pipeline = Pipeline(stages=[indexer1,
                            indexer2,
                            encoder1, 
                            encoder2, 
                            assembler,
                            scaler,
                            rfC])

model = pipeline.fit(trainFilled)

In [134]:
predictions = model.transform(testFilled)

### Step 8

- Load the answers for the ***test*** data
- Combine it with your predictions into a single DataFrame
- Use the evaluator you created on ***Step 6***
- What was your score?

In [135]:
answers = sqlc.read.csv('titanic_answers.csv', header=True)
answers = answers.select('PassengerId',F.col('Survived').cast('Double'))

In [136]:
pred_answer = predictions.join(answers, on='PassengerId')

In [137]:
roc = evaluator.evaluate(pred_answer)
print(roc)

0.8179040895813053


In [138]:
acc = ev2.evaluate(pred_answer)
print(acc)

0.7703349282296651
