In [1]:
import findspark
findspark.init('/home/spark/spark-2.1.0-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("DfLogRig").getOrCreate()

In [3]:
df = spark.read.csv('titanic.csv',inferSchema = True, header = True)

In [4]:
df.describe()

DataFrame[summary: string, PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string]

In [5]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [6]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [10]:
my_cols = df.select([
 'Survived',
 'Pclass','Sex',
 'Age',
 'SibSp',
 'Parch','Fare',
 'Embarked'])

In [23]:
my_final_data = my_cols.na.drop()
my_final_data.describe()

DataFrame[summary: string, Survived: string, Pclass: string, Sex: string, Age: string, SibSp: string, Parch: string, Fare: string, Embarked: string]

In [14]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

In [29]:
embark_indexer =  StringIndexer(inputCol = 'Sex',outputCol = 'SexIndex')
embark_encoder = OneHotEncoder(inputCol = 'SexIndex',outputCol ='SexVec')

In [30]:
gender_indexer =  StringIndexer(inputCol = 'Embarked',outputCol = 'EmbarkIndex')
gender_encoder = OneHotEncoder(inputCol = 'EmbarkIndex',outputCol ='EmbarkVec')

In [31]:
assembler =  VectorAssembler(inputCols = ['Pclass','SexVec','EmbarkVec','Age',
 'SibSp',
 'Parch','Fare'],outputCol = 'features')

In [32]:
from pyspark.ml.classification import LogisticRegression

In [33]:
from pyspark.ml import Pipeline

In [34]:
log_reg_titanic = LogisticRegression(featuresCol = 'features',labelCol='Survived')

In [35]:
pipeline = Pipeline(stages = [embark_indexer,embark_encoder,gender_indexer,gender_encoder,assembler,log_reg_titanic])

In [36]:
train_data,test_data  = my_final_data.randomSplit([0.7,0.3])

In [38]:
fit_model = pipeline.fit(train_data)

In [39]:
result = fit_model.transform(test_data)

In [40]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [44]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol = 'prediction' ,labelCol='Survived' )

In [45]:
result.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [None]:
AUC = my_eval.evaluate