In [1]:
from pyspark.sql import SQLContext, SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [2]:
conf = SparkConf()
sc = SparkContext()
sqlcontext = SQLContext(sc)

In [3]:
from pathlib import Path
import os

In [4]:
SURVIVOR_CSV_DATA = 'titanic_result.csv'

In [5]:
# RESULT OF PEOPLE SURVIVED IN TITANIC CLASH
survivor_data_df = sqlcontext.read.csv(SURVIVOR_CSV_DATA, sep=",", header=True)

In [6]:
ACTUAL_TRAVELLER_DATA = 'titanic_data.csv'

In [7]:
# ACTUAL LIST OF PEOPLE SURVIVED IN TITANIC CLASH
actual_data_df = sqlcontext.read.csv(ACTUAL_TRAVELLER_DATA, sep=",", header=True)

In [8]:
# filter survived data
survivor_data = survivor_data_df.filter(survivor_data_df.Survived == '1').rdd
survivor_filtered_df = survivor_data.toDF()

In [9]:
# List of survived passengers
survivor_flag = survivor_data.collectAsMap()

In [10]:
def survior_data_filter(row):
    if row['PassengerId'] in survivor_flag:
        return row

In [11]:
filtered_data = actual_data_df.rdd.map(survior_data_filter)

In [14]:
records = filtered_data.take(10)
for record in records:
    if record:
        print(record)

Row(PassengerId='893', Pclass='3', Name='Wilkes, Mrs. James (Ellen Needs)', Sex='female', Age='47', SibSp='1', Parch='0', Ticket='363272', Fare='7', Cabin=None, Embarked='S')
Row(PassengerId='896', Pclass='3', Name='Hirvonen, Mrs. Alexander (Helga E Lindqvist)', Sex='female', Age='22', SibSp='1', Parch='1', Ticket='3101298', Fare='12.2875', Cabin=None, Embarked='S')
Row(PassengerId='900', Pclass='3', Name='Abrahim, Mrs. Joseph (Sophie Halaut Easu)', Sex='female', Age='18', SibSp='0', Parch='0', Ticket='2657', Fare='7.2292', Cabin=None, Embarked='C')


In [13]:
survivor_data_1 = survivor_data.map(lambda x: x[0])
filtered_data_1 = actual_data_df.rdd.map(lambda x: (x[0], x)).join(survivor_data)

In [15]:
records = filtered_data_1.take(10)
for record in records:
    print(record)

('900', (Row(PassengerId='900', Pclass='3', Name='Abrahim, Mrs. Joseph (Sophie Halaut Easu)', Sex='female', Age='18', SibSp='0', Parch='0', Ticket='2657', Fare='7.2292', Cabin=None, Embarked='C'), '1'))
('912', (Row(PassengerId='912', Pclass='1', Name='Rothschild, Mr. Martin', Sex='male', Age='55', SibSp='1', Parch='0', Ticket='PC 17603', Fare='59.4', Cabin=None, Embarked='C'), '1'))
('914', (Row(PassengerId='914', Pclass='1', Name='Flegenheim, Mrs. Alfred (Antoinette)', Sex='female', Age=None, SibSp='0', Parch='0', Ticket='PC 17598', Fare='31.6833', Cabin=None, Embarked='S'), '1'))
('915', (Row(PassengerId='915', Pclass='1', Name='Williams, Mr. Richard Norris II', Sex='male', Age='21', SibSp='0', Parch='1', Ticket='PC 17597', Fare='61.3792', Cabin=None, Embarked='C'), '1'))
('924', (Row(PassengerId='924', Pclass='3', Name='Dean, Mrs. Bertram (Eva Georgetta Light)', Sex='female', Age='33', SibSp='1', Parch='2', Ticket='C.A. 2315', Fare='20.575', Cabin=None, Embarked='S'), '1'))
('925',

In [16]:
final_data = filtered_data_1.map(lambda x : x[1][0])

In [None]:
# final_data_df =  final_data.toDF()

In [None]:
final_data_df.printSchema()

In [17]:
db_schema = StructType([
    StructField('PassengerId', StringType(), True),
    StructField('Pclass', StringType(), True),
    StructField('Name', StringType(), True),
    StructField('Sex', StringType(), True),
    StructField('Age', StringType(), True),
    StructField('SibSp', StringType(), True),
    StructField('Parch', StringType(), True),
    StructField('Ticket', StringType(), True),
    StructField('Fare', StringType(), True),
    StructField('Cabin', StringType(), True),
    StructField('Embarked', StringType(), True),
])

In [18]:
final_data_df = sqlcontext.createDataFrame(final_data, db_schema)
final_data_df.coalesce(1).write.csv('survivor_data_complete', sep=",", header=True)

In [None]:
# Alernate to save the data as csv
'''
import csv

records = final_data.collect()   
with open('final_data.csv', 'w') as csv_file:
    field_names = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
    writer = csv.DictWriter(csv_file, fieldnames=field_names, delimiter=',')
    writer.writeheader()
    for row in records:
        row_data = {}
        for name in field_names:
            row_data[name] = row[name]
        writer.writerow(row_data)'''