In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Initialize Spark Session
spark = SparkSession.builder.appName("Titanic_MLlib").getOrCreate()

# Everything after session creation runs under try/finally so the session is
# released even when loading, fitting, or evaluation raises.
try:
    # Load the dataset: the first row supplies column names and column types
    # are inferred from the file contents.
    data = spark.read.csv("titanic.csv", header=True, inferSchema=True)

    # Data preprocessing: keep the label ("Survived") and the six predictors
    # first, then drop incomplete rows. Selecting before dropna() means nulls
    # in columns that were not selected do not discard rows.
    data = data.select("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare")
    data = data.dropna()

    # Convert categorical column "Sex" to a numeric index column "SexIndex",
    # then drop the original string column, which VectorAssembler cannot use.
    indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
    data = indexer.fit(data).transform(data)
    data = data.drop("Sex")

    # Assemble the predictors into the single vector column "features" and
    # keep only the label and that vector.
    assembler = VectorAssembler(
        inputCols=["Pclass", "SexIndex", "Age", "SibSp", "Parch", "Fare"],
        outputCol="features",
    )
    data = assembler.transform(data).select("Survived", "features")

    # Split dataset into training (80%) and testing (20%); the fixed seed
    # keeps the split repeatable for the same input.
    train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

    # Train logistic regression model
    lr = LogisticRegression(labelCol="Survived", featuresCol="features")
    model = lr.fit(train_data)

    # Make predictions on the held-out rows
    predictions = model.transform(test_data)

    # Evaluate model. The metric is named explicitly so the "AUC" label in
    # the output does not depend on the evaluator's default.
    evaluator = BinaryClassificationEvaluator(
        labelCol="Survived",
        metricName="areaUnderROC",
    )
    auc = evaluator.evaluate(predictions)
    print(f"AUC: {auc}")
finally:
    # Stop Spark session
    spark.stop()
