In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initialize Spark Session
spark = SparkSession.builder.appName("Titanic_MLlib").getOrCreate()
# Load the dataset
data = spark.read.csv("titanic.csv", header=True, inferSchema=True)

In [3]:
# Data preprocessing
data = data.select("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare")
data = data.dropna()

In [4]:
# Convert categorical column "Sex" to numeric
indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
data = indexer.fit(data).transform(data)
data = data.drop("Sex")

In [5]:
# Assemble features
assembler = VectorAssembler(inputCols=["Pclass", "SexIndex", "Age", "SibSp", "Parch", "Fare"], 
                            outputCol="features")
data = assembler.transform(data).select("Survived", "features")

In [6]:
# Split dataset into training and testing
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

In [7]:
# Train logistic regression model
lr = LogisticRegression(labelCol="Survived", featuresCol="features")
model = lr.fit(train_data)

In [8]:
# Make predictions
predictions = model.transform(test_data)

In [9]:
# Evaluate model
evaluator = BinaryClassificationEvaluator(labelCol="Survived")


In [10]:
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc}")



AUC: 0.8754629629629628


In [None]:
# Stop Spark session
spark.stop()

In [12]:
def addNumbers(a,b):
    sum = a+b
    return sum

num1 = int(input())
num2 = int(input())

print(f'The sum of the two numbers is: {addNumbers(num1,num2)}')    

ValueError: invalid literal for int() with base 10: ''

In [13]:
n = range(0,3)

In [14]:
print(n)

range(0, 3)


In [15]:
range(3)

range(0, 3)

In [16]:
list(range(3))

[0, 1, 2]

In [17]:
spark.stop()