In [1]:
# Load the Titanic train/test CSV files; the first line of each file is used as the header.
# NOTE(review): no schema is supplied or inferred here, so the columns are presumably
# all read as strings — confirm that is what the downstream training step expects.
df_train = spark.read.csv('/titanic/train.csv', header=True)
df_test = spark.read.csv('/titanic/test.csv', header=True)

In [2]:
from mmlspark.train import TrainClassifier, ComputeModelStatistics
from mmlspark.automl import FindBestModel
from pyspark.ml.classification import LogisticRegression, LinearSVC, NaiveBayes, DecisionTreeClassifier, RandomForestClassifier
from mmlspark.lightgbm import LightGBMClassifier
from mmlspark.vw import VowpalWabbitClassifier

# Prepare data for learning.
# A seeded 60/20/20 split of the labelled data:
#   train      -> fits every candidate model
#   test       -> used by FindBestModel to pick the winner
#   validation -> held out to report the winner's accuracy
train, test, validation = df_train.randomSplit([0.60, 0.20, 0.20], seed=123)

# Column of df_train that every candidate is trained to predict.
LABEL_COL = "Survived"

# Candidate estimators, listed in the order they are trained.
# Logistic regression is swept over several regularisation strengths.
lrHyperParams = [0.05, 0.1, 0.2, 0.4]
logisticRegressions = [LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams]

candidateEstimators = (
    logisticRegressions
    # Linear SVC over a range of regularisation strengths
    + [LinearSVC(regParam=regParam) for regParam in (0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0)]
    # Naive Bayes over a range of smoothing values
    + [NaiveBayes(smoothing=smoothing) for smoothing in (0.01, 0.1, 1.0, 2.0)]
    # Gradient boosting, tree-based models and Vowpal Wabbit
    + [
        LightGBMClassifier(objective='binary'),
        DecisionTreeClassifier(),
        RandomForestClassifier(numTrees=5),
        RandomForestClassifier(numTrees=10),
        VowpalWabbitClassifier(numPasses=10),
    ]
)

# Train the models on the 'train' data.
# NOTE: despite its name, `lrmodels` holds every fitted candidate, not only the
# logistic regressions; the name is kept so existing references keep working.
lrmodels = [
    TrainClassifier(model=estimator, labelCol=LABEL_COL).fit(train)
    for estimator in candidateEstimators
]

# Pick the candidate with the best accuracy on the 'test' split.
bestModel = FindBestModel(evaluationMetric="accuracy", models=lrmodels).fit(test)

# Get accuracy on the validation dataset
predictions = bestModel.transform(validation)
metrics = ComputeModelStatistics().transform(predictions)
metrics.createOrReplaceTempView("classMetrics")  # exposes the metrics under the view name "classMetrics"
metrics.show()

print("Best model's accuracy on validation set = " + "{0:.2f}%".format(metrics.first()["accuracy"] * 100))

In [3]:
import pandas as pd

# Get predictions on the test dataset
predictions = bestModel.transform(df_test)

# Pull the passenger id together with its predicted label so each id/prediction
# pair comes from the same row. This avoids fabricating ids from a hard-coded
# range, which only works if the file has exactly 418 rows in id order.
# NOTE(review): assumes the scored frame still carries the input 'PassengerId'
# column alongside 'scored_labels' — confirm against the MMLSpark version in use.
scored = predictions.select('PassengerId', 'scored_labels').toPandas()

# Both columns are cast to int so the CSV contains plain integers.
preds = pd.DataFrame({
    'PassengerId': scored['PassengerId'].astype(int),
    'Survived': scored['scored_labels'].astype(int),
})
preds.to_csv('/dbfs/titanic/results-mmlspark.csv', index=False)