In [1]:
# Load the prepared training and test sets.
# inferSchema=True asks Spark to derive column types from the data; with only
# header=True the CSV reader returns every column as a string, so the features
# and the label would reach the trainer as text rather than numbers.
# NOTE(review): assumes the prepared files contain numeric columns — confirm.
df_train = spark.read.format('csv').options(header=True, inferSchema=True).load('/ps/train_prep.csv')
df_test = spark.read.format('csv').options(header=True, inferSchema=True).load('/ps/test_prep.csv')

In [2]:
from mmlspark.train import TrainClassifier, ComputeModelStatistics
from mmlspark.automl import FindBestModel
from pyspark.ml.classification import LogisticRegression, LinearSVC, NaiveBayes, DecisionTreeClassifier, RandomForestClassifier
from mmlspark.lightgbm import LightGBMClassifier
from mmlspark.vw import VowpalWabbitClassifier

In [3]:
# Split the labelled data 60/20/20. `train` fits the candidates, `test` is used
# to choose between them, and `validation` is only used for the final report.
train, test, validation = df_train.randomSplit([0.60, 0.20, 0.20], seed=123)

# Candidate estimators: one logistic regression per regularisation strength,
# followed by a binary-objective LightGBM classifier.
lrHyperParams = [0.05, 0.1, 0.2, 0.4]
logisticRegressions = [LogisticRegression(regParam=reg) for reg in lrHyperParams]
lgbmEstimator = LightGBMClassifier(objective='binary')

# Fit every candidate on `train`, predicting the "target" column.
# Despite its name, `lrmodels` also holds the LightGBM model (last element).
lrmodels = [
    TrainClassifier(model=estimator, labelCol="target").fit(train)
    for estimator in logisticRegressions + [lgbmEstimator]
]

# Select the candidate with the best AUC on the `test` split.
bestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test)

# Report the selected model's statistics on the untouched `validation` split
# and expose them to SQL as the "classMetrics" temp view.
predictions = bestModel.transform(validation)
metrics = ComputeModelStatistics().transform(predictions)
metrics.createOrReplaceTempView("classMetrics")
metrics.show()

In [4]:
import pandas as pd

# Score the unlabelled test set with the selected model.
predictions = bestModel.transform(df_test)
scored = predictions.select('scored_probabilities').toPandas()

# `scored_probabilities` carries one probability vector per row (one entry per
# class). Writing it as-is would put whole vectors in the submission, so keep
# only the entry for class index 1.
# NOTE(review): assumes index 1 corresponds to target == 1 — confirm against
# the label ordering used by the trained model.
positive_proba = scored['scored_probabilities'].apply(lambda vec: float(vec[1]))

# Start from the sample submission and replace its placeholder target column.
df = pd.read_csv('/dbfs/ps/sample_submission.csv')
df = df.drop(['target'], axis=1)

# Scores are attached by position, so the row counts must agree.
# NOTE(review): this also relies on the scored rows coming back in the same
# order as the sample submission; joining on an id column would be safer.
if len(df) != len(positive_proba):
    raise ValueError(
        'Row count mismatch: sample submission has %d rows, scored output has %d'
        % (len(df), len(positive_proba))
    )
df['target'] = positive_proba.values

df.to_csv('/dbfs/ps/results-mmlspark.csv', index=False)