In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session. It must be bound to the name `spark`,
# because every later cell reads data through `spark.read`.
spark = SparkSession.builder.appName('mylogreg').getOrCreate()

In [3]:
from pyspark.ml.classification import LogisticRegression

In [4]:
# Load the sample dataset, which is stored in LIBSVM format.
my_data = (
    spark.read
    .format('libsvm')
    .load('FileStore/tables/sample_libsvm_data.txt')
)

In [5]:
# Preview the first rows of the loaded DataFrame.
my_data.show()

In [6]:
# Logistic-regression estimator; the column names are the library defaults,
# written out so the expected input schema is visible.
my_logit = LogisticRegression(featuresCol='features', labelCol='label')

In [7]:
# Fit the estimator on the full dataset; fit() returns the trained model.
fitted_logit_model = my_logit.fit(my_data)

In [8]:
# Keep a handle on the summary object attached to the fitted model.
log_summary = fitted_logit_model.summary

In [9]:
# Print the column names and types of the summary's predictions DataFrame.
log_summary.predictions.printSchema()

In [10]:
# Preview the first rows of the summary's predictions DataFrame.
log_summary.predictions.show()

In [11]:
# Hold-out evaluation: split the data into train and test sets first.
# Roughly 70 % of the rows go to training and 30 % to testing. The seed is
# fixed so the split, and every metric derived from it, is reproducible
# across kernel restarts.
lr_train, lr_test = my_data.randomSplit([0.7, 0.3], seed=42)

In [12]:
# Train a fresh model on the training split only, then evaluate it on the
# held-out test split. The column names are the estimator's defaults, spelled out.
final_model = LogisticRegression(featuresCol='features', labelCol='label')
fit_final = final_model.fit(lr_train)
prediction_and_labels = fit_final.evaluate(lr_test)

In [13]:
# Preview the predictions produced for the test split.
prediction_and_labels.predictions.show()

In [14]:
# evaluators
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [15]:
# Binary-classification evaluator with its default settings written out
# explicitly: it scores the 'rawPrediction' column against 'label'.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [16]:
# Score the test-set predictions. With no metricName override, the evaluator
# reports the area under the ROC curve.
my_final_roc = my_eval.evaluate(
    prediction_and_labels.predictions
)

In [17]:
# Display the metric value computed in the previous cell.
my_final_roc

In [18]:
# Classic example: Titanic survival.
# header=True takes the column names from the first row of the file;
# inferSchema=True asks Spark to infer each column's type from the data.
df = spark.read.csv(
    'FileStore/tables/titanic.csv',
    header=True,
    inferSchema=True,
)


In [19]:
# Print the column names and inferred types of the Titanic DataFrame.
df.printSchema()

In [20]:
# List the available column names before choosing which ones to keep.
df.columns

In [21]:
# Keep only the label ('Survived') and the columns that are later used as
# model inputs.
my_cols = df.select([
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
])

In [22]:
# Handle missing data by dropping every row that contains a null in any of
# the selected columns (how='any' is the default, written out for clarity).
my_final_data = my_cols.na.drop(how='any')

In [23]:
# deal with categorical variables
from pyspark.ml.feature import(VectorAssembler, VectorIndexer,
                              OneHotEncoder, StringIndexer)

In [24]:
# one hot encoding
gender_indexer = StringIndexer(inputCol = 'Sex', outputCol = 'SexIndex')   # generate unique numeric index for each categorical val
gender_encoder = OneHotEncoder(inputCol = 'SexIndex', outputCol = 'SexVec')   # convert the categorical val to a vector

In [25]:
# Same two-stage encoding for 'Embarked': string -> index -> vector.
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

In [26]:
# Combine the numeric columns and the encoded vectors into the single
# 'features' vector column that the classifier reads.
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'SexVec',
        'EmbarkVec',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
    ],
    outputCol='features',
)

In [27]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [28]:
# Classifier for the Titanic data: reads the assembled 'features' vector and
# learns to predict the 'Survived' column.
log_reg_titanic = LogisticRegression(
    featuresCol='features',
    labelCol='Survived',
)

In [29]:
# Chain all stages into one pipeline. Order matters: each indexer must come
# before its encoder, the encoders before the assembler, and the assembler
# before the classifier.
pipeline = Pipeline(
    stages=[
        gender_indexer,
        embark_indexer,
        gender_encoder,
        embark_encoder,
        assembler,
        log_reg_titanic,
    ]
)

In [30]:
# Roughly 70 % / 30 % train / test split of the cleaned Titanic data. The
# seed is fixed so the split and the resulting metric are reproducible.
train_data, test_data = my_final_data.randomSplit([0.7, 0.3], seed=42)

In [31]:
# Fit every pipeline stage (indexers, encoders, assembler, classifier) on the
# training split only.
fit_model = pipeline.fit(train_data)

In [32]:
# Run the held-out test split through the fitted pipeline to get predictions.
results = fit_model.transform(test_data)

In [33]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [34]:
# Area under ROC has to be computed from the model's continuous scores, so
# the evaluator reads the 'rawPrediction' column. Pointing it at the hard 0/1
# 'prediction' column would collapse the ROC curve to a single operating
# point and misstate the AUC.
my_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Survived')

In [35]:
# Compare the true label with the predicted class for the first test rows.
results.select('Survived', 'prediction').show()

In [36]:
# Evaluate the test-set predictions with the evaluator configured above.
my_eval.evaluate(results) # returns AUC (area under the ROC curve), the evaluator's default metric