[Pyspark blog](https://bryancutler.github.io/)

[ML w/ Pyspark](https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa)

[GitHub event types](https://developer.github.com/v3/activity/events/types/)

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import pandas as pd

from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, Binarizer#, OneHotEncoderEstimator, StringIndexer
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import helper as h

In [39]:
churn_data = h.get_merged_data()
cols = churn_data.columns

churn_data.head()

Row(login='0loky0', followers_count=0, following_count=1, blog=0, company=0, created_at='2011-05-24 20:15:25+00:00', public_repos_count=7, public_gists_count=0, hireable=1, updated_at='2019-01-09 15:03:59+00:00', time_between_first_last_event='10 days 15:29:06.000000000', last_event='2016-04-15 10:14:03 UTC', first_event='2016-04-04 18:44:57 UTC', frequency=19, second_period_event_count=0, CommitCommentEvent_count=0, CreateEvent_count=5, DeleteEvent_count=0, ForkEvent_count=2, GollumEvent_count=0, IssueCommentEvent_count=0, IssuesEvent_count=0, MemberEvent_count=0, PublicEvent_count=0, PullRequestEvent_count=0, PullRequestReviewCommentEvent_count=0, PushEvent_count=12, ReleaseEvent_count=0, WatchEvent_count=0)

In [40]:
'''numeric_features = [t for t in churn_data if churn_data[t].dtype == np.dtype('float64') or 
                    churn_data[t].dtype == np.dtype('int32')]
print(numeric_features)'''

"numeric_features = [t for t in churn_data if churn_data[t].dtype == np.dtype('float64') or \n                    churn_data[t].dtype == np.dtype('int32')]\nprint(numeric_features)"

In [41]:
churn_data = churn_data.filter(churn_data.frequency < 200)

In [42]:
numeric_features = [t[0] for t in churn_data.dtypes if t[1] == 'int']
churn_data.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
followers_count,163457,4.488831925215806,41.3957025090698,0,6591
following_count,163457,4.321154799121482,79.95404469619254,0,19767
blog,163457,0.17065650293349321,0.37620968464066984,0,1
company,163457,0.1588674697321008,0.3655535724258656,0,1
public_repos_count,163372,11.300063658399235,29.03016791218304,0,2196
public_gists_count,163439,1.3336351788740752,8.56490255725305,0,766
hireable,163457,0.09987336118979304,0.2998319911240843,0,1
frequency,163457,21.26353719938577,32.21430591035925,2,199
second_period_event_count,163457,18.385238931339742,60.587401530838825,0,7752


In [43]:
#pd.plotting.boxplot

In [44]:
'''axs = pd.plotting.scatter_matrix(numeric_data[:200], figsize=(8, 8));
n = len(numeric_data.columns)
for i in range(n):
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    h = axs[n-1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())'''

"axs = pd.plotting.scatter_matrix(numeric_data[:200], figsize=(8, 8));\nn = len(numeric_data.columns)\nfor i in range(n):\n    v = axs[i, 0]\n    v.yaxis.label.set_rotation(0)\n    v.yaxis.label.set_ha('right')\n    v.set_yticks(())\n    h = axs[n-1, i]\n    h.xaxis.label.set_rotation(90)\n    h.set_xticks(())"

In [45]:
numeric_features.remove('second_period_event_count').remove('frequency')
print(numeric_features)

AttributeError: 'NoneType' object has no attribute 'remove'

In [None]:
stages = []
# binarizer needs double type or it throws an error.
churn_data = churn_data.withColumn("second_period_event_count", churn_data.second_period_event_count.cast(DoubleType()))
binarizer = Binarizer(threshold=0.5, inputCol="second_period_event_count", outputCol="label")

stages += [binarizer]
assembler = VectorAssembler(inputCols=numeric_features, outputCol="features").setHandleInvalid("skip")
stages += [assembler]

In [None]:
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(churn_data)
churn_data = pipelineModel.transform(churn_data)
selectedCols = ['label', 'features'] + cols
churn_data = churn_data.select(selectedCols)
churn_data.printSchema()

In [None]:
pd.DataFrame(churn_data.take(5), columns=churn_data.columns).transpose()

In [None]:
train, test = churn_data.randomSplit([0.7, 0.3], seed = 2018)
print("Training Dataset Count: {0}".format(train.count()))
print("Test Dataset Count: {0}".format(test.count()))

In [None]:
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10,
                       regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(train)

In [None]:
beta = np.sort(lrModel.coefficients)
plt.plot(beta)
plt.ylabel('Beta Coefficients')
plt.show()
lrModel.featuresCol

In [None]:
pd.DataFrame({'features': numeric_features, 
              'weights': lrModel.coefficients.values}
            ).sort_values(by='weights', ascending=False)


In [None]:
trainingSummary = lrModel.summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

In [None]:
pr = trainingSummary.pr.toPandas()
plt.plot(pr['recall'],pr['precision'])
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()

In [16]:
predictions = lrModel.transform(test)
show_cols = ['followers_count', 'company', 'blog', 
             'label', 'rawPrediction', 'prediction', 'probability']
predictions.select(show_cols).show(150)

+---------------+-------+----+-----+--------------------+----------+--------------------+
|followers_count|company|blog|label|       rawPrediction|prediction|         probability|
+---------------+-------+----+-----+--------------------+----------+--------------------+
|              5|      1|   1|  0.0|[-0.2259778441427...|       1.0|[0.44374472970067...|
|            323|      1|   1|  0.0|[-6.1948828136015...|       1.0|[0.00203568974155...|
|              2|      1|   1|  0.0|[-13.875498643090...|       1.0|[9.41774867715869...|
|              1|      0|   1|  0.0|[-0.8823418130133...|       1.0|[0.29269273286275...|
|              2|      0|   1|  0.0|[0.57450494018635...|       0.0|[0.63980201543003...|
|             10|      1|   0|  0.0|[-2.0375311206268...|       1.0|[0.11531836810832...|
|              2|      1|   0|  0.0|[0.45916413216954...|       0.0|[0.61281586633440...|
|              7|      0|   0|  0.0|[-0.7166576010659...|       1.0|[0.32812942614917...|
|         

## Random forest classification

In [17]:
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)

In [35]:
h.write_tree_to_file(rfModel.toDebugString, 'rf_trees')

In [18]:
predictions = rfModel.transform(test)
predictions.select(show_cols).show(150)

+---------------+-------+----+-----+--------------------+----------+--------------------+
|followers_count|company|blog|label|       rawPrediction|prediction|         probability|
+---------------+-------+----+-----+--------------------+----------+--------------------+
|              5|      1|   1|  0.0|[8.96526209783873...|       1.0|[0.44826310489193...|
|            323|      1|   1|  0.0|[5.03278001418684...|       1.0|[0.25163900070934...|
|              2|      1|   1|  0.0|[3.93451986449267...|       1.0|[0.19672599322463...|
|              1|      0|   1|  0.0|[4.78199056058835...|       1.0|[0.23909952802941...|
|              2|      0|   1|  0.0|[10.8046610716962...|       0.0|[0.54023305358481...|
|             10|      1|   0|  0.0|[3.72629687577363...|       1.0|[0.18631484378868...|
|              2|      1|   0|  0.0|[10.5056619154332...|       0.0|[0.52528309577166...|
|              7|      0|   0|  0.0|[4.84299878604326...|       1.0|[0.24214993930216...|
|         

In [19]:
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(
    predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.8526045460138268


In [None]:
# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(train, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")


# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

## Gradient Boost Tree Classifier

In [20]:
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(train)

In [None]:
h.write_tree_to_file(gbtModel.toDebugString, 'rf_trees')

In [21]:
predictions = gbtModel.transform(test)
predictions.select(show_cols).show(150)

+---------------+-------+----+-----+--------------------+----------+--------------------+
|followers_count|company|blog|label|       rawPrediction|prediction|         probability|
+---------------+-------+----+-----+--------------------+----------+--------------------+
|              5|      1|   1|  0.0|[-0.1918052014867...|       1.0|[0.40525640774073...|
|            323|      1|   1|  0.0|[-0.3078161870138...|       1.0|[0.35077545034572...|
|              2|      1|   1|  0.0|[-1.1867326528397...|       1.0|[0.08521860811378...|
|              1|      0|   1|  0.0|[-0.7011516408096...|       1.0|[0.19745086980872...|
|              2|      0|   1|  0.0|[0.23834207836890...|       0.0|[0.61696458156200...|
|             10|      1|   0|  0.0|[-1.1522087466428...|       1.0|[0.09075776634646...|
|              2|      1|   0|  0.0|[0.39277848513168...|       0.0|[0.68687653069428...|
|              7|      0|   0|  0.0|[-0.5508946786682...|       1.0|[0.24940477289468...|
|         

In [22]:
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(
    evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.860318519301536


## Grid search with cross validation

In [208]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4, 6])
             .addGrid(gbt.maxBins, [20, 60])
             .addGrid(gbt.maxIter, [10, 20])
             .build())
cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
# Run cross validations.  This can take about 6 minutes since it is training over 20 trees!
cvModel = cv.fit(train)
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)

KeyboardInterrupt: 