In [1]:
from pyspark.sql.functions import lit, col, udf, avg, max
from pyspark.sql.types import IntegerType, FloatType
from sklearn import svm
from sklearn.model_selection import train_test_split

In [2]:
inputDF = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load("datathon_tadata.csv")

In [3]:
# inputDF.createOrReplaceTempView("input_data")

In [4]:
inputDF.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- day: timestamp (nullable = true)
 |-- gender: integer (nullable = true)
 |-- p_sessionActivity: integer (nullable = true)
 |-- p_AddToCart: integer (nullable = true)
 |-- p_trafficChannel: string (nullable = true)
 |-- p_sessionDuration: integer (nullable = true)
 |-- p_pageViews: integer (nullable = true)
 |-- daysToCheckin: string (nullable = true)
 |-- osType: integer (nullable = true)
 |-- osTypeName: string (nullable = true)
 |-- daysFromPreviousVisit: integer (nullable = true)
 |-- p_TotalPrice: string (nullable = true)
 |-- isExclusiveMember: integer (nullable = true)
 |-- loggedIn: integer (nullable = true)
 |-- p_MapInteraction: integer (nullable = true)
 |-- BookingPurchase: integer (nullable = true)



inputDF.printSchema()

In [5]:
selectDF = inputDF.select("user_id", "gender", "day", "p_sessionActivity", "p_AddToCart", "p_trafficChannel", "p_sessionDuration", "p_pageViews", "daysToCheckin", "osType", "daysFromPreviousVisit", "p_TotalPrice", "isExclusiveMember", "loggedIn", "p_MapInteraction", "BookingPurchase").dropna()
selectDF.printSchema()
# inputDF.filter(inputDF["daysToCheckin"] != "NA").count()
# inputDF.select("user_id").distinct().count()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- day: timestamp (nullable = true)
 |-- p_sessionActivity: integer (nullable = true)
 |-- p_AddToCart: integer (nullable = true)
 |-- p_trafficChannel: string (nullable = true)
 |-- p_sessionDuration: integer (nullable = true)
 |-- p_pageViews: integer (nullable = true)
 |-- daysToCheckin: string (nullable = true)
 |-- osType: integer (nullable = true)
 |-- daysFromPreviousVisit: integer (nullable = true)
 |-- p_TotalPrice: string (nullable = true)
 |-- isExclusiveMember: integer (nullable = true)
 |-- loggedIn: integer (nullable = true)
 |-- p_MapInteraction: integer (nullable = true)
 |-- BookingPurchase: integer (nullable = true)



In [6]:
avg_price = float(selectDF.filter(col("p_TotalPrice") != "NA").select(max("p_TotalPrice")).take(1)[0][0])
avg_checkin_days = float(selectDF.filter(col("daysToCheckin") != "NA").select(max("daysToCheckin")).take(1)[0][0])

In [7]:
print(avg_price, avg_checkin_days)

# UDF to filter and replace value
def filterNA(cell_val, check_val, replace_val):
    print(cell_val)
    if (cell_val == check_val):
        return replace_val
    else:
        return float(cell_val)

filter_na_df = udf(filterNA, FloatType())
cleanedDF = selectDF \
.withColumn("cleaned_daysToCheckin", filter_na_df("daysToCheckin", lit("NA"), lit(avg_checkin_days))) \
.withColumn("cleaned_totalPrice", filter_na_df("p_TotalPrice", lit("NA"), lit(avg_price))) \
.drop("daysToCheckin", "p_TotalPrice")

9996.0 99.0


In [8]:
cleanedDF.count()

1000000

In [9]:
# inputDF.select(col("daysToCheckin")).distinct().show(inputDF.count())

In [10]:
from pyspark.ml.feature import StringIndexer

cleanedDF.select("p_trafficChannel").show()
indexer = StringIndexer(inputCol="p_trafficChannel", outputCol="trafficChannelIndex")
indexedDF = indexer.fit(cleanedDF).transform(cleanedDF).drop("p_trafficChannel")
# indexedDF.show()

+----------------+
|p_trafficChannel|
+----------------+
|               O|
|               O|
|               O|
|               O|
|               O|
|               A|
|               A|
|               O|
|               H|
|               O|
|               O|
|               O|
|               O|
|               O|
|               O|
|               H|
|               A|
|               O|
|               O|
|               A|
+----------------+
only showing top 20 rows



In [11]:
indexedDF.count()
indexedDF.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- day: timestamp (nullable = true)
 |-- p_sessionActivity: integer (nullable = true)
 |-- p_AddToCart: integer (nullable = true)
 |-- p_sessionDuration: integer (nullable = true)
 |-- p_pageViews: integer (nullable = true)
 |-- osType: integer (nullable = true)
 |-- daysFromPreviousVisit: integer (nullable = true)
 |-- isExclusiveMember: integer (nullable = true)
 |-- loggedIn: integer (nullable = true)
 |-- p_MapInteraction: integer (nullable = true)
 |-- BookingPurchase: integer (nullable = true)
 |-- cleaned_daysToCheckin: float (nullable = true)
 |-- cleaned_totalPrice: float (nullable = true)
 |-- trafficChannelIndex: double (nullable = true)



In [76]:
from datetime import datetime
def getMonth(date):
    return date.month

def checkWeekend(date):
    if date.weekday() in [5, 6]:
        return 1
    else:
        return 0

def combinedFeature(sess_act, days_previous):
    if (sess_act < 1200):
        return 1
    else:
        return 0

combinedFeatureUDF = udf(combinedFeature, IntegerType())

getMonthUDF = udf(getMonth, IntegerType())
checkWeekendUDF = udf(checkWeekend, IntegerType())

#create two new columns
indexedDF = indexedDF.withColumn("month", getMonthUDF("day")).withColumn("weekend", checkWeekendUDF("day")).withColumn("flag1", combinedFeatureUDF("p_sessionActivity", "daysFromPreviousVisit"))
# indexedDF.printSchema()

In [77]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

encoder1 = OneHotEncoder(inputCol="trafficChannelIndex", outputCol="trafficChannelVec")
encodedDF1 = encoder1.transform(indexedDF)
encoder2 = OneHotEncoder(inputCol="osType", outputCol="osTypeVec")
encodedDF2 = encoder2.transform(encodedDF1)
# encodedDF2.show()

In [82]:
from pyspark.ml.feature import VectorAssembler

#assembler with encoders
assembler = VectorAssembler(
    inputCols=["p_sessionActivity", "month", "weekend", "p_sessionDuration", "p_pageViews", "daysFromPreviousVisit", "p_MapInteraction", "cleaned_totalPrice", "cleaned_daysToCheckin", "trafficChannelVec", "osTypeVec"],
    outputCol="features")

#assembler for random forest
assemblerRF = VectorAssembler(
    inputCols=["p_sessionActivity", "month", "weekend", "p_sessionDuration", "p_pageViews", "osType", "daysFromPreviousVisit", "isExclusiveMember", "p_MapInteraction", "trafficChannelIndex", "cleaned_totalPrice", "cleaned_daysToCheckin", "flag1"],
    outputCol="features")

featureDF = assembler.transform(encodedDF2).select("features", col("BookingPurchase").alias("label"))
# featureDF = assemblerRF.transform(indexedDF).select("features", col("BookingPurchase").alias("label"))

# presentDF = featureDF.filter(col("label") == 1).sample(True, 0.3).repartition(3).coalesce(1)
# featureDF = featureDF.union(presentDF)
featureDF.show()
# presentDF

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(30,[0,1,3,4,5,7,...|    0|
|(30,[1,3,4,5,7,8,...|    0|
|(30,[0,1,3,4,7,8,...|    1|
|(30,[1,3,4,5,7,8,...|    0|
|(30,[1,2,3,4,5,7,...|    0|
|(30,[0,1,3,4,5,7,...|    1|
|(30,[0,1,3,4,7,8,...|    0|
|(30,[1,2,3,4,5,7,...|    1|
|(30,[1,3,4,5,7,8,...|    1|
|(30,[0,1,3,4,5,7,...|    0|
|(30,[1,3,4,5,7,8,...|    0|
|(30,[1,2,3,4,5,7,...|    0|
|(30,[0,1,3,4,5,7,...|    0|
|(30,[1,3,4,5,7,8,...|    0|
|(30,[1,3,4,5,7,8,...|    0|
|(30,[1,3,4,5,7,8,...|    1|
|(30,[0,1,2,3,4,5,...|    1|
|(30,[1,2,3,4,5,7,...|    1|
|(30,[0,1,3,4,5,7,...|    1|
|(30,[0,1,3,4,5,7,...|    0|
+--------------------+-----+
only showing top 20 rows



In [83]:
from pyspark.ml.feature import Normalizer, StandardScaler, MinMaxScaler
from pyspark.ml.feature import VectorIndexer

# Normalize each Vector using $L^1$ norm.
# normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
# normDF = normalizer.transform(featureDF)
# print("Normalized using L^1 norm")

# normDF = featureDF
# scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
#                         withStd=True, withMean=False)

normDF = featureDF
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")



# normDF = transformedDF
# scaler = StandardScaler(inputCol="transformedFeatures", outputCol="scaledFeatures",
#                         withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(normDF)

# Normalize each feature to have unit standard deviation.
scaledDF1 = scalerModel.transform(normDF)

scaledDF = scaledDF1.select(col("scaledFeatures").alias("features"), "label")
scaledDF.show(1, truncate=False)
# vindexer = VectorIndexer(inputCol="scaledFeatures", outputCol="indexedFeatures", maxCategories=10)
# indexerModel = vindexer.fit(scaledDF1)

# categoricalFeatures = indexerModel.categoryMaps
# print("Chose %d categorical features: %s" %
#       (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))

# # Create new column "indexed" with categorical values transformed to indices
# indexedData = indexerModel.transform(scaledDF1)
# scaledDF = indexedData
# scaledDF = normDF
# scaledDF.show()

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                                                                                |label|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(30,[0,1,3,4,5,7,8,9,22],[0.031806892473825295,0.18837996266348717,0.007632615014417102,0.0808888606902895,0.4732122249271911,1.028520425828437,2.8310177887019776,2.0297765712573645,2.45956916856625])|0    |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [47]:
from pyspark.ml.feature import ElementwiseProduct
from pyspark.mllib.linalg import Vectors
import numpy as np
# Create some vector data; also works for sparse vectors
import math

numFeatures = 31
arr = [1.0]*numFeatures
arr[0] *= math.pow(10.0, 7)
arr[1] *= math.pow(10.0, 5)
arr[2] *= math.pow(10.0, 5)
arr[3] *= math.pow(10.0, 3)
arr[4] *= math.pow(10.0, 7)
arr[5] *= math.pow(10.0, 6)
arr[6] *= math.pow(10.0, 4)
arr[8] *= math.pow(10.0, 2)
for i in range(9, 14):
    arr[i] *= 4.0
for i in range(14, len(arr)-1):
    arr[i] *= 2.0
arr[len(arr)-1] *= math.pow(10.0, 7)



transformer = ElementwiseProduct(scalingVec=arr,
                                 inputCol="scaledFeatures", outputCol="transformedFeatures")
# Batch transform the vectors to create new column:
transformedDF = transformer.transform(scaledDF1)

from pyspark.ml.feature import MinMaxScaler
mmscaler = MinMaxScaler(inputCol="transformedFeatures", outputCol="scaledFeatures2")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = mmscaler.fit(transformedDF)

# rescale each feature to range [min, max].
scaledDF = scalerModel.transform(transformedDF)
# print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledDF.select(col("scaledFeatures2").alias("features"), "label")

DataFrame[features: vector, label: int]

In [48]:
scaledDF.show(truncate=False)

+------------------------------------------------------------------------------+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                      |label|scaledFeatures                                                                                                                                     

In [None]:
#"p_sessionActivity 0", "month1", "2weekend", "3p_sessionDuration", "4p_pageViews", "5daysFromPreviousVisit", "6p_MapInteraction", "7cleaned_totalPrice", "8cleaned_daysToCheckin", "9trafficChannelVec", "14osTypeVec"

In [None]:
# from pyspark.ml.feature import PCA
# from pyspark.ml.linalg import Vectors

# data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
#         (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
#         (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
# df = spark.createDataFrame(data, ["features"])

# pca = PCA(k=5, inputCol="indexedFeatures", outputCol="pcaFeatures")
# model = pca.fit(scaledDF)

# pcaDF = model.transform(scaledDF).select(col("pcaFeatures").alias("features"), "label")
# pcaDF.show()

In [84]:
X = featureDF.select("features").rdd.map(lambda row: (row[0].toArray())).collect()
y = featureDF.select("label").rdd.map(lambda row: (row[0])).collect()

In [None]:
import numpy as np
input_X = np.array(X)
output_y = np.array(y)

In [None]:
featureDF.count()

1000000

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = svm.SVC()
clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
y_predict = clf.score(X_test, y_test)

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = svm.SVC(kernel="poly")
clf.fit(X_train, y_train)
y_predict = clf.score(X_test, y_test)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

train, test = scaledDF.randomSplit([0.95, 0.05], seed=1234)

lr = LogisticRegression(maxIter=50)


# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.01]) \
    .addGrid(lr.elasticNetParam, [0.01])\
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=BinaryClassificationEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.95)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train)

# Make predictions on test data. model is the model with combination of parameters
# that performed best.
tvresult = model.transform(test)
tvresult.select("features", "label", "prediction").show()

def mapRow(row):
    if (row[0] == int(row[1])): 
         return 1
    else:
         return 0
        
numCorrectPredictions = tvresult.select("label", "prediction").rdd.map(lambda row: mapRow(row)).reduce(lambda a, b: a+b)
accuracy = 1.0 * numCorrectPredictions / tvresult.count()
print("Test set accuracy: " + str(accuracy))

In [52]:
from pyspark.ml.classification import LogisticRegression

train, test = scaledDF.randomSplit([0.95, 0.05], seed=1234)

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(train)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

Coefficients: (31,[],[])
Intercept: -1.2935078396500554


In [53]:
# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)



objectiveHistory:
0.5208424620326574
+---+---+
|FPR|TPR|
+---+---+
|0.0|0.0|
|1.0|1.0|
|1.0|1.0|
+---+---+

areaUnderROC: 0.5


LogisticRegression_419d8dd08d28d66ea194

In [80]:
#Multi Layer perceptron
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import random

# Split the data into train and test
splits = scaledDF.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [31, 25, 20, 10, 5, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234, stepSize=0.03, solver="l-bfgs", initialWeights=[random.random()]*1597)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
percResult = model.transform(test)
predictionAndLabels = percResult.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Py4JJavaError: An error occurred while calling o1968.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 1130.0 failed 1 times, most recent failure: Lost task 5.0 in stage 1130.0 (TID 4425, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException
	at java.lang.System.arraycopy(Native Method)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:628)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:627)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:627)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:623)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:957)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:948)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:888)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:948)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:694)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1157)
	at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:195)
	at org.apache.spark.mllib.optimization.LBFGS.optimize(LBFGS.scala:142)
	at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:817)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:260)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:145)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:96)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:72)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ArrayIndexOutOfBoundsException
	at java.lang.System.arraycopy(Native Method)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:628)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:627)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:627)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:623)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:957)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:948)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:888)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:948)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:694)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [81]:
# Random Forest
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = scaledDF.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=11, maxDepth=10, subsamplingRate=0.8)

# Train model.  This also runs the indexers.
rfmodel = rf.fit(trainingData)

# Make predictions.
rfpredictions = rfmodel.transform(testData)

# Select example rows to display.
rfpredictions.select("prediction", "label", "features").show(25)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(rfpredictions)
print("Test Accuracy = %g" % accuracy)
percResult.filter(col("prediction") == 1.0).count()

# print(metrics.precision())
# print(metrics.confusionMatrix());
# rfModel = rfmodel.stages[2]
# print(rfModel)  # summary only

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    1|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    1|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|


0

In [None]:
rfpredictions.filter(col("prediction") == 1.0).count()

In [72]:
from pyspark.mllib.regression import LabeledPoint

In [71]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics

data = scaledDF.rdd.map(lambda row: LabeledPoint(row[1], row[0].toArray()))


# Split data into training (60%) and test (40%)
training, test = data.randomSplit([0.8, 0.2], seed=11)

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training)

# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

Area under PR = 0.3112614097659006
Area under ROC = 0.5003751671475785


In [73]:
from pyspark.mllib.evaluation import MulticlassMetrics
met = MulticlassMetrics(predictionAndLabels)
print(met.precision())
print(met.recall())

Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/2.1.0/libexec/python/pyspark/ml/wrapper.py", line 76, in __del__
    SparkContext._active_spark_context._gateway.detach(self._java_obj)
AttributeError: 'MinMaxScaler' object has no attribute '_java_obj'
Exception ignored in: <bound method JavaModelWrapper.__del__ of <pyspark.mllib.evaluation.MulticlassMetrics object at 0x111d530b8>>
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/2.1.0/libexec/python/pyspark/mllib/common.py", line 142, in __del__
    self._sc._gateway.detach(self._java_model)
AttributeError: 'MulticlassMetrics' object has no attribute '_sc'


0.7843512310411649
0.7843512310411649




In [75]:
print(met.accuracy)

0.7843512310411649


In [161]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = pcaDF.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTClassifier(featuresCol="features", maxIter=50, maxDepth=3, labelCol="label", seed=42)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[gbt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(20)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = %g" % accuracy)

# gbtModel = model.stages[1]
# print(gbtModel)  # summary only

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|    1|[-39.525479947146...|
|       0.0|    0|[-20.118808547138...|
|       0.0|    0|[-19.711627161750...|
|       0.0|    0|[-16.321780374979...|
|       0.0|    1|[-15.695436420146...|
|       0.0|    0|[-15.134918020774...|
|       0.0|    1|[-15.039741262736...|
|       0.0|    1|[-14.883204768102...|
|       0.0|    1|[-14.294196881815...|
|       0.0|    0|[-14.136479189058...|
|       0.0|    0|[-14.052927001294...|
|       0.0|    0|[-13.697260991265...|
|       0.0|    1|[-13.398598092778...|
|       0.0|    1|[-13.148384474778...|
|       0.0|    1|[-12.530963640443...|
|       0.0|    0|[-12.385137760786...|
|       0.0|    0|[-12.287165404486...|
|       0.0|    0|[-11.988140032253...|
|       0.0|    1|[-11.893476378028...|
|       0.0|    0|[-11.849631540517...|
+----------+-----+--------------------+
only showing top 20 rows

Test Accuracy 

In [162]:
def mapRow(row):
    if (row[0] == int(row[1])): 
         return 1
    else:
         return 0
        
numCorrectPredictions = predictions.select("label", "prediction").rdd.map(lambda row: mapRow(row)).reduce(lambda a, b: a+b)
accuracy = 1.0 * numCorrectPredictions / predictions.count()
print("Test set accuracy: " + str(accuracy))

Test set accuracy: 0.7859978815958645


In [65]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = scaledDF.randomSplit([0.8, 0.2])

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(20)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = %g " % (accuracy))

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
+----------+-----+--------------------+
only showing top 20 rows

Test Accuracy 

IllegalArgumentException: 'MulticlassClassificationEvaluator_490b986d0b596bc4df17 parameter metricName given invalid value precision.'

Test Accuracy = 0.785998 
