In [1]:
from pyspark.sql.functions import avg, col, lit, udf, when
from pyspark.sql.types import FloatType, IntegerType

In [2]:
# Load the raw dataset with Spark's built-in CSV reader (the external
# "com.databricks.spark.csv" source name is only a legacy alias for it).
# The first line is used as the header and column types are inferred from the data.
inputDF = sqlContext.read.csv("datathon_tadata.csv", header=True, inferSchema=True)

In [3]:
# Register the raw DataFrame under the temporary view name "input_data",
# replacing any view that already has that name.
inputDF.createOrReplaceTempView("input_data")

In [4]:
# Print the column names and inferred types of the raw data.
inputDF.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- day: timestamp (nullable = true)
 |-- gender: integer (nullable = true)
 |-- p_sessionActivity: integer (nullable = true)
 |-- p_AddToCart: integer (nullable = true)
 |-- p_trafficChannel: string (nullable = true)
 |-- p_sessionDuration: integer (nullable = true)
 |-- p_pageViews: integer (nullable = true)
 |-- daysToCheckin: string (nullable = true)
 |-- osType: integer (nullable = true)
 |-- osTypeName: string (nullable = true)
 |-- daysFromPreviousVisit: integer (nullable = true)
 |-- p_TotalPrice: string (nullable = true)
 |-- isExclusiveMember: integer (nullable = true)
 |-- loggedIn: integer (nullable = true)
 |-- p_MapInteraction: integer (nullable = true)
 |-- BookingPurchase: integer (nullable = true)



inputDF.printSchema()

In [5]:
# Keep the candidate model inputs plus the target column, then discard every
# row that contains a null in any of them.
selectDF = inputDF.select(
    "user_id",
    "gender",
    "p_sessionActivity",
    "p_AddToCart",
    "p_trafficChannel",
    "p_sessionDuration",
    "p_pageViews",
    "daysToCheckin",
    "osType",
    "daysFromPreviousVisit",
    "p_TotalPrice",
    "isExclusiveMember",
    "loggedIn",
    "p_MapInteraction",
    "BookingPurchase",
).dropna()
selectDF.printSchema()
# Exploration snippets kept for reference:
# inputDF.filter(inputDF["daysToCheckin"] != "NA").count()
# inputDF.select("user_id").distinct().count()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- p_sessionActivity: integer (nullable = true)
 |-- p_AddToCart: integer (nullable = true)
 |-- p_trafficChannel: string (nullable = true)
 |-- p_sessionDuration: integer (nullable = true)
 |-- p_pageViews: integer (nullable = true)
 |-- daysToCheckin: string (nullable = true)
 |-- osType: integer (nullable = true)
 |-- daysFromPreviousVisit: integer (nullable = true)
 |-- p_TotalPrice: string (nullable = true)
 |-- isExclusiveMember: integer (nullable = true)
 |-- loggedIn: integer (nullable = true)
 |-- p_MapInteraction: integer (nullable = true)
 |-- BookingPurchase: integer (nullable = true)



In [6]:
# Mean total price and mean days-to-check-in; they are used further down as the
# replacement values for "NA" cells.
# Both means come from a single aggregation so the data is scanned once instead
# of once per column.
# NOTE(review): both columns are typed as strings; avg() presumably coerces them
# to numbers and skips cells that do not parse (such as "NA") -- confirm on the
# Spark version in use.
avg_price, avg_checkin_days = (
    float(mean_value)
    for mean_value in selectDF.agg(avg("p_TotalPrice"), avg("daysToCheckin")).first()
)

In [7]:
# Show the two averages (price first, then days to check-in).
print(avg_price, avg_checkin_days)

# UDF helper: replace a sentinel value and convert everything else to float.
def filterNA(cell_val, check_val, replace_val):
    """Return ``replace_val`` for a missing cell, otherwise ``float(cell_val)``.

    Args:
        cell_val: The raw cell content (typically a string such as "12" or "NA").
        check_val: Sentinel that marks a missing value.
        replace_val: Value returned in place of a missing cell.

    Returns:
        ``replace_val`` when ``cell_val`` is ``None`` or equals ``check_val``;
        otherwise ``cell_val`` converted with ``float()``.

    Raises:
        ValueError: If a non-missing ``cell_val`` cannot be parsed by ``float()``.
    """
    # Deliberately no print() here: the function is invoked once per row, so a
    # debug print would flood the worker logs and slow the job down.
    if cell_val is None or cell_val == check_val:
        return replace_val
    return float(cell_val)

# Python UDF wrapper around filterNA. It is no longer used below, but the name is
# kept defined so that any other cell referring to it keeps working.
filter_na_df = udf(filterNA, FloatType())

# Replace "NA" with the column average and convert everything else to float.
# This is expressed with native column functions rather than the Python UDF, so
# the rows never have to be shipped through a Python worker.
# Note: a value that is neither "NA" nor numeric becomes null here, whereas the
# UDF would have failed the job on it.
cleanedDF = (
    selectDF
    .withColumn(
        "cleaned_daysToCheckin",
        when(col("daysToCheckin") == "NA", lit(avg_checkin_days).cast("float"))
        .otherwise(col("daysToCheckin").cast("float")),
    )
    .withColumn(
        "cleaned_totalPrice",
        when(col("p_TotalPrice") == "NA", lit(avg_price).cast("float"))
        .otherwise(col("p_TotalPrice").cast("float")),
    )
    .drop("daysToCheckin", "p_TotalPrice")
)

1323.2267780242526 68.29791432633138


In [8]:
# Number of rows in the cleaned DataFrame.
cleanedDF.count()

1000000

In [9]:
# List the distinct daysToCheckin values of the raw data. The total row count is
# passed as the display limit so that the listing is never cut short.
inputDF.select("daysToCheckin").distinct().show(n=inputDF.count())

+-------------+
|daysToCheckin|
+-------------+
|          296|
|          125|
|            7|
|           51|
|          124|
|          447|
|          307|
|          169|
|          205|
|          544|
|          272|
|           15|
|           54|
|          232|
|          234|
|          282|
|          383|
|          155|
|          154|
|          132|
|          317|
|          200|
|          388|
|          495|
|           11|
|          101|
|          279|
|          415|
|          433|
|          138|
|          323|
|          351|
|          361|
|          387|
|           29|
|           69|
|          309|
|           42|
|          112|
|           73|
|           87|
|          468|
|           64|
|          308|
|          348|
|          356|
|            3|
|           30|
|          113|
|          432|
|           34|
|          133|
|          287|
|          365|
|          389|
|          162|
|           59|
|          139|
|          146|
|       

In [10]:
from pyspark.ml.feature import StringIndexer

# Preview the raw traffic-channel codes.
cleanedDF.select("p_trafficChannel").show()

# Fit a string indexer on the channel codes, add the numeric index column and
# drop the original string column.
indexer = StringIndexer(
    inputCol="p_trafficChannel",
    outputCol="trafficChannelIndex",
)
indexedDF = (
    indexer.fit(cleanedDF)
    .transform(cleanedDF)
    .drop("p_trafficChannel")
)
indexedDF.show()

+----------------+
|p_trafficChannel|
+----------------+
|               O|
|               O|
|               O|
|               O|
|               O|
|               A|
|               A|
|               O|
|               H|
|               O|
|               O|
|               O|
|               O|
|               O|
|               O|
|               H|
|               A|
|               O|
|               O|
|               A|
+----------------+
only showing top 20 rows

+-------+------+-----------------+-----------+-----------------+-----------+------+---------------------+-----------------+--------+----------------+---------------+---------------------+------------------+-------------------+
|user_id|gender|p_sessionActivity|p_AddToCart|p_sessionDuration|p_pageViews|osType|daysFromPreviousVisit|isExclusiveMember|loggedIn|p_MapInteraction|BookingPurchase|cleaned_daysToCheckin|cleaned_totalPrice|trafficChannelIndex|
+-------+------+-----------------+-----------+-----------------+

In [11]:
# Print the schema first and leave the row count as the final expression of the
# cell. A bare expression is only displayed when it is the last statement, so
# with count() placed first its result was computed and then thrown away.
indexedDF.printSchema()
indexedDF.count()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- p_sessionActivity: integer (nullable = true)
 |-- p_AddToCart: integer (nullable = true)
 |-- p_sessionDuration: integer (nullable = true)
 |-- p_pageViews: integer (nullable = true)
 |-- osType: integer (nullable = true)
 |-- daysFromPreviousVisit: integer (nullable = true)
 |-- isExclusiveMember: integer (nullable = true)
 |-- loggedIn: integer (nullable = true)
 |-- p_MapInteraction: integer (nullable = true)
 |-- BookingPurchase: integer (nullable = true)
 |-- cleaned_daysToCheckin: float (nullable = true)
 |-- cleaned_totalPrice: float (nullable = true)
 |-- trafficChannelIndex: double (nullable = true)



In [12]:
# Convert the indexed Spark DataFrame into a pandas DataFrame.
# NOTE(review): pandaData is not referenced by any other cell visible here, and
# toPandas() brings every row back to the driver -- confirm it is still needed.
pandaData = indexedDF.toPandas()

In [13]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# One-hot encode the traffic-channel index into the vector column "trafficChannelVec".
# NOTE(review): transform() is called directly on the encoder without a fit()
# step; presumably this targets a Spark release in which OneHotEncoder is a plain
# Transformer -- confirm before upgrading Spark.
encoder1 = OneHotEncoder(inputCol="trafficChannelIndex", outputCol="trafficChannelVec")
encodedDF1 = encoder1.transform(indexedDF)
# Encode the integer osType column the same way, on top of the previous result.
encoder2 = OneHotEncoder(inputCol="osType", outputCol="osTypeVec")
encodedDF2 = encoder2.transform(encodedDF1)
# encodedDF2.show()

In [14]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric columns and the two one-hot vectors into a single "features"
# vector; the list order determines the position of each input in that vector.
assembler = VectorAssembler(
    inputCols=[
        "gender",
        "p_sessionActivity",
        "p_AddToCart",
        "p_sessionDuration",
        "p_pageViews",
        "osTypeVec",
        "daysFromPreviousVisit",
        "isExclusiveMember",
        "loggedIn",
        "p_MapInteraction",
        "trafficChannelVec",
        "cleaned_totalPrice",
        "cleaned_daysToCheckin",
    ],
    outputCol="features",
)

# Keep only the assembled vector and the target, the latter renamed to "label".
featureDF = (
    assembler.transform(encodedDF2)
    .select("features", col("BookingPurchase").alias("label"))
)

In [23]:
from pyspark.ml.feature import Normalizer, StandardScaler

# Normalize each Vector using the $L^2$ norm (that is what p=2.0 selects).
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)
normDF = normalizer.transform(featureDF)
# The message now matches the norm that is actually applied.
print("Normalized using L^2 norm")

scaler = StandardScaler(inputCol="normFeatures", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(normDF)

# Normalize each feature to have unit standard deviation.
scaledDF = scalerModel.transform(normDF).select(col("scaledFeatures").alias("features"), "label")
# show() only prints and returns None, so nothing may be chained onto it; the
# former trailing .map(...) call raised AttributeError after the table was printed.
scaledDF.show(truncate=False)

Normalized using L^1 norm
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                                                                                                                                        |label|
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(32,[0,1,3,4,13,21,25,30,31],[1.7061155628897875,0.07621458492484058,0.139341767443334,0.19342423500843678,2.5034385300779474,0.41820058509836877,2.1795779516180187,2.89593571263

In [16]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Hold out 20% of the rows as the final test set; the seed keeps the split repeatable.
train, test = scaledDF.randomSplit([0.8, 0.2], seed=12345)

lr = LogisticRegression(maxIter=10)


# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.3, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 0.8, 1.0])\
    .build()

# In this case the estimator is simply the logistic regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# The seed fixes the internal train/validation split, so the selected
# hyper-parameters are the same on every run.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=BinaryClassificationEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8,
                           seed=12345)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train)

In [17]:
# Score the held-out rows. `model` is the fitted TrainValidationSplit model, i.e.
# the parameter combination that performed best during the search.
result = model.transform(test)
result.select(["features", "label", "prediction"]).show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(32,[0,1,2,3,4,6,...|    1|       0.0|
|(32,[0,1,2,3,4,6,...|    0|       0.0|
|(32,[0,1,2,3,4,6,...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    1|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    1|       0.0|
|(32,[0,1,2,3,4,12...|    1|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    1|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    1|       0.0|
|(32,[0,1,2,3,4,12...|    1|       0.0|
|(32,[0,1,2,3,4,12...|    0|       0.0|
|(32,[0,1,2,3,4,12...|    1|       0.0|
+--------------------+-----+----------+
only showing top 20 rows



In [18]:
def mapRow(row):
    """Score one (label, prediction) pair: 1 when they agree, otherwise 0.

    The second element is converted with int() before it is compared with the
    first element.
    """
    label, prediction = row[0], row[1]
    return 1 if label == int(prediction) else 0

In [19]:
# Count the test rows whose predicted class equals the label. The comparison is
# done as a DataFrame filter, so the rows are not shipped through a Python lambda,
# and an empty result gives 0 instead of the error reduce() raises on an empty RDD.
numCorrectPredictions = result.filter(col("label") == col("prediction")).count()

In [20]:
# Fraction of test rows that were predicted correctly (1.0 * forces float division).
accuracy = 1.0 * numCorrectPredictions / result.count()
# Left as the final expression so the notebook actually displays the raw count;
# as the first statement of the cell it was evaluated and silently discarded.
numCorrectPredictions

In [21]:
# Report the accuracy computed in the previous cell.
print("Test set accuracy: " + str(accuracy))

Test set accuracy: 0.7847319393915164


In [24]:
#Multi Layer perceptron
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 70/30 train/test split with a fixed seed.
splits = scaledDF.randomSplit([0.7, 0.3], 1234)
train, test = splits

# Network topology: an input layer of 32 units, hidden layers of 20, 10, 7 and 4
# units, and an output layer of 2 units (one per class).
layers = [32, 20, 10, 7, 4, 2]

# Configure the classifier.
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)

# Fit on the training split.
model = trainer.fit(train)

# Score the test split and report its accuracy.
percResult = model.transform(test)
predictionAndLabels = percResult.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.7842442228817234


In [None]:
# Random Forest
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split the data into training and test sets (30% held out for testing).
# The split is seeded, like the other splits in this notebook, so that the
# reported accuracy can be reproduced.
(trainingData, testData) = scaledDF.randomSplit([0.7, 0.3], seed=1234)

# Configure a random forest of 25 trees.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=25)

# Train the model directly on the already assembled and scaled feature vectors.
rfmodel = rf.fit(trainingData)

# Make predictions.
rfpredictions = rfmodel.transform(testData)

# Select example rows to display.
rfpredictions.select("prediction", "label", "features").show(5)

# Compare the predictions with the true labels and report the test accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(rfpredictions)
print("Test Accuracy = %g" % accuracy)

In [32]:
# Number of test rows for which the random forest predicted class 0.
rfpredictions.where(col("prediction") == 0.0).count()

300323