In [2]:
from pyspark.sql.functions import lit, col, udf, avg
from pyspark.sql.types import IntegerType, FloatType

In [3]:
inputDF = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load("datathon_tadata.csv")

In [4]:
inputDF.createOrReplaceTempView("input_data")

In [5]:
inputDF.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- day: timestamp (nullable = true)
 |-- gender: integer (nullable = true)
 |-- p_sessionActivity: integer (nullable = true)
 |-- p_AddToCart: integer (nullable = true)
 |-- p_trafficChannel: string (nullable = true)
 |-- p_sessionDuration: integer (nullable = true)
 |-- p_pageViews: integer (nullable = true)
 |-- daysToCheckin: string (nullable = true)
 |-- osType: integer (nullable = true)
 |-- osTypeName: string (nullable = true)
 |-- daysFromPreviousVisit: integer (nullable = true)
 |-- p_TotalPrice: string (nullable = true)
 |-- isExclusiveMember: integer (nullable = true)
 |-- loggedIn: integer (nullable = true)
 |-- p_MapInteraction: integer (nullable = true)
 |-- BookingPurchase: integer (nullable = true)



inputDF.printSchema()

In [6]:
selectDF = inputDF.select("user_id", "gender", "p_sessionActivity", "p_AddToCart", "p_trafficChannel", "p_sessionDuration", "p_pageViews", "daysToCheckin", "osType", "daysFromPreviousVisit", "p_TotalPrice", "isExclusiveMember", "loggedIn", "p_MapInteraction", "BookingPurchase").dropna()
selectDF.printSchema()
# inputDF.filter(inputDF["daysToCheckin"] != "NA").count()
# inputDF.select("user_id").distinct().count()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- p_sessionActivity: integer (nullable = true)
 |-- p_AddToCart: integer (nullable = true)
 |-- p_trafficChannel: string (nullable = true)
 |-- p_sessionDuration: integer (nullable = true)
 |-- p_pageViews: integer (nullable = true)
 |-- daysToCheckin: string (nullable = true)
 |-- osType: integer (nullable = true)
 |-- daysFromPreviousVisit: integer (nullable = true)
 |-- p_TotalPrice: string (nullable = true)
 |-- isExclusiveMember: integer (nullable = true)
 |-- loggedIn: integer (nullable = true)
 |-- p_MapInteraction: integer (nullable = true)
 |-- BookingPurchase: integer (nullable = true)



In [7]:
avg_price = float(selectDF.select(avg("p_TotalPrice")).take(1)[0][0])
avg_checkin_days = float(selectDF.select(avg("daysToCheckin")).take(1)[0][0])

In [8]:
print(avg_price, avg_checkin_days)

# UDF to filter and replace value
def filterNA(cell_val, check_val, replace_val):
    print(cell_val)
    if (cell_val == check_val):
        return replace_val
    else:
        return float(cell_val)

filter_na_df = udf(filterNA, FloatType())
cleanedDF = selectDF \
.withColumn("cleaned_daysToCheckin", filter_na_df("daysToCheckin", lit("NA"), lit(avg_checkin_days))) \
.withColumn("cleaned_totalPrice", filter_na_df("p_TotalPrice", lit("NA"), lit(avg_price))) \
.drop("daysToCheckin", "p_TotalPrice")

1323.226778024252 68.29791432633138


In [9]:
cleanedDF.count()

1000000

In [10]:
inputDF.select(col("daysToCheckin")).distinct().show(inputDF.count())

+-------------+
|daysToCheckin|
+-------------+
|          296|
|          125|
|            7|
|           51|
|          124|
|          447|
|          307|
|          169|
|          205|
|          544|
|          272|
|           15|
|           54|
|          232|
|          234|
|          282|
|          383|
|          155|
|          154|
|          132|
|          317|
|          200|
|          388|
|          495|
|           11|
|          101|
|          279|
|          415|
|          433|
|          138|
|          323|
|          351|
|          361|
|          387|
|           29|
|           69|
|          309|
|           42|
|          112|
|           73|
|           87|
|          468|
|           64|
|          308|
|          348|
|          356|
|            3|
|           30|
|          113|
|          432|
|           34|
|          133|
|          287|
|          365|
|          389|
|          162|
|           59|
|          139|
|          146|
|       

In [11]:
from pyspark.ml.feature import StringIndexer

cleanedDF.select("p_trafficChannel").show()
indexer = StringIndexer(inputCol="p_trafficChannel", outputCol="trafficChannelIndex")
indexedDF = indexer.fit(cleanedDF).transform(cleanedDF).drop("p_trafficChannel")
indexedDF.show()

+----------------+
|p_trafficChannel|
+----------------+
|               O|
|               O|
|               O|
|               O|
|               O|
|               A|
|               A|
|               O|
|               H|
|               O|
|               O|
|               O|
|               O|
|               O|
|               O|
|               H|
|               A|
|               O|
|               O|
|               A|
+----------------+
only showing top 20 rows

+-------+------+-----------------+-----------+-----------------+-----------+------+---------------------+-----------------+--------+----------------+---------------+---------------------+------------------+-------------------+
|user_id|gender|p_sessionActivity|p_AddToCart|p_sessionDuration|p_pageViews|osType|daysFromPreviousVisit|isExclusiveMember|loggedIn|p_MapInteraction|BookingPurchase|cleaned_daysToCheckin|cleaned_totalPrice|trafficChannelIndex|
+-------+------+-----------------+-----------+-----------------+

In [12]:
indexedDF.count()
indexedDF.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- p_sessionActivity: integer (nullable = true)
 |-- p_AddToCart: integer (nullable = true)
 |-- p_sessionDuration: integer (nullable = true)
 |-- p_pageViews: integer (nullable = true)
 |-- osType: integer (nullable = true)
 |-- daysFromPreviousVisit: integer (nullable = true)
 |-- isExclusiveMember: integer (nullable = true)
 |-- loggedIn: integer (nullable = true)
 |-- p_MapInteraction: integer (nullable = true)
 |-- BookingPurchase: integer (nullable = true)
 |-- cleaned_daysToCheckin: float (nullable = true)
 |-- cleaned_totalPrice: float (nullable = true)
 |-- trafficChannelIndex: double (nullable = true)



In [13]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

encoder1 = OneHotEncoder(inputCol="trafficChannelIndex", outputCol="trafficChannelVec")
encodedDF1 = encoder1.transform(indexedDF)
encoder2 = OneHotEncoder(inputCol="osType", outputCol="osTypeVec")
encodedDF2 = encoder2.transform(encodedDF1)
# encodedDF2.show()

In [14]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["p_sessionActivity", "p_sessionDuration", "p_pageViews", "osTypeVec", "daysFromPreviousVisit", "isExclusiveMember", "loggedIn", "p_MapInteraction", "trafficChannelVec", "cleaned_totalPrice", "cleaned_daysToCheckin"],
    outputCol="features")

featureDF = assembler.transform(encodedDF2).select("features", col("BookingPurchase").alias("label"))
featureDF.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(30,[0,1,2,11,19,...|    0|
|(30,[1,2,10,19,23...|    0|
|(30,[0,1,2,10,20,...|    1|
|(30,[1,2,10,19,23...|    0|
|(30,[1,2,10,19,23...|    0|
|(30,[0,1,2,10,19,...|    1|
|(30,[0,1,2,10,24,...|    0|
|(30,[1,2,11,19,23...|    1|
|(30,[1,2,13,19,25...|    1|
|(30,[0,1,2,10,19,...|    0|
|(30,[1,2,10,19,23...|    0|
|(30,[1,2,10,19,23...|    0|
|(30,[0,1,2,10,19,...|    0|
|(30,[1,2,16,19,23...|    0|
|(30,[1,2,11,19,23...|    0|
|(30,[1,2,10,19,21...|    1|
|(30,[0,1,2,13,19,...|    1|
|(30,[1,2,11,19,23...|    1|
|(30,[0,1,2,10,19,...|    1|
|(30,[0,1,2,13,19,...|    0|
+--------------------+-----+
only showing top 20 rows



In [15]:
#X = featureDF.select("features").rdd.map(lambda row: (row[0].toArray())).collect()
#y = featureDF.select("label").rdd.map(lambda row: (row[0])).collect()

In [37]:
presentDF = featureDF.filter(col("label") == 1)

In [55]:
featureDF = featureDF.union(presentDF)
# .union(presentDF).union(presentDF)

In [47]:
#input_X.shape
df = featureDF.toPandas()

In [28]:
#import numpy as np
#input_X = np.array(X)
#output_y = np.array(y)
np_mat = df.as_matrix()

In [33]:
df.values.shape

(1000000, 2)

In [18]:
df = featureDF.toPandas()

label_yes = df['label'] == 1
df_yes = df[label_yes]
df_oversampled = df.append([df_yes] * 4, ignore_index=True)
df_oversampled.shape

(1861360, 2)

In [26]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
Y = np.array(df_oversampled['label'])
print(Y)
X = np.array(df_oversampled[df_oversampled.columns[:-1]])
X.shape
#dt = DecisionTreeClassifier(random_state=0)
#dt.fit(X, Y)
#scores = cross_val_score(dt, X, Y, cv=10)
#scores

[0 0 1 ..., 1 1 1]


(1861360, 1)

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = svm.SVC()
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
y_predict = clf.predict(X_test)
accuracy_score(y_test, t_predict)

In [56]:
from pyspark.ml.feature import Normalizer, StandardScaler

# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)
normDF = normalizer.transform(featureDF)
print("Normalized using L^1 norm")

scaler = StandardScaler(inputCol="normFeatures", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(normDF)

# Normalize each feature to have unit standard deviation.
scaledDF = scalerModel.transform(normDF).select(col("scaledFeatures").alias("features"), "label")
# scaledDF.show(truncate=False).map(lambda row: row.get[0].toArray(), )

Normalized using L^1 norm


In [65]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

train, test = scaledDF.randomSplit([0.9, 0.1], seed=12345)

lr = LogisticRegression(maxIter=50)


# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.3, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.01, 0.1, 0.5, 0.8, 1.0])\
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=BinaryClassificationEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.7)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train)

In [58]:
# Make predictions on test data. model is the model with combination of parameters
# that performed best.
tvresult = model.transform(test)
tvresult.select("features", "label", "prediction").show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(30,[0,1,2,4,19,2...|    1|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    1|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    1|       1.0|
|(30,[0,1,2,4,19,2...|    1|       1.0|
|(30,[0,1,2,4,19,2...|    1|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    0|       1.0|
|(30,[0,1,2,4,19,2...|    1|       1.0|
|(30,[0,1,2,4,19,2...|    1|       1.0|
+--------------------+-----+----------+
only showing top 20 rows



In [59]:
def mapRow(row):
    if (row[0] == int(row[1])): 
         return 1
    else:
         return 0

In [60]:
numCorrectPredictions = tvresult.select("label", "prediction").rdd.map(lambda row: mapRow(row)).reduce(lambda a, b: a+b)

In [61]:
numCorrectPredictions
accuracy = 1.0 * numCorrectPredictions / tvresult.count()

In [62]:
print("Test set accuracy: " + str(accuracy))

Test set accuracy: 0.5873891805491167


In [48]:
#Multi Layer perceptron
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split the data into train and test
splits = scaledDF.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [30, 25, 20, 10, 7, 4, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
percResult = model.transform(test)
predictionAndLabels = percResult.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.7837440394811431


In [68]:
# Random Forest
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split the data into training and test sets (30% held out for testing)
#(trainingData, testData) = scaledDF.randomSplit([0.7, 0.3])
trainingData = scaledDF
testData = scaledDF
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=25)

# Train model.  This also runs the indexers.
rfmodel = rf.fit(trainingData)

# Make predictions.
rfpredictions = rfmodel.transform(testData)

# Select example rows to display.
rfpredictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(rfpredictions)
print("Test Accuracy = %g" % accuracy)

# rfModel = rfmodel.stages[2]
# print(rfModel)  # summary only

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|    0|(30,[0,1,2,11,19,...|
|       1.0|    0|(30,[1,2,10,19,23...|
|       1.0|    1|(30,[0,1,2,10,20,...|
|       1.0|    0|(30,[1,2,10,19,23...|
|       1.0|    0|(30,[1,2,10,19,23...|
+----------+-----+--------------------+
only showing top 5 rows

Test Accuracy = 0.590388


In [70]:
rfpredictions.filter(col("prediction") == 1.0).count()

1753476

In [54]:
rfpredictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(30,[0,1,2,4,19,2...|    1|[9.34620323199338...|[0.37384812927973...|       1.0|
|(30,[0,1,2,4,19,2...|    1|[9.21128345186936...|[0.36845133807477...|       1.0|
|(30,[0,1,2,4,19,2...|    0|[9.13431756328675...|[0.36537270253147...|       1.0|
|(30,[0,1,2,4,19,2...|    1|[9.17334883731972...|[0.36693395349278...|       1.0|
|(30,[0,1,2,4,19,2...|    0|[9.46915528655994...|[0.37876621146239...|       1.0|
|(30,[0,1,2,4,19,2...|    1|[9.35181320488357...|[0.37407252819534...|       1.0|
|(30,[0,1,2,4,19,2...|    0|[9.51067735192804...|[0.38042709407712...|       1.0|
|(30,[0,1,2,4,19,2...|    1|[9.72069373616058...|[0.38882774944642...|       1.0|
|(30,[0,1,2,4,19,2...|    0|[9.87103413617538...|[0.39484136544701...|       1.0|
|(30,[0,1,2,4,19

In [67]:
tvresult.filter(col("prediction") == 0.0).count()
#tvresult.show()

29828