In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os

# Tell the kernel where the JDK and the unpacked Spark distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.1-bin-hadoop2.7",
    }
)

In [3]:
import findspark

# Must run before importing pyspark: findspark locates the Spark install
# (via SPARK_HOME set above) and makes the pyspark package importable.
findspark.init()

from pyspark.sql import SparkSession

# Create the notebook's one local session, using all available cores.
# The application name is set here, at creation time, because a later
# getOrCreate() call returns this already-running session and cannot
# rename its SparkContext.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Neural Network Model")
    .getOrCreate()
)

In [4]:
from pyspark.sql import SparkSession

In [5]:
# NOTE(review): an earlier cell already created a session, so getOrCreate() is
# expected to hand back that same session; the appName and
# spark.executor.memory requested here may therefore not take effect —
# confirm against the SparkContext summary shown in the next cell.
spark = SparkSession.builder \
   .appName("Neural Network Model") \
   .config("spark.executor.memory", "3gb") \
   .getOrCreate()
   
# Keep a handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [6]:
# Display the SparkContext as the cell output to confirm it is up.
sc

# **1. ธุรกิจต้องการ Machine Learning Model ที่ช่วยให้ทราบล่วงหน้าว่า ลูกค้าที่กำลังเลือกดูสินค้าอยู่จะซื้อสินค้าหรือไม่ (Binary Classification)**

# **2. Data Understanding**

In [7]:
! wget https://storage.googleapis.com/class25jan2022/share/testfunnel.csv

--2022-02-11 05:18:35--  https://storage.googleapis.com/class25jan2022/share/testfunnel.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.216.128, 173.194.217.128, 173.194.218.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.216.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1501012 (1.4M) [text/csv]
Saving to: ‘testfunnel.csv’


2022-02-11 05:18:35 (120 MB/s) - ‘testfunnel.csv’ saved [1501012/1501012]



In [8]:
# Show the header line and the first two data rows of the downloaded CSV.
! head -3 testfunnel.csv

source,isTrueDirect,sourceKeyword,medium,isVideoAd,fullVisitorId,visitId,date,newVisits,hitReferer,hitType,hitAction_type,hitNumber,hitHour,hitMin,timeMicroSec,v2ProductName,productListName,isClick,isImpression,sessionQualityDim,timeOnScreen,timeOnSite,totalTransactionRevenue
google,true,(not provided),organic,,4988612949713423910,1489617360,20170315,,,PAGE,0,28,15,46,608771,Google Accent Insulated Stainless Steel Bottle,Category,,true,,,1897,
google,true,(not provided),organic,,4988612949713423910,1489617360,20170315,,,PAGE,0,27,15,46,605064,Google Pocket Bluetooth Speaker,Category,,true,,,1897,


In [9]:
# Count the lines in the file (this total includes the header line).
! wc -l testfunnel.csv

10000 testfunnel.csv


In [10]:
# Load the funnel CSV: the first line supplies the column names and Spark
# infers each column's type from the data.
raw_df = spark.read.csv('./testfunnel.csv', header=True, inferSchema=True)

In [11]:
# Number of data rows loaded into the DataFrame.
raw_df.count()

9999

In [12]:
# Number of columns in the raw data.
len(raw_df.columns)

24

In [13]:
# Peek at roughly 0.1% of the rows, transposed so each column reads as one row.
# The seed is fixed so that a rerun of the notebook shows the same sample.
raw_df.sample(fraction=0.001, seed=42).toPandas().transpose()

Unnamed: 0,0,1,2,3,4,5,6
source,(direct),(direct),(direct),google,youtube.com,(direct),(direct)
isTrueDirect,True,,True,,,,True
sourceKeyword,,,,(not provided),,,
medium,(none),(none),(none),organic,referral,(none),(none)
isVideoAd,,,,,,,
fullVisitorId,4988857276718887102,4990198874063509353,4993888514053304960,4994248346633828032,4995280515010061307,4995845223254722667,4995899860057743088
visitId,1496382237,1477021389,1492820576,1488049502,1479316504,1486823509,1474047117
date,20170601,20161020,20170421,20170225,20161116,20170211,20160916
newVisits,1.0,1.0,1.0,1.0,1.0,1.0,
hitReferer,,,,https://www.google.cz/,https://www.youtube.com/yt/about/,,


In [None]:
# Summary statistics from describe(), pulled into pandas and transposed so
# that each source column is shown as one row.
raw_df.describe().toPandas().transpose()

In [None]:
# Print the column names together with the types Spark inferred for them.
raw_df.printSchema()

In [None]:
# Number of rows recorded for each fullVisitorId.
raw_df.groupBy('fullVisitorId').count().show()

In [None]:
# Value counts of the isTrueDirect column.
raw_df.groupBy('isTrueDirect').count().show()

In [None]:
from pyspark.sql import functions as sparkf

In [None]:
# Remove the seven columns that are not carried forward into data preparation.
selectedCol_df = raw_df.drop(
    'isVideoAd', 'isClick', 'isImpression',
    'sourceKeyword', 'hitReferer', 'timeOnScreen', 'sessionQualityDim',
)

In [None]:
# Row count after dropping columns (dropping columns does not remove rows).
selectedCol_df.count()

In [None]:
# Number of columns remaining after the drop.
len(selectedCol_df.columns)

In [None]:
# Names and types of the remaining columns.
selectedCol_df.printSchema()

# **3. Data Preparation**

In [None]:
from pyspark.sql.types import *

In [None]:
def _null_to_false(value):
    """Return the string 'false' for a missing value, otherwise the value itself."""
    if value is None:
        return 'false'
    return value


# Column-level UDF wrapper around the helper above.
sparkf_NulltoFalse = sparkf.udf(_null_to_false)

In [None]:
# Re-check the row count before the null handling below.
selectedCol_df.count()

## จัดการค่า Null

In [None]:
# Preview only (nothing is assigned): cast isTrueDirect to string, replace
# nulls with 'false', and show the resulting value counts.
(
    selectedCol_df
    .withColumn(
        'isTrueDirect',
        sparkf_NulltoFalse(sparkf.col('isTrueDirect').cast(StringType())),
    )
    .groupBy('isTrueDirect')
    .count()
    .show()
)

In [None]:
def _new_visit_flag(value):
    """Return 'true' only when the value equals the string '1'.

    Every other input, including None, yields 'false'.
    """
    if value == '1':
        return 'true'
    return 'false'


# Column-level UDF wrapper around the helper above.
sparkf_treatNewVisits = sparkf.udf(_new_visit_flag)

In [None]:
# Preview only (nothing is assigned): cast newVisits to string, map it to
# 'true'/'false', and show the resulting value counts.
(
    selectedCol_df
    .withColumn(
        'newVisits',
        sparkf_treatNewVisits(sparkf.col('newVisits').cast(StringType())),
    )
    .groupBy('newVisits')
    .count()
    .show()
)

In [None]:
def _revenue_or_zero(revenue):
    """Replace a missing revenue with 0; pass any other value through unchanged."""
    # Identity test: `is None` is the idiomatic null check and cannot be
    # affected by a custom __eq__, unlike `== None`.
    return 0 if revenue is None else revenue


# No return type is passed, so the UDF uses sparkf.udf's default return type.
# NOTE(review): that default is presumably string, which is why the pipeline
# casts this column to double right after applying the UDF — confirm.
sparkf_treatRevenue = sparkf.udf(_revenue_or_zero)

In [None]:
# Check the column types again before building the prepared frame.
selectedCol_df.printSchema()

In [None]:
def _label_from_revenue(revenue):
    """Map a revenue value to the class label.

    Returns 'buy' when revenue is greater than zero, otherwise 'notBuy'.
    A missing revenue (None) is labelled 'notBuy' rather than raising
    TypeError on the `> 0` comparison.
    """
    if revenue is None:
        return 'notBuy'
    return 'buy' if revenue > 0 else 'notBuy'


# Column-level UDF wrapper around the helper above.
sparkf_createLabel = sparkf.udf(_label_from_revenue)

In [None]:
# Build the prepared frame in four column steps:
#   isTrueDirect            -> string, nulls replaced by 'false'
#   newVisits               -> string flag 'true'/'false'
#   totalTransactionRevenue -> nulls replaced by 0, then cast to double
#   label                   -> 'buy'/'notBuy' derived from the cleaned revenue
crunched_df = (
    selectedCol_df
    .withColumn(
        'isTrueDirect',
        sparkf_NulltoFalse(sparkf.col('isTrueDirect').cast(StringType())),
    )
    .withColumn(
        'newVisits',
        sparkf_treatNewVisits(sparkf.col('newVisits').cast(StringType())),
    )
    .withColumn(
        'totalTransactionRevenue',
        sparkf_treatRevenue(sparkf.col('totalTransactionRevenue')).cast(DoubleType()),
    )
    .withColumn('label', sparkf_createLabel(sparkf.col('totalTransactionRevenue')))
)

In [None]:
# Confirm the column types after the transformations.
crunched_df.printSchema()

In [None]:
# Inspect up to the first 100 prepared rows.
crunched_df.show(100)

In [None]:
# NOTE(review): a bare describe() returns a DataFrame, so this cell presumably
# displays only its repr; the statistics themselves are rendered by the
# toPandas() cell that follows.
crunched_df.describe()

In [None]:
# Summary statistics of the prepared frame, one row per column.
crunched_df.describe().toPandas().transpose()

In [None]:
# How many rows remain once rows containing any null are removed.
crunched_df.dropna().count()

In [None]:
# Final dataset: the prepared frame with rows that contain a null removed.
final_df = crunched_df.na.drop()

In [None]:
# Summary statistics of the final dataset, printed as a Spark table.
final_df.describe().show()

In [None]:
# The same statistics, transposed via pandas so each column is one row.
final_df.describe().toPandas().transpose()

In [None]:
# Preview the first rows of the final dataset.
final_df.show()

In [None]:
# Class balance: number of rows for each label value.
final_df.groupBy('label').count().show()