<a href="https://colab.research.google.com/github/alfredwisana/big-data_project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session. 'local[2]' is a master URL (run locally with
# two worker threads), so it is passed to master(); the application name is 'myApp'.
spark = SparkSession.builder.master('local[2]').appName('myApp').getOrCreate()
import matplotlib.pyplot as plt
import seaborn

In [None]:
# Load the flight dataset CSV into a Spark DataFrame (first row is the header,
# column types are inferred) and preview the first 30 rows

csv_reader = spark.read.format('csv').options(header=True, inferSchema=True)
df = csv_reader.load('Clean_Dataset.csv')
df.show(30, truncate=50)


+---+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|_c0|  airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+---+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|  0| SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|  1| SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|  2|  AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|  3|  Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|  4|  Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
|  5|  Vistara| UK-945|      Del

In [None]:
# Print the inferred schema (column names, data types, nullability) of the loaded DataFrame

df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- class: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- days_left: integer (nullable = true)
 |-- price: integer (nullable = true)



In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [None]:
# For every column, count the rows whose value is NaN or null
missing_counts = [
    count(when(isnan(name) | isnull(name), 1)).alias(name)
    for name in df.columns
]
df.select(missing_counts).show()

+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|_c0|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|  0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



<h1>Data Preprocessing</h1>

In [None]:
from pyspark.ml.feature import Normalizer,VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

In [None]:
from pyspark.sql.functions import col, max as spark_max, abs as spark_abs

# Largest absolute value found in the 'price' column
max_abs_column = spark_max(spark_abs(col("price"))).alias("max_abs_value")
max_abs_row = df.select(max_abs_column).collect()[0]
max_abs_value = max_abs_row["max_abs_value"]


In [None]:
# Show the maximum absolute price
print(max_abs_value)
# Number of digits in its integer part (bare expression: displayed as the cell output)
len(str(int(max_abs_value)))

123071


6

In [None]:
# Divide every price by 10 ** (digits in the maximum absolute price - 1)
# and store the result in a new 'normalized_price' column
digit_count = len(str(int(max_abs_value)))
scaling_factor = 10 ** (digit_count - 1)
normalized_price = col("price") / scaling_factor
df = df.withColumn("normalized_price", normalized_price)
df.show(30, truncate=50)

+---+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+----------------+
|_c0|  airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|normalized_price|
+---+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+----------------+
|  0| SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|         0.05953|
|  1| SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|         0.05953|
|  2|  AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|         0.05956|
|  3|  Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|         0.05955|
|  4|  Vistara| UK-963|      Delhi

In [None]:
# List of all column names of the DataFrame
column_name = df.columns

In [None]:
# Columns left out of the modelling DataFrame: the unnamed leading column ('_c0'),
# the raw price (its normalized counterpart is kept) and the flight code.
excluded_columns = {'_c0', 'price', 'flight'}

# Rebuild the list from df.columns instead of calling list.remove() in place, so this
# cell can be re-run without raising ValueError on a name that was already removed.
column_name = [name for name in df.columns if name not in excluded_columns]
print(column_name)

['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class', 'duration', 'days_left', 'normalized_price']


In [None]:
# Restrict the DataFrame to the selected modelling columns
final_df = df.select(column_name)

# Index every string-valued column into a numeric '<column>_idx' column.
# The loop variable is not called `col`, which is also the name of the
# pyspark.sql.functions helper used elsewhere in this notebook.
categorical_columns = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
indexers = []
for categorical in categorical_columns:
    indexer = StringIndexer(inputCol=categorical, outputCol=categorical + '_idx')
    indexers.append(indexer.fit(final_df))

pipeline = Pipeline(stages=indexers)
indexing_model = pipeline.fit(final_df)
final_df = indexing_model.transform(final_df)
final_df.show()

+---------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+
|  airline|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|normalized_price|airline_idx|source_city_idx|departure_time_idx|stops_idx|arrival_time_idx|destination_city_idx|class_idx|
+---------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+
| SpiceJet|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1|         0.05953|        5.0|            0.0|               2.0|      1.0|             0.0|                 0.0|      0.0|
| SpiceJet|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        

In [None]:
# Model inputs: the indexed categorical columns plus the two numeric columns
feature_columns = ['airline_idx', 'source_city_idx', 'departure_time_idx', 'stops_idx', 'arrival_time_idx', 'destination_city_idx', 'class_idx', 'duration', 'days_left']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Drop any 'features' column left over from a previous run of this cell before
# assembling: VectorAssembler rejects an output column that already exists, so the
# cell would otherwise fail when re-executed. drop() is a no-op if the column is absent.
final_df = assembler.transform(final_df.drop("features"))
final_df.show(30, truncate=50)

+---------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+---------------------------------------+
|  airline|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|normalized_price|airline_idx|source_city_idx|departure_time_idx|stops_idx|arrival_time_idx|destination_city_idx|class_idx|                               features|
+---------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+---------------------------------------+
| SpiceJet|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1|         0.05953|        5.0|            0.0|               2.0|      1.0|             0.0|            

In [None]:
# Keep only the assembled feature vector and the normalized price used as the label
feature_vector_df = final_df.select('features', 'normalized_price')
feature_vector_df.show(truncate=False)

+---------------------------------------+----------------+
|features                               |normalized_price|
+---------------------------------------+----------------+
|[5.0,0.0,2.0,1.0,0.0,0.0,0.0,2.17,1.0] |0.05953         |
|[5.0,0.0,1.0,1.0,2.0,0.0,0.0,2.33,1.0] |0.05953         |
|[4.0,0.0,1.0,1.0,4.0,0.0,0.0,2.17,1.0] |0.05956         |
|(9,[3,4,7,8],[1.0,3.0,2.25,1.0])       |0.05955         |
|(9,[3,4,7,8],[1.0,2.0,2.33,1.0])       |0.05955         |
|(9,[3,4,7,8],[1.0,3.0,2.33,1.0])       |0.05955         |
|(9,[3,4,7,8],[1.0,2.0,2.08,1.0])       |0.0606          |
|[0.0,0.0,4.0,1.0,1.0,0.0,0.0,2.17,1.0] |0.0606          |
|[3.0,0.0,1.0,1.0,2.0,0.0,0.0,2.17,1.0] |0.05954         |
|[3.0,0.0,4.0,1.0,1.0,0.0,0.0,2.25,1.0] |0.05954         |
|[3.0,0.0,4.0,1.0,1.0,0.0,0.0,2.25,1.0] |0.05954         |
|[3.0,0.0,0.0,1.0,3.0,0.0,0.0,2.33,1.0] |0.05954         |
|[2.0,0.0,1.0,1.0,2.0,0.0,0.0,2.17,1.0] |0.05955         |
|[2.0,0.0,0.0,1.0,3.0,0.0,0.0,2.17,1.0] |0.05955        

<h3>Regression Model</h3>

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Split the data 80/20 into training and test sets using a fixed seed
split_parts = feature_vector_df.randomSplit([0.8, 0.2], seed=2)
trainData, testData = split_parts

<h5>Gradient-Boosted Trees (GBT) Regressor</h5>

In [None]:
from pyspark.ml.regression import GBTRegressor



<h5>Support Vector Regressor</h5>

In [None]:
# pyspark.ml.regression exports no symbol named `Linear`, so importing it raises
# ImportError; LinearRegression is the linear model that module provides.
# NOTE(review): pyspark.ml.regression has no support vector regressor -- confirm that
# LinearRegression is the estimator intended for this section.
from pyspark.ml.regression import LinearRegression

<h5>Random Forest Regressor</h5>

In [None]:
from pyspark.ml.regression import RandomForestRegressor

# Fit a random forest on the training split. An explicit seed is supplied so the
# forest's random sampling -- and therefore the reported metric -- is repeatable
# across kernel restarts (the same seed value is used for the train/test split).
rf = RandomForestRegressor(featuresCol='features', labelCol='normalized_price', seed=2).fit(trainData)

# Score the held-out test split; transform() adds a 'prediction' column
prediction = rf.transform(testData)

In [None]:
# Preview the first 10 scored test rows: features, actual normalized price, prediction
prediction.show(10, truncate=50)

+----------------------------------+----------------+-------------------+
|                          features|normalized_price|         prediction|
+----------------------------------+----------------+-------------------+
| (9,[0,2,7,8],[1.0,1.0,13.25,6.0])|         0.14775|0.11882135735388208|
|(9,[0,2,7,8],[1.0,1.0,13.25,16.0])|         0.07262|0.08125145672389493|
|(9,[0,2,7,8],[1.0,1.0,13.25,28.0])|         0.05281|  0.080679872207448|
|(9,[0,2,7,8],[1.0,1.0,13.25,31.0])|         0.06231|  0.080679872207448|
|(9,[0,2,7,8],[1.0,1.0,13.25,34.0])|         0.06231|  0.080679872207448|
|(9,[0,2,7,8],[1.0,1.0,13.25,37.0])|          0.0536|  0.080679872207448|
|(9,[0,2,7,8],[1.0,1.0,13.25,42.0])|         0.04939|  0.080679872207448|
|(9,[0,2,7,8],[1.0,1.0,13.25,43.0])|         0.05702|  0.080679872207448|
|(9,[0,2,7,8],[1.0,1.0,13.67,47.0])|          0.0536|  0.080679872207448|
| (9,[0,2,7,8],[1.0,1.0,14.92,6.0])|          0.1215|0.11882135735388208|
+----------------------------------+--

In [None]:
# Root mean square error between the predicted and actual 'normalized_price' on the
# test split. The value is expressed in normalized units (the label column), not in
# the units of the original 'price' column.
evaluator = RegressionEvaluator(labelCol='normalized_price', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(prediction)
# The metric is an error measure, so the message names it "Root Mean Square Error"
print("Root Mean Square Error of Random Forest model is %g" % rmse)

Root Mean Square of Random Forest model is 0.0601662


<h3>Error</h3>

In [None]:

# from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
# Call stop() to shut the Spark session down; the bare attribute reference `spark.stop`
# only evaluates to the bound method and leaves the session running.
spark.stop()

<bound method SparkSession.stop of <pyspark.sql.session.SparkSession object at 0x00000220DB3C98D0>>