<a href="https://colab.research.google.com/github/alfredwisana/big-data_project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('local[*]').appName('myApp').getOrCreate()
import matplotlib.pyplot as plt
import seaborn

In [2]:
# Read the CSV file containing Titanic dataset into Spark's DataFrame and show it

df = spark.read.format('csv').options(header=True, inferSchema=True).load('Clean_Dataset.csv')
df.show(30,truncate=50)

+---+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|_c0|  airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+---+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|  0| SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|  1| SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|  2|  AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|  3|  Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|  4|  Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
|  5|  Vistara| UK-945|      Del

In [3]:
# Try to print the schema

df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- class: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- days_left: integer (nullable = true)
 |-- price: integer (nullable = true)



In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [5]:
df.select([count(when(isnan(c) | isnull(c),1)).alias(c) for c in df.columns]).show()

+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|_c0|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|  0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+---+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



<h1>Data Preprocessing

In [4]:
from pyspark.ml.feature import Normalizer,VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

In [5]:
from pyspark.sql.functions import col, max as spark_max, abs as spark_abs

max_abs_value = df.select(spark_max(spark_abs(col("price"))).alias("max_abs_value")).collect()[0]["max_abs_value"]


In [6]:
print(max_abs_value)
len(str(int(max_abs_value)))

123071


6

In [7]:
scaling_factor = 10 ** (len(str(int(max_abs_value)))-1)
df = df.withColumn("normalized_price", col("price") / scaling_factor)
df.show(30,truncate=50)

+---+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+----------------+
|_c0|  airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|normalized_price|
+---+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+----------------+
|  0| SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|         0.05953|
|  1| SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|         0.05953|
|  2|  AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|         0.05956|
|  3|  Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|         0.05955|
|  4|  Vistara| UK-963|      Delhi

In [48]:
column_name = df.columns

In [50]:
print(column_name)

['_c0', 'airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class', 'duration', 'days_left', 'price', 'normalized_price']


In [9]:
column_name.remove('_c0')
column_name.remove('price')
column_name.remove('flight')
print(column_name)

['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class', 'duration', 'days_left', 'normalized_price']


In [27]:
final_df = df.select(column_name)


indexers = [StringIndexer(inputCol=col, outputCol=col+'_idx').fit(final_df) for col in ['airline','source_city','departure_time', 'stops', 'arrival_time', 'destination_city','class']]
pipeline = Pipeline(stages=indexers)
final_df = pipeline.fit(final_df).transform(final_df)
final_df.show()

+---------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+
|  airline|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|normalized_price|airline_idx|source_city_idx|departure_time_idx|stops_idx|arrival_time_idx|destination_city_idx|class_idx|
+---------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+
| SpiceJet|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1|         0.05953|        5.0|            0.0|               2.0|      1.0|             0.0|                 0.0|      0.0|
| SpiceJet|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        

In [28]:
assembler = VectorAssembler(inputCols=['airline_idx', 'source_city_idx', 'departure_time_idx', 'stops_idx', 'arrival_time_idx', 'destination_city_idx', 'class_idx','duration', 'days_left'], outputCol="features")
final_df = assembler.transform(final_df)
final_df.show(30,truncate=50)

+---------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+---------------------------------------+
|  airline|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|normalized_price|airline_idx|source_city_idx|departure_time_idx|stops_idx|arrival_time_idx|destination_city_idx|class_idx|                               features|
+---------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+---------------------------------------+
| SpiceJet|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1|         0.05953|        5.0|            0.0|               2.0|      1.0|             0.0|            

In [29]:
# Initialize the StandardScaler
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withMean=True, withStd=True)

# Fit the scaler to the data
scaler_model = scaler.fit(final_df)

# Transform the data
final_df = scaler_model.transform(final_df)

final_df.show(10,truncate=10)

+--------+-----------+--------------+-----+------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+----------+--------------+
| airline|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|normalized_price|airline_idx|source_city_idx|departure_time_idx|stops_idx|arrival_time_idx|destination_city_idx|class_idx|  features|scaledFeatures|
+--------+-----------+--------------+-----+------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+----------+--------------+
|SpiceJet|      Delhi|       Evening| zero|       Night|          Mumbai|Economy|    2.17|        1|         0.05953|        5.0|            0.0|               2.0|      1.0|             0.0|                 0.0|      0.0|[5.0,0....|    [2.8530...|
|Spi

In [30]:
from pyspark.ml.feature import PCA


pca = PCA(k=2, inputCol="scaledFeatures", outputCol="pcafeatures")
model = pca.fit(final_df)

# Step 5: Transform Data
final_df = model.transform(final_df)


In [31]:
final_df.show(10, truncate=50)

+--------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+--------------------------------------+--------------------------------------------------+------------------------------------------+
| airline|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|normalized_price|airline_idx|source_city_idx|departure_time_idx|stops_idx|arrival_time_idx|destination_city_idx|class_idx|                              features|                                    scaledFeatures|                               pcafeatures|
+--------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+----------------+-----------+---------------+------------------+---------+----------------+--------------------+---------+--------------------------------------+------------------------

In [42]:
feature_vector_df = final_df.select(['features','pcafeatures', 'normalized_price'])
feature_vector_df.show(truncate=False)

+---------------------------------------+------------------------------------------+----------------+
|features                               |pcafeatures                               |normalized_price|
+---------------------------------------+------------------------------------------+----------------+
|[5.0,0.0,2.0,1.0,0.0,0.0,0.0,2.17,1.0] |[-3.1081011449371876,-0.1376955221666359] |0.05953         |
|[5.0,0.0,1.0,1.0,2.0,0.0,0.0,2.33,1.0] |[-3.385823571052269,-0.05359819764616734] |0.05953         |
|[4.0,0.0,1.0,1.0,4.0,0.0,0.0,2.17,1.0] |[-3.291962039575337,-0.047813047475043405]|0.05956         |
|(9,[3,4,7,8],[1.0,3.0,2.25,1.0])       |[-1.3797307554263523,-0.04886572328787585]|0.05955         |
|(9,[3,4,7,8],[1.0,2.0,2.33,1.0])       |[-1.2129029513930918,-0.05904295138764902]|0.05955         |
|(9,[3,4,7,8],[1.0,3.0,2.33,1.0])       |[-1.374581190423971,-0.049780450307802857]|0.05955         |
|(9,[3,4,7,8],[1.0,2.0,2.08,1.0])       |[-1.228995342025533,-0.05618442945037708]

<h3> Regression Model

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator

In [44]:
(trainData, testData) = feature_vector_df.randomSplit([0.8,0.2],seed = 2)

<h5> XGBoost

In [16]:
from pyspark.ml.regression import GBTRegressor
import xgboost as xg
import sklearn
import numpy as np

In [34]:


xgb_r = xg.XGBRegressor(objective ='reg:linear',n_estimators = 10, seed = 123).fit(np.vstack(trainData.select("pcafeatures").collect()), np.vstack(trainData.select("normalized_price").collect()))



In [35]:
pred = xgb_r.predict(np.vstack(testData.select("pcafeatures").collect()))
pred

array([0.05316652, 0.05316652, 0.05316652, ..., 0.52852064, 0.49849653,
       0.4916249 ], dtype=float32)

In [36]:
from pyspark.sql.functions import monotonically_increasing_id

test_pred = spark.createDataFrame(pred)

testData_pred = testData.withColumn('key', monotonically_increasing_id())
test_pred = test_pred.withColumn('key', monotonically_increasing_id())

test_pred = test_pred.withColumn("prediction", test_pred["value"])
# Perform the join on the key column
test_with_pred = testData_pred.join(test_pred, on='key')


test_with_pred = test_with_pred.drop('key')

test_with_pred.show(10, truncate=50)

+-----------------------------------------+----------------+-----------+-----------+
|                              pcafeatures|normalized_price|      value| prediction|
+-----------------------------------------+----------------+-----------+-----------+
|[-3.9423080128690144,-0.8388812804140522]|           0.045| 0.05316652| 0.05316652|
|[-3.9049740875864063,-0.8818612040119321]|           0.045| 0.05316652| 0.05316652|
|[-3.8601733772472766,-0.9334371123293881]|         0.04721| 0.05316652| 0.05316652|
| [-3.848973199662494,-0.9463310894087521]|         0.04721| 0.05316652| 0.05316652|
|  [-3.834039629549451,-0.963523058847904]|         0.07425| 0.05316652| 0.05316652|
| [-3.8228394519646685,-0.976417035927268]|         0.07425| 0.05316652| 0.05316652|
|  [-3.789424178590407,-1.153429230614188]|          0.0309|0.049778357|0.049778357|
|  [-3.785690786062146,-1.157727222973976]|          0.0309|0.049778357|0.049778357|
| [-3.7707572159491027,-1.174919192413128]|          0.0309|0.049

In [38]:
evaluator = RegressionEvaluator(labelCol='normalized_price', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(test_with_pred)
print ("Root Mean Square of XGBoost model is %g"%rmse)

Root Mean Square of XGBoost model is 0.243531


<h5> Support Vector Regressor

In [24]:
from sklearn.svm import SVR

In [25]:
train_data = trainData.select("pcafeatures", "normalized_price").toPandas()

In [26]:
X = np.array(train_data['pcafeatures'].tolist())
y = train_data['normalized_price'].values

In [27]:
svr_lin = SVR(kernel = 'linear')

In [28]:
svr_model = svr_lin.fit(X,y)

In [None]:
prediction_svr = svr_model.transform(testData)

In [None]:
prediction_svr.show(10, truncate=50)

In [None]:
evaluator = RegressionEvaluator(labelCol='normalized_price', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(prediction_svr)
print ("Root Mean Square of Random Forest model is %g"%rmse)

<h5> Random Forest Regressor

In [39]:
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(featuresCol='pcafeatures',labelCol='normalized_price').fit(trainData)
prediction = rf.transform(testData)

In [40]:
prediction.show(10, truncate=50)

+-----------------------------------------+----------------+-------------------+
|                              pcafeatures|normalized_price|         prediction|
+-----------------------------------------+----------------+-------------------+
|[-3.9423080128690144,-0.8388812804140522]|           0.045|0.07700645375460294|
|[-3.9049740875864063,-0.8818612040119321]|           0.045|0.07700645375460294|
|[-3.8601733772472766,-0.9334371123293881]|         0.04721|0.07700645375460294|
| [-3.848973199662494,-0.9463310894087521]|         0.04721|0.07700645375460294|
|  [-3.834039629549451,-0.963523058847904]|         0.07425|0.07700645375460294|
| [-3.8228394519646685,-0.976417035927268]|         0.07425|0.07700645375460294|
|  [-3.789424178590407,-1.153429230614188]|          0.0309|0.07684992446316603|
|  [-3.785690786062146,-1.157727222973976]|          0.0309|0.07684992446316603|
| [-3.7707572159491027,-1.174919192413128]|          0.0309|0.07684992446316603|
| [-3.744623468251277,-1.205

In [41]:
evaluator = RegressionEvaluator(labelCol='normalized_price', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(prediction)
print ("Root Mean Square of Random Forest model is %g"%rmse)

Root Mean Square of Random Forest model is 0.131839


In [45]:
rf = RandomForestRegressor(featuresCol='features',labelCol='normalized_price').fit(trainData)
prediction = rf.transform(testData)

In [46]:
prediction.show(10, truncate=50)

+----------------------------------+------------------------------------------+----------------+-------------------+
|                          features|                               pcafeatures|normalized_price|         prediction|
+----------------------------------+------------------------------------------+----------------+-------------------+
| (9,[0,2,7,8],[1.0,1.0,13.25,6.0])|[0.013102157639335865,0.08219716317632059]|         0.14775|0.11021873978672023|
|(9,[0,2,7,8],[1.0,1.0,13.25,16.0])|[-0.02423176764327223,0.12517708677420053]|         0.07262|0.10607355097259555|
|(9,[0,2,7,8],[1.0,1.0,13.25,28.0])|[-0.06903247798240193,0.17675299509165648]|         0.05281|0.10607355097259555|
|(9,[0,2,7,8],[1.0,1.0,13.25,31.0])|[-0.08023265556718437,0.18964697217102044]|         0.06231|0.10607355097259555|
|(9,[0,2,7,8],[1.0,1.0,13.25,34.0])|[-0.09143283315196679,0.20254094925038443]|         0.06231|0.10607355097259555|
|(9,[0,2,7,8],[1.0,1.0,13.25,37.0])|[-0.10263301073674921,0.2154

In [47]:
evaluator = RegressionEvaluator(labelCol='normalized_price', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(prediction)
print ("Root Mean Square of Random Forest model is %g"%rmse)

Root Mean Square of Random Forest model is 0.0662851


In [None]:
spark.stop