In [1]:
# Initialise findspark with the Spark installation at /opt/manual/spark/.
# This cell runs before the pyspark imports in the following cell.
import findspark
findspark.init("/opt/manual/spark/")

In [2]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
# Create (or reuse) the Hive-enabled Spark session for this case study:
# YARN master, 2g of memory per executor and 1g for the driver.
spark = (
    SparkSession.builder
    .appName("case_study_3")
    .master("yarn")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "1g")
    .enableHiveSupport()
    .getOrCreate()
)

In [10]:
# Load the comma-separated passenger file. The first line is treated as the
# header and column types are inferred from the file contents.
csv_reader = spark.read.options(sep=",", inferSchema=True, header=True)
data = csv_reader.csv("/user/train/datasets/thy_data.txt")

In [12]:
# Preview five rows of the loaded data as a pandas DataFrame.
data.limit(5).toPandas()

Unnamed: 0,SEASON,ORIGIN,DESTINATION,PSGR_COUNT
0,SUMMER,227,YYZ,6
1,SUMMER,224,222,3
2,SUMMER,226,JF8,1
3,SUMMER,227,3RG,11
4,SUMMER,227,6RV,245


In [13]:
# Total number of rows in the dataset.
data.count()

1719202

In [15]:
# Number of columns in the dataset.
len(data.columns)

4

In [16]:
# Show the column names and the types produced by schema inference.
data.printSchema()

root
 |-- SEASON: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DESTINATION: string (nullable = true)
 |-- PSGR_COUNT: integer (nullable = true)



In [17]:
# Number of distinct ORIGIN values.
data.select("ORIGIN").distinct().count()

4210

In [21]:
# Per-season passenger statistics: total passengers, average per row, and the
# number of rows counted for PSGR_COUNT.
season_metrics = [
    F.sum("PSGR_COUNT").alias("TOTAL_PSGR_COUNT"),
    F.avg("PSGR_COUNT").alias("AVG_PSGR_COUNT"),
    F.count("PSGR_COUNT").alias("COUNT"),
]
(
    data.select("SEASON", "PSGR_COUNT")
    .groupBy("SEASON")
    .agg(*season_metrics)
    .show(truncate=False)
)

+------+----------------+-----------------+------+
|SEASON|TOTAL_PSGR_COUNT|AVG_PSGR_COUNT   |COUNT |
+------+----------------+-----------------+------+
|WINTER|336882094       |420.1829420855103|801751|
|SUMMER|521002675       |567.8806552066541|917451|
+------+----------------+-----------------+------+



In [31]:
# Top five origins by total passenger count in the SUMMER season.
# The SEASON filter is applied before the projection: filtering after
# select("ORIGIN", "PSGR_COUNT") references a column that is no longer part of
# the projected frame and only works because Spark's analyzer resolves the
# missing column behind the scenes.
(
    data
    .filter(F.col("SEASON") == "SUMMER")
    .select("ORIGIN", "PSGR_COUNT")
    .groupBy("ORIGIN")
    .agg(F.sum("PSGR_COUNT").alias("TOTAL_PSGR_COUNT"))
    .orderBy(F.desc("TOTAL_PSGR_COUNT"))
    .limit(5)
    .toPandas()
)

Unnamed: 0,ORIGIN,TOTAL_PSGR_COUNT
0,IC7,11177363
1,LHR,9696224
2,H8G,8432456
3,DEL,7705173
4,CDG,7244943


In [32]:
# Total passengers per (SEASON, ORIGIN) pair, sorted with the largest totals first.
passengers_by_season_origin = data.groupBy("SEASON", "ORIGIN").agg(
    F.sum("PSGR_COUNT").alias("TOTAL_PSGR_COUNT")
)
ordered_data = passengers_by_season_origin.orderBy(F.desc("TOTAL_PSGR_COUNT"))

In [33]:
# Preview the five largest (SEASON, ORIGIN) totals.
ordered_data.limit(5).toPandas()

Unnamed: 0,SEASON,ORIGIN,TOTAL_PSGR_COUNT
0,SUMMER,IC7,11177363
1,SUMMER,LHR,9696224
2,SUMMER,H8G,8432456
3,WINTER,IC7,7803139
4,SUMMER,DEL,7705173


In [34]:
# Save the aggregate as the ORC table test1.ordered_thy_data, replacing any
# existing table of that name.
ordered_data.write.saveAsTable(
    "test1.ordered_thy_data",
    format="ORC",
    mode="overwrite",
)

In [36]:
# List the databases visible to this session (the table above was written into `test1`).
spark.sql("show databases;").show()

+---------+
|namespace|
+---------+
|bookstore|
|  default|
|movielens|
|    test1|
+---------+



In [39]:
# Read five rows back from the saved table to confirm the write.
# NOTE: the query has no ORDER BY, so the five rows returned are arbitrary; the
# sort applied to ordered_data before saving is not reflected in this result.
spark.sql("select * from test1.ordered_thy_data limit 5;").show()

+------+------+----------------+
|SEASON|ORIGIN|TOTAL_PSGR_COUNT|
+------+------+----------------+
|WINTER|   YLP|               1|
|WINTER|   27Z|               1|
|WINTER|   7O4|               1|
|WINTER|   3VI|               1|
|WINTER|   4YD|               1|
+------+------+----------------+



In [53]:
# Build the PostgreSQL JDBC URL for the local `traindb` database without
# embedding the password in the notebook.
#   PGUSER     - database user (defaults to "train")
#   PGPASSWORD - database password; prompted for interactively when unset
# Both values are percent-encoded so characters such as '&' or '=' cannot
# corrupt the URL's query string.
import os
from getpass import getpass
from urllib.parse import quote

_pg_user = os.environ.get("PGUSER", "train")
_pg_password = os.environ.get("PGPASSWORD") or getpass(
    "PostgreSQL password for {}: ".format(_pg_user)
)
jdbcUrl = "jdbc:postgresql://localhost/traindb?user={}&password={}".format(
    quote(_pg_user, safe=""),
    quote(_pg_password, safe=""),
)

In [57]:
# Write the aggregate to the PostgreSQL table `ordered_thy_data`, replacing any
# existing contents. The save mode is stated once, through the `mode` argument
# of jdbc(), instead of being repeated in a separate .mode() call.
ordered_data.write.jdbc(
    url=jdbcUrl,
    table="ordered_thy_data",
    mode="overwrite",
    properties={"driver": "org.postgresql.Driver"},
)

In [58]:
# Read the table back from PostgreSQL and preview five rows to confirm the write.
(
    spark.read
    .jdbc(
        url=jdbcUrl,
        table="ordered_thy_data",
        properties={"driver": "org.postgresql.Driver"},
    )
    .limit(5)
    .toPandas()
)

Unnamed: 0,SEASON,ORIGIN,TOTAL_PSGR_COUNT
0,SUMMER,IC7,11177363
1,SUMMER,LHR,9696224
2,SUMMER,H8G,8432456
3,WINTER,IC7,7803139
4,SUMMER,DEL,7705173


In [59]:
# Stop the Spark session now that the notebook is finished.
spark.stop()