In [2]:
from __future__ import print_function, division
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [3]:
# 启动 Spark （如果你已经启动就不需要）
spark = SparkSession.builder.master("local[2]") \
   .appName("test") \
   .enableHiveSupport() \
   .getOrCreate()

sc = spark.sparkContext

## 讀取 csv file

In [4]:
df_csv = spark.read.csv("../data/ratings.csv", header=True)

In [5]:
df_csv.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
|     1|    112|   3.5|1094785740|
|     1|    151|   4.0|1094785734|
|     1|    223|   4.0|1112485573|
|     1|    253|   4.0|1112484940|
|     1|    260|   4.0|1112484826|
|     1|    293|   4.0|1112484703|
|     1|    296|   4.0|1112484767|
|     1|    318|   4.0|1112484798|
|     1|    337|   3.5|1094785709|
|     1|    367|   3.5|1112485980|
|     1|    541|   4.0|1112484603|
|     1|    589|   3.5|1112485557|
|     1|    593|   3.5|1112484661|
|     1|    653|   3.0|1094785691|
|     1|    919|   3.5|1094785621|
+------+-------+------+----------+
only showing top 20 rows



In [6]:
df_csv.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [9]:
df_csv.describe().show()

+-------+-----------------+------------------+------------------+--------------------+
|summary|           userId|           movieId|            rating|           timestamp|
+-------+-----------------+------------------+------------------+--------------------+
|  count|         20000263|          20000263|          20000263|            20000263|
|   mean|69045.87258292554| 9041.567330339605|3.5255285642993797|1.1009179216771157E9|
| stddev|40038.62665316201|19789.477445413086| 1.051988919294227|1.6216942478273067E8|
|    min|                1|                 1|               0.5|          1000000065|
|    max|            99999|             99999|               5.0|           999999978|
+-------+-----------------+------------------+------------------+--------------------+



## 讀取 Json

In [10]:
df_json = spark.read.json("../data/json_example.json")

In [11]:
df_json.show()

+-------+------+------+
|movieid|rating|userid|
+-------+------+------+
|    001|     4|     1|
|    002|     3|     1|
|    001|     4|     2|
|    003|     2|     2|
+-------+------+------+



In [12]:
df_json.printSchema()

root
 |-- movieid: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- userid: string (nullable = true)



In [13]:
df_json.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieid|            rating|            userid|
+-------+------------------+------------------+------------------+
|  count|                 4|                 4|                 4|
|   mean|              1.75|              3.25|               1.5|
| stddev|0.9574271077563381|0.9574271077563381|0.5773502691896257|
|    min|               001|                 2|                 1|
|    max|               003|                 4|                 2|
+-------+------------------+------------------+------------------+



## 讀取 Parquet

In [14]:
df_parquet = spark.read.parquet("../data/ratings.parquet")

In [15]:
df_parquet.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
|     1|    112|   3.5|1094785740|
|     1|    151|   4.0|1094785734|
|     1|    223|   4.0|1112485573|
|     1|    253|   4.0|1112484940|
|     1|    260|   4.0|1112484826|
|     1|    293|   4.0|1112484703|
|     1|    296|   4.0|1112484767|
|     1|    318|   4.0|1112484798|
|     1|    337|   3.5|1094785709|
|     1|    367|   3.5|1112485980|
|     1|    541|   4.0|1112484603|
|     1|    589|   3.5|1112485557|
|     1|    593|   3.5|1112484661|
|     1|    653|   3.0|1094785691|
|     1|    919|   3.5|1094785621|
+------+-------+------+----------+
only showing top 20 rows



In [16]:
df_parquet.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [17]:
df_parquet.describe().show()

+-------+-----------------+------------------+------------------+--------------------+
|summary|           userId|           movieId|            rating|           timestamp|
+-------+-----------------+------------------+------------------+--------------------+
|  count|         20000263|          20000263|          20000263|            20000263|
|   mean|69045.87258292554| 9041.567330339605|3.5255285642993797|1.1009179216771033E9|
| stddev| 40038.6266531599|19789.477445413315| 1.051988919294247|1.6216942478273004E8|
|    min|                1|                 1|               0.5|          1000000065|
|    max|            99999|             99999|               5.0|           999999978|
+-------+-----------------+------------------+------------------+--------------------+



## 效能比较

In [116]:
df_parquet.count()

20000263

In [117]:
df_csv.count()

20000263

In [118]:
%%timeit

df_parquet.count()

55.1 ms ± 8.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [119]:
%%timeit

df_csv.count()

21.9 s ± 164 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
