In [1]:
from __future__ import print_function, division
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [5]:
# Start Spark (not needed if you have already started a session).
# The builder is configured with a local master using 2 threads ("local[2]"),
# the application name "test", and Hive support; getOrCreate() hands back
# the resulting SparkSession.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("test")
    .enableHiveSupport()
    .getOrCreate()
)

# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

## 讀取 CSV 檔案

In [8]:
# List the data directory in long format with human-readable file sizes.
!ls -l -h ../data/

total 1442808
-rw-r--r--   1 bryan.yang  staff    90B Oct 24 17:03 1
-rw-r--r--@  1 bryan.yang  staff   196M Sep 25 11:20 NASA_access_log_Jul95
-rw-r--r--   1 bryan.yang  staff    11K Sep 25 11:41 NASA_access_log_Jul95_100
-rw-r--r--   1 bryan.yang  staff   188B Oct 24 16:02 json_example.json
-rw-r--r--   1 bryan.yang  staff   162B Oct 24 17:10 json_netested.json
-rw-r--r--@  1 bryan.yang  staff   509M Oct 24 16:42 ratings.csv
drwxr-xr-x  12 bryan.yang  staff   384B Oct 24 16:44 ratings.parquet
-rw-r--r--   1 bryan.yang  staff   2.6K Oct 24 16:02 shakespear.txt
drwxr-xr-x  12 bryan.yang  staff   384B Oct 26 15:20 test.parquet


In [9]:
# Peek at the first 10 lines of the raw CSV: a header row followed by records.
!head -n 10 ../data/ratings.csv

userId,movieId,rating,timestamp
1,2,3.5,1112486027
1,29,3.5,1112484676
1,32,3.5,1112484819
1,47,3.5,1112484727
1,50,3.5,1112484580
1,112,3.5,1094785740
1,151,4.0,1094785734
1,223,4.0,1112485573
1,253,4.0,1112484940


In [28]:
# Load the ratings CSV into a DataFrame.
#   header      -> the first line supplies the column names
#   sep         -> fields are comma-separated
#   inferSchema -> disabled, so every column is loaded as a string
# NOTE(review): with string columns, the min/max reported by describe() are
# string comparisons rather than numeric ones; enable inferSchema or pass an
# explicit schema if numeric statistics are wanted.
df_csv = (
    spark.read
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", False)
    .csv("../data/ratings.csv")
)

In [29]:
# Print the first 20 rows as a text table (the defaults, spelled out).
df_csv.show(n=20, truncate=True)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
|     1|    112|   3.5|1094785740|
|     1|    151|   4.0|1094785734|
|     1|    223|   4.0|1112485573|
|     1|    253|   4.0|1112484940|
|     1|    260|   4.0|1112484826|
|     1|    293|   4.0|1112484703|
|     1|    296|   4.0|1112484767|
|     1|    318|   4.0|1112484798|
|     1|    337|   3.5|1094785709|
|     1|    367|   3.5|1112485980|
|     1|    541|   4.0|1112484603|
|     1|    589|   3.5|1112485557|
|     1|    593|   3.5|1112484661|
|     1|    653|   3.0|1094785691|
|     1|    919|   3.5|1094785621|
+------+-------+------+----------+
only showing top 20 rows



In [30]:
# Print the column names and types as a tree. All four columns come out as
# strings because the CSV was loaded with schema inference turned off.
df_csv.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [31]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# NOTE(review): the columns are strings here, so min/max are ordered as text
# (e.g. "99999" is reported as the max userId and "999999978" as the max
# timestamp). Treat those two rows with care.
(
    df_csv
    .describe()
    .show()
)

+-------+-----------------+------------------+------------------+--------------------+
|summary|           userId|           movieId|            rating|           timestamp|
+-------+-----------------+------------------+------------------+--------------------+
|  count|         20000263|          20000263|          20000263|            20000263|
|   mean|69045.87258292554| 9041.567330339605|3.5255285642993797|1.1009179216771157E9|
| stddev|40038.62665316201|19789.477445413086| 1.051988919294227|1.6216942478273067E8|
|    min|                1|                 1|               0.5|          1000000065|
|    max|            99999|             99999|               5.0|           999999978|
+-------+-----------------+------------------+------------------+--------------------+



## 讀取 JSON

In [32]:
# Peek at the raw JSON file: one JSON object per line.
!head -n 10 ../data/json_example.json

{"userid": '1', "rating": 4, "movieid": '001'}
{"userid": '1', "rating": 3, "movieid": '002'}
{"userid": '2', "movieid": '001', "rating": 4}
{"userid": '2', "movieid": '003', "rating": 2}


In [33]:
# Load the line-delimited JSON file into a DataFrame; the schema is inferred
# from the records themselves.
df_json = spark.read.format("json").load("../data/json_example.json")

In [34]:
# Print the rows as a text table (up to 20 rows; defaults spelled out).
df_json.show(n=20, truncate=True)

+-------+------+------+
|movieid|rating|userid|
+-------+------+------+
|    001|     4|     1|
|    002|     3|     1|
|    001|     4|     2|
|    003|     2|     2|
+-------+------+------+



In [35]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# NOTE(review): movieid and userid are string columns, so their min/max are
# string comparisons (e.g. "001" / "003").
(
    df_json
    .describe()
    .show()
)

+-------+------------------+------------------+------------------+
|summary|           movieid|            rating|            userid|
+-------+------------------+------------------+------------------+
|  count|                 4|                 4|                 4|
|   mean|              1.75|              3.25|               1.5|
| stddev|0.9574271077563381|0.9574271077563381|0.5773502691896257|
|    min|               001|                 2|                 1|
|    max|               003|                 4|                 2|
+-------+------------------+------------------+------------------+



In [84]:
# Print the inferred schema as a tree: the quoted values (movieid, userid)
# come out as strings, and the bare numeric rating as long.
df_json.printSchema()

root
 |-- movieid: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- userid: string (nullable = true)



## 讀取 Parquet

In [87]:
# The Parquet dataset is a directory: list its part files and the _SUCCESS
# marker, with human-readable sizes.
!ls -l -h ../data/ratings.parquet/

total 317440
-rw-r--r--  1 bryan.yang  staff     0B Oct 24 16:44 _SUCCESS
-rw-r--r--  1 bryan.yang  staff    40M Oct 24 16:44 part-00000-1e0c22a0-279b-437f-85c0-4009c4b4e16f-c000.snappy.parquet
-rw-r--r--  1 bryan.yang  staff    39M Oct 24 16:44 part-00001-1e0c22a0-279b-437f-85c0-4009c4b4e16f-c000.snappy.parquet
-rw-r--r--  1 bryan.yang  staff    39M Oct 24 16:44 part-00002-1e0c22a0-279b-437f-85c0-4009c4b4e16f-c000.snappy.parquet
-rw-r--r--  1 bryan.yang  staff    37M Oct 24 16:44 part-00003-1e0c22a0-279b-437f-85c0-4009c4b4e16f-c000.snappy.parquet


In [88]:
# Parquet is a binary format, so head prints mostly unreadable bytes; only
# the leading "PAR1" marker and a few stray values are legible.
!head -n 10 ../data/ratings.parquet/part-00003-1e0c22a0-279b-437f-85c0-4009c4b4e16f-c000.snappy.parquet

PAR1�(��L��  ��$   105343
 4
 5
 6
 7
 8
 9
50
 1
 2


In [93]:
# Load the Parquet dataset by pointing at its directory; all part files in
# it are read into one DataFrame, and the schema comes from the files.
df_parquet = spark.read.format("parquet").load("../data/ratings.parquet")

In [94]:
# Print the first 20 rows as a text table (the defaults, spelled out).
df_parquet.show(n=20, truncate=True)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
|     1|    112|   3.5|1094785740|
|     1|    151|   4.0|1094785734|
|     1|    223|   4.0|1112485573|
|     1|    253|   4.0|1112484940|
|     1|    260|   4.0|1112484826|
|     1|    293|   4.0|1112484703|
|     1|    296|   4.0|1112484767|
|     1|    318|   4.0|1112484798|
|     1|    337|   3.5|1094785709|
|     1|    367|   3.5|1112485980|
|     1|    541|   4.0|1112484603|
|     1|    589|   3.5|1112485557|
|     1|    593|   3.5|1112484661|
|     1|    653|   3.0|1094785691|
|     1|    919|   3.5|1094785621|
+------+-------+------+----------+
only showing top 20 rows



In [95]:
# Print the schema stored with the Parquet data. Every column is a string
# here as well.
df_parquet.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [96]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# NOTE(review): the columns are strings, so min/max are ordered as text
# (e.g. "99999" is reported as the max userId). Treat those rows with care.
(
    df_parquet
    .describe()
    .show()
)

+-------+-----------------+------------------+------------------+--------------------+
|summary|           userId|           movieId|            rating|           timestamp|
+-------+-----------------+------------------+------------------+--------------------+
|  count|         20000263|          20000263|          20000263|            20000263|
|   mean|69045.87258292554| 9041.567330339605|3.5255285642993797|1.1009179216771033E9|
| stddev| 40038.6266531599|19789.477445413315| 1.051988919294247|1.6216942478273004E8|
|    min|                1|                 1|               0.5|          1000000065|
|    max|            99999|             99999|               5.0|           999999978|
+-------+-----------------+------------------+------------------+--------------------+

