In [1]:
import pyspark

In [2]:
# Create the SparkContext, or reuse the one already running in this kernel.
# A bare SparkContext() raises if the cell is executed a second time, which
# makes the notebook fragile when cells are re-run.
sc = pyspark.SparkContext.getOrCreate()

## 直接读取本地数据

In [15]:
# Load the earthquake CSV from the relative ./data directory as an RDD with
# one string element per line of text.
rdd = sc.textFile(name="./data/eq2013.csv")

In [16]:
# Peek at the first five elements; the CSV header line is the first of them.
rdd.take(num=5)

['date,time,x,y,z,lv,M,v1,v2,v3,v4',
 '2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',
 '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',
 '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',
 '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,']

In [17]:
# Count all elements of the RDD (one per text line, header line included).
rdd.count()

8205

## 通过HDFS读取数据

In [12]:
# Load the same CSV through an explicit hdfs:// URI instead of a relative path.
rdd2 = sc.textFile(name="hdfs://sparkvm.com:8020/data/eq/eq2013.csv")

In [13]:
# Peek at the first five elements of the HDFS-backed RDD.
rdd2.take(num=5)

['date,time,x,y,z,lv,M,v1,v2,v3,v4',
 '2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',
 '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',
 '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',
 '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,']

In [14]:
# Count all elements of the HDFS-backed RDD, for comparison with the relative-path read.
rdd2.count()

8205

## 通过Python自行读取后并行化为RDD

In [18]:
import csv

In [35]:
# Read the CSV line by line in plain Python.
# - The context manager closes the file handle instead of leaking it.
# - Only a trailing newline is stripped, so a final line without one keeps its
#   last character (slicing with row[:-1] would silently cut it off).
with open("./data/eq2013.csv") as csv_file:
    eq = [row.rstrip("\n") for row in csv_file]

In [36]:
# Show the first five lines held in the Python list.
eq[:5]

['date,time,x,y,z,lv,M,v1,v2,v3,v4',
 '2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',
 '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',
 '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',
 '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,']

In [37]:
# Distribute the in-memory Python list of lines as an RDD.
rdd3 = sc.parallelize(c=eq)

In [38]:
# Peek at the first five elements of the RDD built from the Python list.
rdd3.take(num=5)

['date,time,x,y,z,lv,M,v1,v2,v3,v4',
 '2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',
 '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',
 '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',
 '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,']

In [39]:
# Count all elements of the RDD built from the Python list.
rdd3.count()

8205

## 通过JDBC读取数据库转为RDD

### Spark主要通过spark.sql来实现数据库获取

### 首先把PostgreSQL的JDBC包，导入到环境变量中

In [41]:
import os

In [49]:
# Add the PostgreSQL JDBC driver jar to the SPARK_CLASSPATH environment variable.
# - Python does not expand "$SPARK_CLASSPATH" inside a string literal, so the
#   current value is read explicitly from os.environ.
# - os.pathsep gives the platform's separator (":" on POSIX, ";" on Windows).
# - The jar is appended only if missing, so re-running the cell is harmless.
# NOTE(review): the SparkContext is created earlier in this notebook; confirm
# that a classpath set at this point is actually picked up, or pass the jar via
# Spark configuration (e.g. spark.jars / spark.driver.extraClassPath) instead.
jdbc_jar = os.path.join(os.getcwd(), "data", "postgresql-42.2.4.jar")
classpath_entries = [
    entry
    for entry in os.environ.get("SPARK_CLASSPATH", "").split(os.pathsep)
    if entry
]
if jdbc_jar not in classpath_entries:
    classpath_entries.append(jdbc_jar)
os.environ["SPARK_CLASSPATH"] = os.pathsep.join(classpath_entries)

### PostgreSQL的JDBC连接字符串

In [51]:
# Build the PostgreSQL JDBC connection URL.
# Credentials are taken from the POSTGIS_USER / POSTGIS_PASSWORD environment
# variables so real secrets need not be stored in the notebook; when they are
# unset, the defaults reproduce the original demo URL. Values are URL-quoted so
# characters such as "&" or "=" cannot break the query string.
from urllib.parse import quote

pg_user = quote(os.environ.get("POSTGIS_USER", "postgres"), safe="")
pg_password = quote(os.environ.get("POSTGIS_PASSWORD", "postgres"), safe="")
url = "jdbc:postgresql://sparkvm.com:5432/postgis?user={0}&password={1}".format(
    pg_user, pg_password
)

In [52]:
import pyspark.sql

In [53]:
# Wrap the existing SparkContext in a SQLContext; its .read attribute is the
# entry point used for the DataFrame read that follows.
# NOTE(review): newer Spark releases favour SparkSession — confirm the target
# Spark version before migrating.
pyssql = pyspark.sql.SQLContext(sparkContext=sc)

### 通过SQLContext来获取数据，需要设置读取模式、URL和读取的表格

In [58]:
# Read the "china" table over JDBC into a DataFrame, using the connection URL
# defined earlier in the notebook.
df1 = (
    pyssql.read
    .format("jdbc")
    .options(url=url, dbtable="china")
    .load()
)

### 显示前面5行

In [60]:
# Print the first five rows as a text table.
df1.show(n=5)

+---+--------------------+----------+------+------------+--------+--------+--------+--------+--------+---------+----------+----------+--------+--------+--------+--------+--------+--------+----------+
| id|                geom|first_name|  code|        area|pop_2009|pop_2005|pop_2000|pop_1999|pop_1995| pop_1990|pop_birth_|pop_death_|gdp_2009|gdp_2008|gdp_2007|gdp_2006|gdp_2005|cpi_2009|categories|
+---+--------------------+----------+------+------------+--------+--------+--------+--------+--------+---------+----------+----------+--------+--------+--------+--------+--------+--------+----------+
|  1|0106000020E610000...|        北京|110000| 1.634556E10|  1755.0|  1538.0|  1382.0|  1257.0|  1251.0|1081.9407|      8.06|      5.12|12153.03| 11115.0| 9846.81| 8117.78| 6969.52| 22154.0|       1.0|
|  2|0106000020E610000...|        天津|120000|1.1660963E10| 1228.16|  1043.0|  1001.0|   959.0|   942.0| 878.5402|       8.3|      6.23| 7521.85| 6719.01| 5252.76| 4462.74| 3905.64| 15149.0|       1.0|
