In [10]:
import os

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession shared by every cell in this notebook.
# The builder chain is wrapped in parentheses so no line-continuation
# backslashes are needed.
spark = (
    SparkSession.builder
    .appName("my_first_app_name")
    .getOrCreate()
)

In [3]:
# 1. Create a DataFrame from an RDD
# 1.1 With an explicitly declared schema
# Build an RDD whose elements are row tuples: (id, name, age, eyeColor).
stringCSVRDD = spark.sparkContext.parallelize([
    (123, "Katie", 19, "brown"),
    (234, "Michael", 22, "green"),
    (345, "Simone", 23, "blue")
])

# Declare the schema with StructField(name, dataType, nullable), where:
#   name:     the column name
#   dataType: the column data type
#   nullable: whether the column may hold null values
from pyspark.sql.types import StructType, StructField, LongType, StringType  # schema types

schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])

# Apply the schema to the RDD to build the DataFrame.
swimmers = spark.createDataFrame(stringCSVRDD, schema)

# Expose the DataFrame to Spark SQL as a temporary view.
# createOrReplaceTempView supersedes registerTempTable (deprecated since
# Spark 2.0) and replaces an existing view of the same name.
swimmers.createOrReplaceTempView("swimmers")

# Show the number of rows in the DataFrame.
print(swimmers.count())

# Query the temporary view through Spark SQL.
tempDF = spark.sql("select * from swimmers")
tempDF.show()

3
+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [4]:
# 1.2 Let Spark infer the column types
# Only column names are supplied; the types are inferred from the row values.
# NOTE(review): 'eyccolor' looks like a typo for 'eyeColor'; it is kept because
# the recorded output below uses this exact column name.
data = [
    (123, "Katie", 19, "brown"),
    (234, "Michael", 22, "green"),
    (345, "Simone", 23, "blue"),
]
df = spark.createDataFrame(data, ['id', 'name', 'age', 'eyccolor'])
df.show()
df.count()

+---+-------+---+--------+
| id|   name|age|eyccolor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



3

In [5]:
# 2. Read external files
# 2.1 JSON
# NOTE(review): this is an absolute, machine-specific Windows path; point it at
# a local copy of student.json before running elsewhere.
file = r"E:\code\python_workSpace\idea_space\toutiao_project\other\myLearn\data\student.json"
df = spark.read.json(file)
df.show()

+-----------+-----+
|       name|score|
+-----------+-----+
|    xuruyun|  100|
|zhangxueyou|  109|
|    wangfei|   90|
|   liudehua|   80|
+-----------+-----+



In [34]:
# 2.2 CSV files
# 2.2.1 Read the CSV directly with Spark
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
file_path = r"E:\code\python_workSpace\idea_space\toutiao_project\other\myLearn\data\test.csv"
# header=True / inferSchema=True: use the header row for column names and let
# Spark infer the column types.
monthlySales = spark.read.csv(file_path, header=True, inferSchema=True)
monthlySales.show(3)

+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|Ticket|  Fare|Cabin|Embarked|
+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0|330911|7.8292| null|       Q|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0|363272|   7.0| null|       S|
|        894|     2|Myles, Mr. Thomas...|  male|62.0|    0|    0|240276|9.6875| null|       Q|
+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
only showing top 3 rows



In [44]:
# Empty CSV values are returned as Python None (NoneType) in the collected Row.
temp_row = monthlySales.take(1)
print(temp_row[0])
# Compare a populated column (Sex) with an empty one (Cabin): value and Python type.
print(temp_row[0]["Sex"], type(temp_row[0]["Sex"]), temp_row[0]["Cabin"], type(temp_row[0]["Cabin"]))

Row(PassengerId=892, Pclass=3, Name='Kelly, Mr. James', Sex='male', Age=34.5, SibSp=0, Parch=0, Ticket='330911', Fare=7.8292, Cabin=None, Embarked='Q')
male <class 'str'> None <class 'NoneType'>


In [18]:
# 2.2.2 Create a Spark DataFrame from a pandas DataFrame
def readFile_inputData(train_name=None, test_name=None, index_col=None, dtype=None, parse_dates=None, encoding="UTF-8", sep=','):
    """Load one or two CSV files into pandas DataFrames.

    Both files are parsed with the same ``pd.read_csv`` options.

    Args:
        train_name: Path or buffer of the first CSV; None skips it.
        test_name: Path or buffer of the second CSV; None skips it.
        index_col: Forwarded to ``pd.read_csv``.
        dtype: Forwarded to ``pd.read_csv``.
        parse_dates: Forwarded to ``pd.read_csv``; must be a list when given.
        encoding: Forwarded to ``pd.read_csv``.
        sep: Forwarded to ``pd.read_csv``.

    Returns:
        The train DataFrame when ``test_name`` is None; otherwise the tuple
        ``(train, test)``, where ``train`` is None if ``train_name`` was omitted.

    Raises:
        TypeError: ``parse_dates`` is given but is not a list.
        ValueError: neither ``train_name`` nor ``test_name`` is given.
    """
    if parse_dates is not None and not isinstance(parse_dates, list):
        raise TypeError('parse_dates Type is Error, must list')
    # Previously a missing train_name left `train` unbound and the function
    # failed with UnboundLocalError; reject the no-input case explicitly.
    if train_name is None and test_name is None:
        raise ValueError('at least one of train_name and test_name must be given')

    # Options shared by both reads, so the two files are always parsed alike.
    read_options = dict(index_col=index_col, dtype=dtype, parse_dates=parse_dates, encoding=encoding, sep=sep)

    train = None
    if train_name is not None:
        train = pd.read_csv(filepath_or_buffer=train_name, **read_options)
    if test_name is None:
        return train
    test = pd.read_csv(filepath_or_buffer=test_name, **read_options)
    return train, test

In [74]:
# Load the same CSV (file_path) through pandas via the helper; only the first
# argument is given, so a single DataFrame is returned.
temp_data = readFile_inputData(file_path)
temp_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [75]:
# Compare a populated column (Sex) with an empty one (Cabin) in pandas:
# the missing Cabin value is a float NaN rather than None.
print(temp_data.loc[0,"Sex"], type(temp_data.loc[0,"Sex"]), temp_data.loc[0,"Cabin"], type(temp_data.loc[0,"Cabin"]))

male <class 'str'> nan <class 'float'>


In [76]:
# Converting directly fails with:
#   TypeError: field Cabin: Can not merge type <class 'pyspark.sql.types.DoubleType'> and <class 'pyspark.sql.types.StringType'>
# The Cabin column mixes two Python types in pandas: missing values are float NaN
# and present values are str, so Spark cannot settle on a single column type.
# spark_df = spark.createDataFrame(temp_data)

print(temp_data["Cabin"].dtype, type(temp_data.loc[0,"Cabin"]))
# NOTE(review): this overwrites temp_data["Cabin"] in place, and the missing values
# become the literal string 'nan' rather than a Spark null. If real nulls are
# wanted, replacing NaN with None before conversion is likely better - confirm.
temp_data["Cabin"] = temp_data["Cabin"].astype(str) # cast every value to str (float NaN becomes the string 'nan')
print(temp_data["Cabin"].dtype, type(temp_data.loc[0,"Cabin"]))
spark_df = spark.createDataFrame(temp_data)
spark_df.show(3)

object <class 'float'>
object <class 'str'>
+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|Ticket|  Fare|Cabin|Embarked|
+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0|330911|7.8292|  nan|       Q|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0|363272|   7.0|  nan|       S|
|        894|     2|Myles, Mr. Thomas...|  male|62.0|    0|    0|240276|9.6875|  nan|       Q|
+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+
only showing top 3 rows



In [77]:
# 字符串nan 没有变化
temp1_row = spark_df.take(1)
print(temp1_row[0])
print(temp1_row[0]["Sex"], type(temp1_row[0]["Sex"]), temp1_row[0]["Cabin"], type(temp1_row[0]["Cabin"]))

Row(PassengerId=892, Pclass=3, Name='Kelly, Mr. James', Sex='male', Age=34.5, SibSp=0, Parch=0, Ticket='330911', Fare=7.8292, Cabin='nan', Embarked='Q')
male <class 'str'> nan <class 'str'>


In [None]:
# 2.3 Read from a Parquet file
# NOTE(review): absolute path into a local Spark 2.2.0 install; adjust before running elsewhere.
file=r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\users.parquet"
df=spark.read.parquet(file)
df.show()

In [None]:
# 2.4 Read from Hive
# Build a Hive-enabled session.
# - The chain is wrapped in parentheses: the original backslash continuation had
#   trailing spaces after the backslash, which is a SyntaxError.
# - A standalone master URL needs the spark:// scheme.
# NOTE(review): getOrCreate() returns an already-running session when one exists
# (one is created at the top of this notebook), so enableHiveSupport()/master()
# may not take effect unless that session is stopped first - confirm.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .master("spark://172.31.100.170:7077")
    .appName("my_first_app_name")
    .getOrCreate()
)

# Query a Hive table through Spark SQL.
df = spark.sql("select * from hive_tb_name")
df.show()

In [None]:
# 2.5 Read from HDFS
# Read directly, without giving the namenode host and port in the URI.
data= spark.read.csv('hdfs:///tmp/_da_exdata_path/data.csv', header=True)
data.show()

# In some setups the host and port have to be given explicitly.
data= spark.read.csv('hdfs://localhost:9000/tmp/_da_exdata_path/data.csv', header=True)
data.show()

In [None]:
# 3. Save data
# 3.1 Save as CSV
# Build a small 4x4 pandas DataFrame of random floats (unseeded, so values differ per run).
df = pd.DataFrame(np.random.random((4, 4)),columns=['a', 'b', 'c', 'd'])
spark_df = spark.createDataFrame(df)
# NOTE(review): absolute path into a local Spark 2.2.0 install; adjust before running elsewhere.
file=r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.csv"
# mode='overwrite' replaces any existing output at this path.
spark_df.write.csv(path=file, header=True, sep=",", mode='overwrite')

In [None]:
# 3.2 Save as Parquet
# Build a small 4x4 pandas DataFrame of random floats (unseeded, so values differ per run).
df = pd.DataFrame(np.random.random((4, 4)),columns=['a', 'b', 'c', 'd'])
spark_df = spark.createDataFrame(df)
# Write to Parquet; mode='overwrite' replaces any existing output at this path.
# NOTE(review): absolute path into a local Spark 2.2.0 install; adjust before running elsewhere.
file=r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.parquet"
spark_df.write.parquet(path=file,mode='overwrite')

In [None]:
# 3.3 Save to Hive
# Enable dynamic partitioning.
spark.sql("set hive.exec.dynamic.partition.mode = nonstrict")
spark.sql("set hive.exec.dynamic.partition=true")

# Option A: write into the partitioned table with plain Hive SQL.
spark.sql("""
    insert overwrite table ai.da_aipurchase_dailysale_hive 
    partition (saledate) 
    select productid, propertyid, processcenterid, saleplatform, sku, poa, salecount, saledate 
    from szy_aipurchase_tmp_szy_dailysale distribute by saledate
    """)

# NOTE(review): jdbcDF is not defined anywhere in the visible notebook; it has to
# be created by an earlier step before the statements below can run - confirm.

# Option B: write through the DataFrameWriter API (alternatives; run only the one you need).
# insertInto() cannot be combined with partitionBy(): the target table already
# defines its partition columns and Spark raises an AnalysisException otherwise.
# overwrite=True is passed explicitly because PySpark 2.x insertInto() sets the
# save mode from its own flag, which would discard a preceding .mode("overwrite").
jdbcDF.write.insertInto("ai.da_aipurchase_dailysale_hive", overwrite=True)
# Append via saveAsTable, declaring the partition column explicitly.
jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_hive", None, "append", partitionBy='saledate')

# Non-partitioned case: simply load the DataFrame into a Hive table.
jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_for_ema_predict", None, "overwrite", None)

In [None]:
# 3.4 Save to HDFS
# Write the data to HDFS in CSV format, with a header row, replacing existing output.
# NOTE(review): jdbcDF is not defined anywhere in the visible notebook - confirm where it is created.
jdbcDF.write.mode("overwrite").options(header="true").csv("/home/ai/da/da_aipurchase_dailysale_for_ema_predict.csv")

In [None]:
# 3.5 Save to MySQL
# Per the original author's note, columns are aligned automatically, i.e. spark_df
# does not have to contain every column of the MySQL table.

# Connection settings shared by both writes below. User and password are read
# from the environment when set, so real credentials need not be stored in the
# notebook; the fallbacks are the original local demo values.
mysql_options = dict(
    url='jdbc:mysql://127.0.0.1',
    user=os.environ.get("MYSQL_USER", 'root'),
    password=os.environ.get("MYSQL_PASSWORD", '123456'),
    dbtable="test.test",
    batchsize="1000",
)

# overwrite: replace the existing table contents, then load.
# NOTE(review): Spark's JDBC overwrite drops and recreates the table unless the
# "truncate" option is set - confirm which behaviour is wanted.
spark_df.write.mode("overwrite").format("jdbc").options(**mysql_options).save()

# append: add the rows to the existing table.
spark_df.write.mode("append").format("jdbc").options(**mysql_options).save()