In [71]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Obtain the SparkSession used by every Spark cell in this notebook;
# getOrCreate() reuses an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName('my_first_app_name')
    .getOrCreate()
)

In [18]:
# Build the pandas DataFrame that is converted to a Spark DataFrame below:
# one row per colour name, plus the number of characters in that name.
colors = ['white', 'green', 'yellow', 'red', 'brown', 'pink']
color_df = pd.DataFrame({'color': colors})
color_df['length'] = color_df['color'].map(len)
print(color_df)

# Convert the pandas frame into a Spark DataFrame (no explicit schema is
# passed, so the column types are inferred) and display it.
color_sdf = spark.createDataFrame(data=color_df)
color_sdf.show()

    color  length
0   white       5
1   green       5
2  yellow       6
3     red       3
4   brown       5
5    pink       4
+------+------+
| color|length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
+------+------+



In [12]:
# Compare column types: the pandas dtypes first, then Spark's list of
# (column name, type name) pairs.
print(color_df.dtypes, color_sdf.dtypes, sep='\n')

color     object
length     int64
dtype: object
[('color', 'string'), ('length', 'bigint')]


In [17]:
# Size of each frame: pandas exposes the shape directly, whereas Spark has to
# run a count() action to find the number of rows.
print(color_df.shape, color_df.shape[0])
print(color_sdf.count())

(6, 2) 6
6


In [20]:
# 1. Renaming columns
# 1.1 pandas
# rename() returns a new frame; rebind the name instead of using inplace=True
# so the frame is never mutated underneath any other reference to it.
color_df = color_df.rename(columns={"color": "new_color"})

In [30]:
# 1.2 Spark
# spark-1: choose the column names up front by passing them as the schema
# when the DataFrame is created.
data = spark.createDataFrame(
    data=[("Alberto", 2), ("Dakota", 2)],
    schema=['name', 'length'],
)
data.show()
data.printSchema()

# spark-2: selectExpr with SQL-style "<old> as <new>" expressions.
color_sdf2 = color_sdf.selectExpr('color as color2', 'length as length2')
color_sdf2.show()

# spark-3: chain one withColumnRenamed call per column to rename.
color_sdf3 = (
    color_sdf
    .withColumnRenamed('color', 'color2')
    .withColumnRenamed('length', 'length2')
)
color_sdf3.show()

# spark-4: alias() on a column inside select()
# (the original author noted finding no clear use for it).
color_sdf.select(color_sdf['color'].alias('color2')).show()

+-------+------+
|   name|length|
+-------+------+
|Alberto|     2|
| Dakota|     2|
+-------+------+

root
 |-- name: string (nullable = true)
 |-- length: long (nullable = true)

+------+-------+
|color2|length2|
+------+-------+
| white|      5|
| green|      5|
|yellow|      6|
|   red|      3|
| brown|      5|
|  pink|      4|
+------+-------+

+------+-------+
|color2|length2|
+------+-------+
| white|      5|
| green|      5|
|yellow|      6|
|   red|      3|
| brown|      5|
|  pink|      4|
+------+-------+

+------+
|color2|
+------+
| white|
| green|
|yellow|
|   red|
| brown|
|  pink|
+------+



In [39]:
# 2. Selecting columns and filtering rows
color_sdf.select(color_sdf.length).show()  # same as color_sdf.select("length")
color_sdf.filter(color_sdf['length'] >= 4).show()  # filter on a column condition
# Two conditions that must both hold, combined into a single filter.
color_sdf.filter(
    (color_sdf['length'] >= 4) & (color_sdf['color'] == 'white')
).show()

+------+
|length|
+------+
|     5|
|     5|
|     6|
|     3|
|     5|
|     4|
+------+

+------+------+
| color|length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
| brown|     5|
|  pink|     4|
+------+------+

+-----+------+
|color|length|
+-----+------+
|white|     5|
+-----+------+



AttributeError: 'DataFrame' object has no attribute 'color'

In [43]:
# Keep the rows whose length lies between 4 and 5, then show only the colour
# column under the alias 'mid_length'.
(
    color_sdf
    .filter(color_sdf['length'].between(4, 5))
    .select(color_sdf['color'].alias('mid_length'))
    .show()
)
# filter() also accepts a SQL expression string: equality and LIKE patterns.
color_sdf.filter("color='green'").show()
color_sdf.filter("color like 'b%'").show()

+----------+
|mid_length|
+----------+
|     white|
|     green|
|     brown|
|      pink|
+----------+

+-----+------+
|color|length|
+-----+------+
|green|     5|
+-----+------+

+-----+------+
|color|length|
+-----+------+
|brown|     5|
+-----+------+



In [45]:
# where() with a SQL expression string: keep the rows whose color matches the
# LIKE pattern '%yellow%'.
color_sdf.where("color like '%yellow%'").show()

+------+------+
| color|length|
+------+------+
|yellow|     6|
+------+------+



In [46]:
# Register the Spark DataFrame as a temporary view so it can be queried with SQL.
# Note: the view is named "color_df", the same as the pandas variable, but the
# data behind it is color_sdf.
color_sdf.createOrReplaceTempView("color_df")
spark.sql("select count(1) from color_df").show()

+--------+
|count(1)|
+--------+
|       6|
+--------+



In [48]:
# 3. Dropping a column
# Spark: drop() returns a new DataFrame; color_sdf itself is left unchanged.
color_sdf.drop('length').show()

# pandas: drop() likewise returns a new frame; color_df is left unchanged.
print(color_df.drop(columns=['length']))

+------+
| color|
+------+
| white|
| green|
|yellow|
|   red|
| brown|
|  pink|
+------+

  new_color
0     white
1     green
2    yellow
3       red
4     brown
5      pink


In [52]:
# Adding a constant column.
# `lit` is imported in the notebook's top import cell rather than here, so all
# dependencies are declared in one place.
# Spark: withColumn() returns a new DataFrame, so the result must be assigned back.
color_sdf = color_sdf.withColumn('newCol', lit(0))
color_sdf.show()

# pandas: assigning a scalar to a new column name fills every row with it.
color_df["sex"] = 0
print(color_df)

+------+------+------+
| color|length|newCol|
+------+------+------+
| white|     5|     0|
| green|     5|     0|
|yellow|     6|     0|
|   red|     3|     0|
| brown|     5|     0|
|  pink|     4|     0|
+------+------+------+

  new_color  length  sex
0     white       5    0
1     green       5    0
2    yellow       6    0
3       red       3    0
4     brown       5    0
5      pink       4    0


In [56]:
# Convert to JSON
# Spark: toJSON() is followed by first(), so only the first row's JSON string
# is printed.
print(color_sdf.toJSON().first())
# pandas: to_json() with default arguments serialises the whole frame into a
# single string.
print(color_df.to_json())

{"color":"white","length":5,"newCol":0}
{"new_color":{"0":"white","1":"green","2":"yellow","3":"red","4":"brown","5":"pink"},"length":{"0":5,"1":5,"2":6,"3":3,"4":5,"5":4},"sex":{"0":0,"1":0,"2":0,"3":0,"4":0,"5":0}}


In [69]:
# pandas: sort by length, largest first
color_df = color_df.sort_values('length', ascending=False)
print(color_df)


# Spark: sort by a single column, descending
color_sdf.sort(color_sdf['color'].desc()).show()

# Sort by several columns (all descending) after keeping rows with length >= 4
(
    color_sdf
    .filter(color_sdf['length'] >= 4)
    .sort('length', 'color', ascending=False)
    .show()
)

# Mixed directions: length descending, then color ascending
color_sdf.sort(color_sdf['length'].desc(), color_sdf['color'].asc()).show()

# orderBy() also sorts; take(4) then returns the first four rows as a list of
# Row objects
color_sdf.orderBy('length', 'color').take(4)

  new_color  length  sex
2    yellow       6    0
0     white       5    0
1     green       5    0
4     brown       5    0
5      pink       4    0
3       red       3    0
+------+------+------+
| color|length|newCol|
+------+------+------+
|yellow|     6|     0|
| white|     5|     0|
|   red|     3|     0|
|  pink|     4|     0|
| green|     5|     0|
| brown|     5|     0|
+------+------+------+

+------+------+------+
| color|length|newCol|
+------+------+------+
|yellow|     6|     0|
| white|     5|     0|
| green|     5|     0|
| brown|     5|     0|
|  pink|     4|     0|
+------+------+------+

+------+------+------+
| color|length|newCol|
+------+------+------+
|yellow|     6|     0|
| brown|     5|     0|
| green|     5|     0|
| white|     5|     0|
|  pink|     4|     0|
|   red|     3|     0|
+------+------+------+



[Row(color='red', length=3, newCol=0),
 Row(color='pink', length=4, newCol=0),
 Row(color='brown', length=5, newCol=0),
 Row(color='green', length=5, newCol=0)]

In [73]:
# Missing values
# 1. Build the test data: a 5x5 frame of integers in 0..9 with one NaN.
# A fixed seed makes the frame (and every output derived from it) reproducible
# across kernel restarts. Integers are drawn directly instead of scaling
# uniform floats through the deprecated DataFrame.applymap.
df = pd.DataFrame(
    np.random.default_rng(42).integers(0, 10, size=(5, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
)
# Column 'c' is made float explicitly before the NaN is written, rather than
# relying on pandas silently upcasting an integer column on assignment.
df = df.astype({'c': 'float64'})
df.iloc[2, 2] = np.nan


# Spark: convert the pandas test frame and display it
spark_df = spark.createDataFrame(data=df)
spark_df.show()

# 2. Drop every row that contains a missing value ('any' is the default mode)
df2 = spark_df.dropna(how='any')
df2.show()

# 3. The same operation through the .na accessor
df3 = spark_df.na.drop(how='any')
df3.show()

+---+---+---+---+---+
|  a|  b|  c|  d|  e|
+---+---+---+---+---+
|  1|  2|1.0|  7|  8|
|  1|  5|4.0|  8|  2|
|  9|  5|NaN|  6|  1|
|  9|  9|5.0|  2|  6|
|  7|  2|6.0|  0|  6|
+---+---+---+---+---+

+---+---+---+---+---+
|  a|  b|  c|  d|  e|
+---+---+---+---+---+
|  1|  2|1.0|  7|  8|
|  1|  5|4.0|  8|  2|
|  9|  9|5.0|  2|  6|
|  7|  2|6.0|  0|  6|
+---+---+---+---+---+

+---+---+---+---+---+
|  a|  b|  c|  d|  e|
+---+---+---+---+---+
|  1|  2|1.0|  7|  8|
|  1|  5|4.0|  8|  2|
|  9|  9|5.0|  2|  6|
|  7|  2|6.0|  0|  6|
+---+---+---+---+---+



In [79]:
# pandas: drop the rows that contain a missing value, printing the frame
# before and after so the removed row is visible.
print(df)
df = df.dropna(axis=0, how='any')
print(df)

   a  b    c  d  e
0  1  2  1.0  7  8
1  1  5  4.0  8  2
2  9  5  NaN  6  1
3  9  9  5.0  2  6
4  7  2  6.0  0  6
   a  b    c  d  e
0  1  2  1.0  7  8
1  1  5  4.0  8  2
3  9  9  5.0  2  6
4  7  2  6.0  0  6
