In [1]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('my_first_app_name') \
    .getOrCreate()

In [2]:
# 从pandas dataframe创建spark dataframe
colors = ['white','green','yellow','red','brown','pink', 'white']
color_df = pd.DataFrame(colors,columns=['color'])
color_df['length'] = color_df['color'].apply(len)
print(color_df)

color_sdf = spark.createDataFrame(color_df)
color_sdf.show()

    color  length
0   white       5
1   green       5
2  yellow       6
3     red       3
4   brown       5
5    pink       4
6   white       5
+------+------+
| color|length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
| white|     5|
+------+------+



In [5]:
# 1、随机数
# 基于dataframe生成相同行数的随机数
from pyspark.sql.functions import rand, randn  # 均匀分布和正太分布函数

color_sdf.select(rand(seed=10).alias("uniform"), 
                randn(seed=27).alias("normal"))\
    .show()

# 或者随机生成指定行数的dataframe
df = spark.range(0, 10).withColumn('rand1', rand(seed=10)) \
                       .withColumn('rand2', rand(seed=27))
df.show()

+-------------------+--------------------+
|            uniform|              normal|
+-------------------+--------------------+
| 0.1982919638208397| 0.06157382353970104|
|0.12030715258495939|  1.0854146699817222|
|0.44292918521277047| -0.4798519469521663|
| 0.2731073068483362|-0.15116027592854422|
|   0.87079354700073|-0.27674189870783683|
|0.27149331793166864|-0.18575112254167045|
| 0.6037143578435027|   0.734722467897308|
+-------------------+--------------------+

+---+-------------------+-------------------+
| id|              rand1|              rand2|
+---+-------------------+-------------------+
|  0|0.41371264720975787|  0.714105256846827|
|  1| 0.1982919638208397|0.19369846818250636|
|  2|0.12030715258495939| 0.8122802274304282|
|  3|0.44292918521277047|0.31429268272540556|
|  4| 0.8898784253886249|0.15864611152259134|
|  5| 0.2731073068483362| 0.3221262660507942|
|  6|   0.87079354700073| 0.3134176192702436|
|  7|0.27149331793166864| 0.3071632607232556|
|  8| 0.603714357843

In [6]:
# 2、四舍五入
from pyspark.sql.functions import round
df = spark.createDataFrame([(2.5,)], ['a'])

df.select(round('a', 0).alias('r')).show()

+---+
|  r|
+---+
|3.0|
+---+



In [14]:
# 3、抽样
color_sdf.show()

# 抽样
sample1 = color_sdf.sample(
    withReplacement=False, # 无放回抽样
    fraction=0.4,
    seed=1000)  
sample1.show()

+------+------+
| color|length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
| white|     5|
+------+------+

+-----+------+
|color|length|
+-----+------+
|green|     5|
| pink|     4|
+-----+------+



In [15]:
# 4、描述性统计
# dataframe本身也有基本统计的方法，和pandas一致
# 1.生成测试数据
df = pd.DataFrame(np.random.rand(5,5),columns=['a','b','c','d','e']).\
    applymap(lambda x: int(x*10))
df.iloc[2,2]=np.nan

spark_df=spark.createDataFrame(df)
spark_df.show()

# 2.描述性统计信息：按列统计
spark_df.describe().show()

# 3.针对一个字段的统计信息
spark_df.describe('a').show()

+---+---+---+---+---+
|  a|  b|  c|  d|  e|
+---+---+---+---+---+
|  2|  0|8.0|  3|  8|
|  7|  0|9.0|  3|  7|
|  9|  6|NaN|  5|  2|
|  0|  6|7.0|  2|  1|
|  5|  9|5.0|  0|  0|
+---+---+---+---+---+

+-------+-----------------+-----------------+---+------------------+-----------------+
|summary|                a|                b|  c|                 d|                e|
+-------+-----------------+-----------------+---+------------------+-----------------+
|  count|                5|                5|  5|                 5|                5|
|   mean|              4.6|              4.2|NaN|               2.6|              3.6|
| stddev|3.646916505762094|4.024922359499621|NaN|1.8165902124584952|3.646916505762094|
|    min|                0|                0|5.0|                 0|                0|
|    max|                9|                9|NaN|                 5|                8|
+-------+-----------------+-----------------+---+------------------+-----------------+

+-------+--------

In [18]:
# 5、最大值最小值
from pyspark.sql.functions import min, max
color_sdf.select(min('length'), max('length')).show()

+-----------+-----------+
|min(length)|max(length)|
+-----------+-----------+
|          3|          6|
+-----------+-----------+



In [20]:
# 6、均值、标准差
from pyspark.sql.functions import mean, stddev  # 同样是在function里面
color_sdf.select(mean('length').alias('mean'),
                stddev('length').alias('stddev'))\
    .show()

+-----------------+------------------+
|             mean|            stddev|
+-----------------+------------------+
|4.714285714285714|0.9511897312113418|
+-----------------+------------------+



In [25]:
# 7、协方差
# 样本协方差
print(spark_df.stat.cov('a','b')) # 计算给定列的样本协方差（由它们的名称指定）作为双精度值。 

# 总体协方差 和 样本协方差
from pyspark.sql.functions import covar_pop, covar_samp
print(spark_df.agg(covar_pop("a", "b").alias('new_col')).collect()[0][0])  # 返回col1和col2的总体协方差的新列。
print(spark_df.agg(covar_samp("a", "b").alias('new_col')).collect()[0][0])  # 返回col1和col2的样本协方差的新列。

# 皮尔森相关系数
print(spark_df.stat.corr('a', 'b'))

0.5999999999999998
0.48
0.6
0.04087595596566437


In [26]:
# Pandas的相关系数
df.corr()

Unnamed: 0,a,b,c,d,e
a,1.0,0.040876,0.156941,0.460381,0.003759
b,0.040876,1.0,-0.94337,-0.43082,-0.981023
c,0.156941,-0.94337,1.0,0.966092,0.860565
d,0.460381,-0.43082,0.966092,1.0,0.384908
e,0.003759,-0.981023,0.860565,0.384908,1.0


In [35]:
# 8、交叉表(列联表)
# Create a DataFrame with two columns (name, item)
names = ["Alice", "Bob", "Mike"]
items = ["milk", "bread", "butter", "apples", "oranges"]
df = pd.DataFrame([(names[i % 3], items[i % 5]) for i in range(10)], columns=['name', 'item'])
print(df)
sdf = spark.createDataFrame(df)
sdf.show()

print(pd.crosstab(df['name'], df['item'], margins=True)) # PD
sdf.stat.crosstab("name", "item").show() # Spark

    name     item
0  Alice     milk
1    Bob    bread
2   Mike   butter
3  Alice   apples
4    Bob  oranges
5   Mike     milk
6  Alice    bread
7    Bob   butter
8   Mike   apples
9  Alice  oranges
+-----+-------+
| name|   item|
+-----+-------+
|Alice|   milk|
|  Bob|  bread|
| Mike| butter|
|Alice| apples|
|  Bob|oranges|
| Mike|   milk|
|Alice|  bread|
|  Bob| butter|
| Mike| apples|
|Alice|oranges|
+-----+-------+

item   apples  bread  butter  milk  oranges  All
name                                            
Alice       1      1       0     1        1    4
Bob         0      1       1     0        1    3
Mike        1      0       1     1        0    3
All         2      2       2     2        2   10
+---------+------+-----+------+----+-------+
|name_item|apples|bread|butter|milk|oranges|
+---------+------+-----+------+----+-------+
|      Bob|     0|    1|     1|   0|      1|
|     Mike|     1|    0|     1|   1|      0|
|    Alice|     1|    1|     0|   1|      1|
+---------+--

In [36]:
# 9、频繁项目元素
# 找出现次数最多的元素(频数分布)
df = spark.createDataFrame([(1, 2, 3) if i % 2 == 0 else (i, 2 * i, i % 4) for i in range(100)],
                           ["a", "b", "c"])
df.show(10)

# 下面的代码找到 指定列 出现次数占总的40%以上频繁元素
df.stat.freqItems(["a", "b", "c"], 0.4).show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  1|  2|  1|
|  1|  2|  3|
|  3|  6|  3|
|  1|  2|  3|
|  5| 10|  1|
|  1|  2|  3|
|  7| 14|  3|
|  1|  2|  3|
|  9| 18|  1|
+---+---+---+
only showing top 10 rows

+-----------+-----------+-----------+
|a_freqItems|b_freqItems|c_freqItems|
+-----------+-----------+-----------+
|    [11, 1]|    [2, 22]|     [1, 3]|
+-----------+-----------+-----------+



In [None]:
# 10. 数学函数
| log | 对数 |
| log2 | 以2为底的对数 |
| factorial | 阶乘 |

In [39]:
# 11、元素去重计数
from pyspark.sql import functions as func
sdf = spark.createDataFrame([(1, 2, 3) if i % 2 == 0 else (i, 2 * i, i % 4) for i in range(10)],
                           ["a", "b", "c"])
sdf.show()

# agg(*exprs)
# 在没有组的情况下汇总整个DataFrame（df.groupBy.agg（）的简写）。
sdf.agg(func.countDistinct('a')).show() # 看不懂

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  1|  2|  1|
|  1|  2|  3|
|  3|  6|  3|
|  1|  2|  3|
|  5| 10|  1|
|  1|  2|  3|
|  7| 14|  3|
|  1|  2|  3|
|  9| 18|  1|
+---+---+---+

+-----------------+
|count(DISTINCT a)|
+-----------------+
|                5|
+-----------------+



In [42]:
sdf.agg({"a": "max"}).collect() # 等于 sdf.groupBy().agg({"a":"max"}).collect()

[Row(max(a)=9)]

In [44]:
# 区别于
sdf.groupBy("a").agg({"b":"max"}).collect()

[Row(a=7, max(b)=14),
 Row(a=9, max(b)=18),
 Row(a=5, max(b)=10),
 Row(a=1, max(b)=2),
 Row(a=3, max(b)=6)]

In [52]:
# 12、聚合函数 grouping 
sdf.cube("a").agg(func.grouping("a"), func.sum("b")).orderBy("a").show() # 看不懂

+----+-----------+------+
|   a|grouping(a)|sum(b)|
+----+-----------+------+
|null|          1|    60|
|   1|          0|    12|
|   3|          0|     6|
|   5|          0|    10|
|   7|          0|    14|
|   9|          0|    18|
+----+-----------+------+



In [None]:
# 13、聚合函数 grouping_id
df.cube("a").agg(grouping_id(), sum("b")).orderBy("a").show() # 看不懂