# PySpark # 2

##### Índice:

    # Importação bibliotecas / funções
    # Criar Sessão PySpark
    # Criar DF / ler arquivo
    # Drop de Colunas
    
######    Window Ranking Function
    # Window Function 1 - Numero de linhas - row_number()
    # Window Function 2 - Ranking 1 - rank()
    # Window Function 3 - Ranking 2 - dense_rank()
    # Window Function 4 - Porcentagem Ranking - percent_rank()
    # Window Function 5 - Divisão em ' N ' partes - ntile()
    
######    Window Analytic Functions (Funções analíticas)
    # Window Function 6 - LAG / Degrau - lag()
    # Window Function 7 - Lead / Degrau - lead()
    # Agregações
    # GroupBy + AGG 1
    # GroupBy + AGG 2
    # Where
    # Describe
    # Window Function 8 - Função de agregação usando Window Function

##### Importação bibliotecas / funções

In [1]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Make both the Spark workers and the driver use the interpreter that runs this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [2]:
from pyspark.sql.window import Window # Importando window function

##### Criar / Iniciar Sessão PySpark

In [4]:
# Start (or reuse) a local session registered under the application name 'PySpark_02'.
spark = SparkSession.builder.master('local').appName('PySpark_02').getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001E0AEE57070>


##### Criar DF / ler arquivo

In [5]:
# Read the players CSV: the first line is the header and column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Arquivos/wc2018-players.csv')
)

##### Exibir DF

In [6]:
# Print the first 5 rows to check the load.
df.show(5)

+---------+---+----+------------------+----------+----------+--------------------+------+------+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66|
|Argentina| 18|  DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69|
|Argentina| 10|  FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
only showing top 5 rows



##### Alterações Aula PySpark 01

In [7]:
# Rename the original CSV headers to the Portuguese column names used from here on.
df = (
    df
    .withColumnRenamed('Team', 'Selecao')
    .withColumnRenamed('#', 'Numero')
    .withColumnRenamed('Pos.', 'Posicao')
    .withColumnRenamed('FIFA Popular Name', 'Nome_FIFA')
    .withColumnRenamed('Birth Date', 'Nascimento')
    .withColumnRenamed('Shirt Name', 'Nome Camiseta')
    .withColumnRenamed('Club', 'Time')
    .withColumnRenamed('Height', 'Altura')
    .withColumnRenamed('Weight', 'Peso')
)

In [8]:
def _campo_data(coluna, indice):
    """Return a Column holding the `indice`-th '.'-separated field of `coluna`.

    Uses the built-in `split` (whose pattern is a regex, hence the escaped dot)
    instead of a Python UDF, so the work stays inside Spark and a null input
    yields null rather than raising inside a Python worker.
    """
    return split(coluna, r'\.').getItem(indice)


def dia(coluna):
    """Day part (first field) of a 'dd.MM.yyyy' string column."""
    return _campo_data(coluna, 0)


def mes(coluna):
    """Month part (second field) of a 'dd.MM.yyyy' string column."""
    return _campo_data(coluna, 1)


def ano(coluna):
    """Year part (third field) of a 'dd.MM.yyyy' string column."""
    return _campo_data(coluna, 2)

In [9]:
# Derive the day / month / year columns from the text in 'Nascimento'.
# The column name uses the same casing in all three calls, so the cell does not
# depend on Spark's case-insensitive column resolution.
df = (
    df
    .withColumn('Dia', dia('Nascimento'))
    .withColumn('Mes', mes('Nascimento'))
    .withColumn('Ano', ano('Nascimento'))
)
# Join the parts as 'Ano-Mes-Dia' text and cast the result to a date column.
df = df.withColumn('Data_Nascimento', concat_ws('-', 'Ano', 'Mes', 'Dia').cast(DateType()))
df.show(5)

+---------+------+-------+------------------+----------+-------------+--------------------+------+----+---+---+----+---------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nascimento|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|
+---------+------+-------+------------------+----------+-------------+--------------------+------+----+---+---+----+---------------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|31.08.1992|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|21.01.1996|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|15.02.1993|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|13.07.1990|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|24.06.1987|        MESSI

In [11]:
# Print column names and types to confirm the renames and the new date column.
df.printSchema()

root
 |-- Selecao: string (nullable = true)
 |-- Numero: integer (nullable = true)
 |-- Posicao: string (nullable = true)
 |-- Nome_FIFA: string (nullable = true)
 |-- Nascimento: string (nullable = true)
 |-- Nome Camiseta: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Altura: integer (nullable = true)
 |-- Peso: integer (nullable = true)
 |-- Dia: string (nullable = true)
 |-- Mes: string (nullable = true)
 |-- Ano: string (nullable = true)
 |-- Data_Nascimento: date (nullable = true)



##### Fim das alterações Aula PySpark 01

In [12]:
# Collect the whole DataFrame to the driver as a pandas DataFrame for notebook display.
df.toPandas()

Unnamed: 0,Selecao,Numero,Posicao,Nome_FIFA,Nascimento,Nome Camiseta,Time,Altura,Peso,Dia,Mes,Ano,Data_Nascimento
0,Argentina,3,DF,TAGLIAFICO Nicolas,31.08.1992,TAGLIAFICO,AFC Ajax (NED),169,65,31,08,1992,1992-08-31
1,Argentina,22,MF,PAVON Cristian,21.01.1996,PAVÓN,CA Boca Juniors (ARG),169,65,21,01,1996,1996-01-21
2,Argentina,15,MF,LANZINI Manuel,15.02.1993,LANZINI,West Ham United FC (ENG),167,66,15,02,1993,1993-02-15
3,Argentina,18,DF,SALVIO Eduardo,13.07.1990,SALVIO,SL Benfica (POR),167,69,13,07,1990,1990-07-13
4,Argentina,10,FW,MESSI Lionel,24.06.1987,MESSI,FC Barcelona (ESP),170,72,24,06,1987,1987-06-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,Uruguay,15,MF,VECINO Matias,24.08.1991,VECINO,FC Internazionale (ITA),189,80,24,08,1991,1991-08-24
732,Uruguay,23,GK,SILVA Martin,25.03.1983,M. SILVA,CR Vasco da Gama (BRA),187,82,25,03,1983,1983-03-25
733,Uruguay,9,FW,SUAREZ Luis,24.01.1987,SUAREZ,FC Barcelona (ESP),182,85,24,01,1987,1987-01-24
734,Uruguay,18,FW,GOMEZ Maximiliano,14.08.1996,M. GOMEZ,Celta Vigo (ESP),186,85,14,08,1996,1996-08-14


##### Drop de Colunas

In [13]:
# Drop the original text date column and rebind `df` to the result.
df = df.drop('Nascimento')

In [15]:
# Display the DataFrame again to confirm the column was removed.
df.toPandas()

Unnamed: 0,Selecao,Numero,Posicao,Nome_FIFA,Nome Camiseta,Time,Altura,Peso,Dia,Mes,Ano,Data_Nascimento
0,Argentina,3,DF,TAGLIAFICO Nicolas,TAGLIAFICO,AFC Ajax (NED),169,65,31,08,1992,1992-08-31
1,Argentina,22,MF,PAVON Cristian,PAVÓN,CA Boca Juniors (ARG),169,65,21,01,1996,1996-01-21
2,Argentina,15,MF,LANZINI Manuel,LANZINI,West Ham United FC (ENG),167,66,15,02,1993,1993-02-15
3,Argentina,18,DF,SALVIO Eduardo,SALVIO,SL Benfica (POR),167,69,13,07,1990,1990-07-13
4,Argentina,10,FW,MESSI Lionel,MESSI,FC Barcelona (ESP),170,72,24,06,1987,1987-06-24
...,...,...,...,...,...,...,...,...,...,...,...,...
731,Uruguay,15,MF,VECINO Matias,VECINO,FC Internazionale (ITA),189,80,24,08,1991,1991-08-24
732,Uruguay,23,GK,SILVA Martin,M. SILVA,CR Vasco da Gama (BRA),187,82,25,03,1983,1983-03-25
733,Uruguay,9,FW,SUAREZ Luis,SUAREZ,FC Barcelona (ESP),182,85,24,01,1987,1987-01-24
734,Uruguay,18,FW,GOMEZ Maximiliano,M. GOMEZ,Celta Vigo (ESP),186,85,14,08,1996,1996-08-14


##### Criar Backup

In [16]:
# Keep a second name bound to the current DataFrame object.
# This is a name binding, not a copy; later cells rebind `df`, which leaves `df2` untouched.
df2 = df

#### Window Ranking Functions

- Window Function 1 - Numero de linhas - row_number()
- Window Function 2 - Ranking 1 - rank()
- Window Function 3 - Ranking 2 - dense_rank()
- Window Function 4 - Porcentagem Ranking - percent_rank()
- Window Function 5 - Divisão em ' N ' partes - ntile()

In [17]:
# Print the first 5 rows as a reference point before the window-function examples.
df.show(5)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|        MESSI|  FC Barcelona (ESP)|   170|  72| 24| 06|1987|     1987-06-24|
+---------+------+------

##### Window Function 1 - Numero de linhas - row_number()

In [20]:
# One window per team, tallest player first.
num_linha = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# row_number() gives consecutive numbers 1, 2, 3, ... inside each team, even for equal heights.
(
    df
    .withColumn('n', row_number().over(num_linha))
    .show(30)
)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+---+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|  n|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+---+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|  1|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|  2|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|  3|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|  4|
|Argentina|    23|     GK|CABALLERO Wilfredo|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1981-09-

##### Window Function 2 - Ranking 1 - rank()

In [21]:
# One window per team, tallest player first.
rank1 = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# rank() gives tied heights the same position and leaves a gap after the tie.
(
    df
    .withColumn('rank', rank().over(rank1))
    .show(50)
)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+----+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|rank|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+----+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|   1|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|   2|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|   3|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|   3|
|Argentina|    23|     GK|CABALLERO Wilfredo|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1

##### Window Function 3 - Ranking 2 - dense_rank()

In [23]:
# One window per team, tallest player first.
rank2 = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# dense_rank() gives tied heights the same position without leaving gaps afterwards.
(
    df
    .withColumn('rank2', dense_rank().over(rank2))
    .show(50)
)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+-----+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|rank2|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+-----+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|    1|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|    2|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|    3|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|    3|
|Argentina|    23|     GK|CABALLERO Wilfredo|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981

##### Window Function 4 - Porcentagem Ranking - percent_rank()

In [28]:
# One window per team, tallest player first.
porcentagem = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# percent_rank() gives the relative position inside the team; only 50 rows are collected for display.
(
    df
    .withColumn('%', percent_rank().over(porcentagem))
    .limit(50)
    .toPandas()
)

Unnamed: 0,Selecao,Numero,Posicao,Nome_FIFA,Nome Camiseta,Time,Altura,Peso,Dia,Mes,Ano,Data_Nascimento,%
0,Argentina,6,DF,FAZIO Federico,FAZIO,AS Roma (ITA),199,85,17,3,1987,1987-03-17,0.0
1,Argentina,1,GK,GUZMAN Nahuel,GUZMÁN,Tigres UANL (MEX),192,90,10,2,1986,1986-02-10,0.045455
2,Argentina,16,DF,ROJO Marcos,ROJO,Manchester United FC (ENG),189,82,20,3,1990,1990-03-20,0.090909
3,Argentina,12,GK,ARMANI Franco,ARMANI,CA River Plate (ARG),189,85,16,10,1986,1986-10-16,0.090909
4,Argentina,23,GK,CABALLERO Wilfredo,CABALLERO,Chelsea FC (ENG),186,80,28,9,1981,1981-09-28,0.181818
5,Argentina,9,FW,HIGUAIN Gonzalo,HIGUAÍN,Juventus FC (ITA),184,75,10,12,1987,1987-12-10,0.227273
6,Argentina,4,DF,ANSALDI Cristian,ANSALDI,Torino FC (ITA),181,73,20,9,1986,1986-09-20,0.272727
7,Argentina,2,DF,MERCADO Gabriel,MERCADO,Sevilla FC (ESP),181,81,18,3,1987,1987-03-18,0.272727
8,Argentina,17,DF,OTAMENDI Nicolas,OTAMENDI,Manchester City FC (ENG),181,81,12,2,1988,1988-02-12,0.272727
9,Argentina,13,MF,MEZA Maximiliano,MEZA,CA Independiente (ARG),180,76,15,12,1992,1992-12-15,0.409091


##### Window Function 5 - Divisão em ' N ' partes - ntile()

In [32]:
# One window per team, tallest player first.
parte = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# ntile(5) splits each team's ordered rows into 5 buckets numbered 1 to 5.
(
    df
    .withColumn('Par', ntile(5).over(parte))
    .show(50)
)

+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+---+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|Par|
+---------+------+-------+------------------+-------------+--------------------+------+----+---+---+----+---------------+---+
|Argentina|     6|     DF|    FAZIO Federico|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|  1|
|Argentina|     1|     GK|     GUZMAN Nahuel|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|  1|
|Argentina|    16|     DF|       ROJO Marcos|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|  1|
|Argentina|    12|     GK|     ARMANI Franco|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|  1|
|Argentina|    23|     GK|CABALLERO Wilfredo|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1981-09-

#### Window Analytic Functions (Funções analíticas)

In [33]:
# Drop the FIFA name column so the wide tables below fit on screen.
# The name is spelled exactly as it was created ('Nome_FIFA'), so the drop does not
# depend on Spark's case-insensitive column resolution.
df = df.drop('Nome_FIFA')

##### Window Function 6 - LAG / Degrau - lag()

In [34]:
# One window per team, tallest player first.
degrau = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# lag('Altura', 5) reads the height from 5 rows earlier in the same team;
# the first 5 rows of each team have no such row and get null.
(
    df
    .withColumn('degrau', lag('Altura', 5).over(degrau))
    .show(50)
)

+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|degrau|
+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+
|Argentina|     6|     DF|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|  null|
|Argentina|     1|     GK|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|  null|
|Argentina|    16|     DF|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|  null|
|Argentina|    12|     GK|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|  null|
|Argentina|    23|     GK|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1981-09-28|  null|
|Argentina|     9|     FW|      HIGUAÍN|   Juventus FC (ITA)|   184|  75| 10| 12|1987|     1987-12-10|   199|
|Argentina

##### Window Function 7 - Lead / Degrau - lead()

In [35]:
# One window per team, tallest player first.
degrau = Window.partitionBy('Selecao').orderBy(col('Altura').desc())

# lead('Altura') reads the height from the next row in the same team (default offset).
(
    df
    .withColumn('degrau', lead('Altura').over(degrau))
    .show(50)
)

+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|degrau|
+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+------+
|Argentina|     6|     DF|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|   192|
|Argentina|     1|     GK|       GUZMÁN|   Tigres UANL (MEX)|   192|  90| 10| 02|1986|     1986-02-10|   189|
|Argentina|    16|     DF|         ROJO|Manchester United...|   189|  82| 20| 03|1990|     1990-03-20|   189|
|Argentina|    12|     GK|       ARMANI|CA River Plate (ARG)|   189|  85| 16| 10|1986|     1986-10-16|   186|
|Argentina|    23|     GK|    CABALLERO|    Chelsea FC (ENG)|   186|  80| 28| 09|1981|     1981-09-28|   184|
|Argentina|     9|     FW|      HIGUAÍN|   Juventus FC (ITA)|   184|  75| 10| 12|1987|     1987-12-10|   181|
|Argentina

#### Agregações

##### GroupBy + AGG 1

In [42]:
# Average height per team, tallest average first.
# The aggregate column gets the generated name 'avg(Altura)', which is used for sorting.
(
    df
    .groupBy('Selecao')
    .agg(avg('Altura'))
    .orderBy('avg(Altura)', ascending=False)
    .toPandas()
)

Unnamed: 0,Selecao,avg(Altura)
0,Serbia,186.695652
1,Denmark,186.608696
2,Germany,185.782609
3,Sweden,185.73913
4,Iceland,185.521739
5,Belgium,185.347826
6,Croatia,185.26087
7,Nigeria,184.521739
8,IR Iran,184.478261
9,Russia,184.391304


##### GroupBy + AGG 2

In [46]:
# Maximum height per team, tallest first.
# `max` here is the pyspark.sql.functions aggregate brought in by the star import.
(
    df
    .groupBy('Selecao')
    .agg(max('Altura'))
    .orderBy('max(Altura)', ascending=False)
    .toPandas()
)

Unnamed: 0,Selecao,max(Altura)
0,Croatia,201
1,Denmark,200
2,Argentina,199
3,Belgium,199
4,Sweden,198
5,Iceland,198
6,France,197
7,Nigeria,197
8,Korea Republic,197
9,Panama,197


##### Where

In [49]:
# Keep only Brazil's defenders: both conditions must hold, combined in one Column predicate.
df.where((col('Selecao') == 'Brazil') & (col('Posicao') == 'DF')).show(25)

where((condição 1) & (condição 2))  ou  where((condição 1) | (condição 2))

+-------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+
|Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|
+-------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+
| Brazil|    22|     DF|       FAGNER|SC Corinthians (BRA)|   168|  67| 11| 06|1989|     1989-06-11|
| Brazil|     6|     DF|  FILIPE LUIS|Atletico Madrid (...|   182|  73| 09| 08|1985|     1985-08-09|
| Brazil|    13|     DF|   MARQUINHOS|Paris Saint-Germa...|   183|  75| 14| 05|1994|     1994-05-14|
| Brazil|     3|     DF|      MIRANDA|FC Internazionale...|   186|  78| 07| 09|1984|     1984-09-07|
| Brazil|    14|     DF|       DANILO|Manchester City F...|   184|  78| 15| 07|1991|     1991-07-15|
| Brazil|     2|     DF|     T. SILVA|Paris Saint-Germa...|   183|  79| 22| 09|1984|     1984-09-22|
| Brazil|    12|     DF|      MARCELO|Real Madrid CF (ESP)|   174|  80| 12| 05|1988|     19

In [69]:
# One window per team, tallest player first.
top1 = Window.partitionBy('Selecao').orderBy(desc("Altura"))

# Keep the row numbered 1 in each team, i.e. one tallest player per team.
# 'Top' is an integer column, so it is compared to the integer 1 rather than the
# string "1", which only worked through an implicit cast.
# NOTE(review): the window orders by height only, so if two players share a team's
# maximum height, which one gets row number 1 is presumably arbitrary. Confirm
# whether a tie-breaker column is wanted.
(
    df
    .withColumn('Top', row_number().over(top1))
    .filter(col('Top') == 1)
    .show(50)
)

+--------------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+---+
|       Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|Top|
+--------------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+---+
|     Argentina|     6|     DF|        FAZIO|       AS Roma (ITA)|   199|  85| 17| 03|1987|     1987-03-17|  1|
|     Australia|    12|     GK|        JONES|Feyenoord Rotterd...|   193|  87| 19| 03|1982|     1982-03-19|  1|
|       Belgium|     1|     GK|     COURTOIS|    Chelsea FC (ENG)|   199|  91| 11| 05|1992|     1992-05-11|  1|
|        Brazil|    16|     GK|       CASSIO|SC Corinthians (BRA)|   195|  92| 06| 06|1987|     1987-06-06|  1|
|      Colombia|    13|     DF|      Y. MINA|  FC Barcelona (ESP)|   194|  95| 23| 09|1994|     1994-09-23|  1|
|    Costa Rica|    19|     DF|    K. WASTON|Vancouver Whiteca...|   196|  87| 01| 01|1988|     1988-01-

##### Describe

In [51]:
# Summary statistics (count, mean, stddev, min, max) for the whole DataFrame, shown via pandas.
df.describe().toPandas()

Unnamed: 0,summary,Selecao,Numero,Posicao,Nome Camiseta,Time,Altura,Peso,Dia,Mes,Ano
0,count,736,736.0,736,736,736,736.0,736.0,736.0,736.0,736.0
1,mean,,12.0,,,,182.4076086956522,77.18885869565217,15.793478260869565,5.8790760869565215,1990.110054347826
2,stddev,,6.637760461599851,,,,6.930924233929302,7.233778346883639,8.761123828732469,3.3782493094684387,3.9074472063626775
3,min,Argentina,1.0,DF,A. ASHRAF,1. FC Köln (GER),165.0,59.0,1.0,1.0,1973.0
4,max,Uruguay,23.0,MF,ŽIVKOVIĆ,Étoile du Sahel (TUN),201.0,99.0,31.0,12.0,1999.0


In [52]:
# Same summary statistics, restricted to Brazil's squad.
(
    df
    .where(col('Selecao') == 'Brazil')
    .describe()
    .toPandas()
)

Unnamed: 0,summary,Selecao,Numero,Posicao,Nome Camiseta,Time,Altura,Peso,Dia,Mes,Ano
0,count,23,23.0,23,23,23,23.0,23.0,23.0,23.0,23.0
1,mean,,12.0,,,,180.7826086956522,76.56521739130434,11.26086956521739,6.130434782608695,1989.391304347826
2,stddev,,6.782329983125267,,,,7.354383490255254,8.239737898283606,6.876953549252538,2.784769418006175,3.499858833968506
3,min,Brazil,1.0,DF,A. BECKER,AS Roma (ITA),168.0,64.0,2.0,1.0,1984.0
4,max,Brazil,23.0,MF,WILLIAN,SC Corinthians (BRA),195.0,92.0,25.0,10.0,1997.0


##### Window Function 8 - Função de agregação usando Window Function

In [53]:
# Ordered window: used only to number the rows inside each team.
parametro = Window.partitionBy('Selecao').orderBy(desc("Altura"))
# Unordered window: aggregates computed over it cover the whole team.
parametro2 = Window.partitionBy('Selecao')

# Attach per-team average / max / min height to every row, then keep one row per
# team (row number 1) to get a groupBy-like summary built from window functions.
# `avg`, `max` and `min` are the pyspark.sql.functions aggregates from the star import.
# 'linhax' is an integer column, so it is compared to the integer 1 rather than the
# string "1", which only worked through an implicit cast.
(
    df
    .withColumn('linhax', row_number().over(parametro))
    .withColumn('media', avg('Altura').over(parametro2))
    .withColumn('max', max('Altura').over(parametro2))
    .withColumn('min', min('Altura').over(parametro2))
    .filter(col('linhax') == 1)
    .select('Selecao', 'media', 'max', 'min')
    .orderBy('media', ascending=False)
    .toPandas()
)

Unnamed: 0,Selecao,media,max,min
0,Serbia,186.695652,195,169
1,Denmark,186.608696,200,171
2,Germany,185.782609,195,176
3,Sweden,185.73913,198,177
4,Iceland,185.521739,198,170
5,Belgium,185.347826,199,169
6,Croatia,185.26087,201,172
7,Nigeria,184.521739,197,172
8,IR Iran,184.478261,194,177
9,Russia,184.391304,196,173


In [55]:
# NOTE(review): this cell repeats the previous window-aggregation cell verbatim.
# Ordered window: used only to number the rows inside each team.
parametro = Window.partitionBy('Selecao').orderBy(desc("Altura"))
# Unordered window: aggregates computed over it cover the whole team.
parametro2 = Window.partitionBy('Selecao')

# Attach per-team average / max / min height to every row, then keep one row per
# team (row number 1). 'linhax' is an integer column, so it is compared to the
# integer 1 rather than the string "1", which only worked through an implicit cast.
(
    df
    .withColumn('linhax', row_number().over(parametro))
    .withColumn('media', avg('Altura').over(parametro2))
    .withColumn('max', max('Altura').over(parametro2))
    .withColumn('min', min('Altura').over(parametro2))
    .filter(col('linhax') == 1)
    .select('Selecao', 'media', 'max', 'min')
    .orderBy('media', ascending=False)
    .toPandas()
)

Unnamed: 0,Selecao,media,max,min
0,Serbia,186.695652,195,169
1,Denmark,186.608696,200,171
2,Germany,185.782609,195,176
3,Sweden,185.73913,198,177
4,Iceland,185.521739,198,170
5,Belgium,185.347826,199,169
6,Croatia,185.26087,201,172
7,Nigeria,184.521739,197,172
8,IR Iran,184.478261,194,177
9,Russia,184.391304,196,173
