In [1]:
import os

# Local toolchain locations used to configure Spark for this notebook.
# NOTE(review): these are machine-specific absolute paths; adjust them when
# running the notebook on another machine.
HADOOP_HOME = "C:/hadoop-3.4.0"
PYTHON_EXE = "C:/Users/usr/anaconda3/envs/pyspark_env/python.exe"

os.environ['JAVA_HOME'] = "C:/Program Files/Java/jdk-11"
os.environ['PYSPARK_PYTHON'] = PYTHON_EXE
os.environ['PYSPARK_DRIVER_PYTHON'] = PYTHON_EXE
os.environ['HADOOP_HOME'] = HADOOP_HOME
os.environ['HADOOP_COMMON_LIB_NATIVE_DIR'] = HADOOP_HOME + "/lib/native"

# Append the Hadoop bin directory only if it is not already on PATH, so that
# re-running this cell does not keep growing PATH with duplicate entries.
hadoop_bin = HADOOP_HOME + "/bin"
if hadoop_bin not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + hadoop_bin

# findspark is initialised only after the environment variables above are set.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# getOrCreate() hands back a SparkSession; the SparkContext is taken from it.
spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext

In [2]:
# Load the 'vuelos' Parquet file into a DataFrame.
flights_path = './data/data/vuelos.parquet'
df = spark.read.parquet(flights_path)

In [3]:
# Print the DataFrame's schema tree: column names, types and nullability.
df.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (null

In [4]:
# Display 20 rows as a text table without truncating cell contents.
df.show(n=20, truncate=False)

+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

### **Funciones count(), countDistinct() y approx_count_distinct()**

In [5]:
# Load the small example Parquet file used in the counting demos below.
sample_path = './data/data/dataframe.parquet'
df1 = spark.read.parquet(sample_path)

In [6]:
# Print the schema tree of df1: column names, types and nullability.
df1.printSchema()

root
 |-- nombre: string (nullable = true)
 |-- color: string (nullable = true)
 |-- cantidad: long (nullable = true)



In [7]:
# Display the rows of df1 as a text table (show() with its default arguments).
df1.show()

+------+-----+--------+
|nombre|color|cantidad|
+------+-----+--------+
|  Jose| azul|    1900|
|  null| null|    1700|
|  null| rojo|    1300|
|  Juan| rojo|    1500|
+------+-----+--------+



- **count()**

- Cuenta los valores de la columna indicada que no son nulos.

In [8]:
from pyspark.sql.functions import count

In [9]:
# count(<column>) tallies only the rows where that column is not null.
non_null_counts = [
    count('nombre').alias('conteo_nombre'),
    count('color').alias('conteo_color'),
]
df1.select(*non_null_counts).show()

+-------------+------------+
|conteo_nombre|conteo_color|
+-------------+------------+
|            2|           3|
+-------------+------------+



- count('*') cuenta todas las filas, aunque haya algún null en ellas.

In [10]:
# Per-column non-null counts next to count('*'), which counts every row.
name_count = count('nombre').alias('conteo_nombre')
color_count = count('color').alias('conteo_color')
row_count = count('*').alias('conteo_general')
df1.select(name_count, color_count, row_count).show()

+-------------+------------+--------------+
|conteo_nombre|conteo_color|conteo_general|
+-------------+------------+--------------+
|            2|           3|             4|
+-------------+------------+--------------+



- **countDistinct()**

In [11]:
from pyspark.sql.functions import countDistinct

In [12]:
# Number of distinct values found in the 'color' column.
distinct_colors = countDistinct('color').alias('colores_dif')
df1.select(distinct_colors).show()

+-----------+
|colores_dif|
+-----------+
|          2|
+-----------+



- **approx_count_distinct()**

In [13]:
from pyspark.sql.functions import approx_count_distinct

In [14]:
# Load the 'vuelos' Parquet file for the approximate-count demo.
flights_parquet = './data/data/vuelos.parquet'
dataframe = spark.read.parquet(flights_parquet)

In [15]:
# Print the schema tree of the flights DataFrame: column names, types and nullability.
dataframe.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (null

- En un conjunto de datos grande, calcular el conteo exacto de valores distintos con countDistinct() puede ser costoso; muchas veces nos bastará con el conteo aproximado que devuelve approx_count_distinct().

In [16]:
# Exact and approximate distinct counts of AIRLINE, shown side by side.
exact_airlines = countDistinct('AIRLINE')
approx_airlines = approx_count_distinct('AIRLINE')
dataframe.select(exact_airlines, approx_airlines).show()

+-----------------------+------------------------------+
|count(DISTINCT AIRLINE)|approx_count_distinct(AIRLINE)|
+-----------------------+------------------------------+
|                     14|                            13|
+-----------------------+------------------------------+



### **Funciones min() y max()**

In [19]:
# Load the 'vuelos' Parquet file for the min()/max() examples.
vuelos_path = './data/data/vuelos.parquet'
vuelos = spark.read.parquet(vuelos_path)

In [21]:
# Print the schema tree of the flights DataFrame: column names, types and nullability.
vuelos.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (null

In [22]:
# NOTE: importing `min` and `max` under these names rebinds Python's built-in
# min()/max() in this notebook's namespace from this cell onward.
# `col` is not used by any of the cells visible in this section.
from pyspark.sql.functions import min, max, col

In [23]:
# Smallest and largest AIR_TIME values, under descriptive column names.
air_time_bounds = [
    min('AIR_TIME').alias('menor_tiempo'),
    max('AIR_TIME').alias('mayor_tiempo'),
]
vuelos.select(*air_time_bounds).show()

+------------+------------+
|menor_tiempo|mayor_tiempo|
+------------+------------+
|           7|         690|
+------------+------------+



In [24]:
# Smallest and largest AIRLINE_DELAY values (default column headers kept).
delay_column = 'AIRLINE_DELAY'
vuelos.select(min(delay_column), max(delay_column)).show()

+------------------+------------------+
|min(AIRLINE_DELAY)|max(AIRLINE_DELAY)|
+------------------+------------------+
|                 0|              1971|
+------------------+------------------+



### **Funciones sum(), sumDistinct() y avg()**

- gitpush