In [2]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("s8a-dataframes-sql")
    .getOrCreate()
)

# Read the CSV: ';' is the column separator, the first row is the header,
# and column types are inferred from the data.
df = (
    spark.read
    .options(sep=";", header="true", inferSchema="true")
    .csv("pdi_sales_small.csv")
)

df.printSchema()

root
 |-- ProductID: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Units: integer (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Country: string (nullable = true)



In [3]:
from pyspark.sql.functions import col, trim, to_date
# Replace the string "Date" column with a parsed date column.
# The pattern string is kept exactly as originally written.
df = df.withColumn(
    "Date",
    to_date(df["Date"], "M/d/yyy"),
)

In [4]:
# Show the Canadian sales.
# The stored Country value is padded ("Canada " with a trailing space), so the
# comparison is done on the trimmed value rather than on a hard-coded padded
# literal; this matches how the later cells filter by country.
df.where(trim(df.Country) == "Canada").show()

+---------+----------+---------------+-----+-------+-------+
|ProductID|      Date|            Zip|Units|Revenue|Country|
+---------+----------+---------------+-----+-------+-------+
|      725|1999-01-15|H1B            |    1|  115.4|Canada |
|     2235|1999-01-15|H1B            |    2|  131.1|Canada |
|      713|1999-01-15|H1B            |    1|  160.1|Canada |
|      574|2002-06-05|H1B            |    1|  869.1|Canada |
|       94|1999-02-15|H1B            |    1|  866.2|Canada |
|      609|1999-02-15|H1B            |    1|  778.8|Canada |
|     2064|1999-03-15|H1B            |    2|  976.4|Canada |
|      714|1999-01-15|H1B            |    1|  160.1|Canada |
|      826|2002-05-31|H1B            |    1|  944.9|Canada |
|     2149|2002-06-06|H1B            |    2|  871.4|Canada |
|      992|1999-02-15|H1B            |    1|  288.7|Canada |
|      726|1999-01-15|M4X            |    1|  115.4|Canada |
|      725|1999-01-15|M4X            |    1|  115.4|Canada |
|      910|1999-03-15|M4

In [5]:
from pyspark.sql.functions import *
# Date helpers applied to the parsed "Date" column: custom formatting, the next
# Sunday, the last day of the month, and the individual date parts.
df.select(
    "Date",
    date_format("Date", "dd-MM-yyy"),
    next_day("Date", "Sun"),
    last_day("Date"),
    dayofmonth("Date"),
    dayofyear("Date"),
    month("Date"),
    year("Date"),
).show(n=3)

+----------+----------------------------+-------------------+--------------+----------------+---------------+-----------+----------+
|      Date|date_format(Date, dd-MM-yyy)|next_day(Date, Sun)|last_day(Date)|dayofmonth(Date)|dayofyear(Date)|month(Date)|year(Date)|
+----------+----------------------------+-------------------+--------------+----------------+---------------+-----------+----------+
|1999-01-15|                  15-01-1999|         1999-01-17|    1999-01-31|              15|             15|          1|      1999|
|2002-06-06|                  06-06-2002|         2002-06-09|    2002-06-30|               6|            157|          6|      2002|
|2002-06-06|                  06-06-2002|         2002-06-09|    2002-06-30|               6|            157|          6|      2002|
+----------+----------------------------+-------------------+--------------+----------------+---------------+-----------+----------+
only showing top 3 rows



In [6]:
# Whitespace trimming and case conversion on "Zip", shown for one Canadian row.
# The country filter is applied first because the projection below does not
# keep the "Country" column.
(
    df.where(trim(df.Country) == "Canada")
    .select(
        "Zip",
        ltrim("Zip").alias("l"),
        rtrim("Zip").alias("r"),
        lower("Zip"),
        upper("Zip"),
    )
    .show(1)
)

+---------------+---------------+---+---------------+---------------+
|            Zip|              l|  r|     lower(Zip)|     upper(Zip)|
+---------------+---------------+---+---------------+---------------+
|H1B            |H1B            |H1B|h1b            |H1B            |
+---------------+---------------+---+---------------+---------------+
only showing top 1 row



In [7]:
# String helpers applied to "Country" (capitalisation, reversal, length and
# character translation), shown for one Canadian row.
(
    df.where(trim(df.Country) == "Canada")
    .select(
        "Country",
        initcap(lower("Country")),
        reverse("Country"),
        length("Country"),
        translate("Country", "na", "pe"),
    )
    .show(1)
)

+-------+-----------------------+----------------+---------------+--------------------------+
|Country|initcap(lower(Country))|reverse(Country)|length(Country)|translate(Country, na, pe)|
+-------+-----------------------+----------------+---------------+--------------------------+
|Canada |                Canada |          adanaC|              7|                   Cepede |
+-------+-----------------------+----------------+---------------+--------------------------+
only showing top 1 row



In [8]:
# Splitting, searching and slicing the "Country" string, shown for one
# Canadian row.
(
    df.where(trim(df.Country) == "Canada")
    .select(
        "Country",
        split("Country", "a"),
        locate("a", "Country"),
        substring("Country", 3, 2),
    )
    .show(1)
)

+-------+---------------------+---------------------+------------------------+
|Country|split(Country, a, -1)|locate(a, Country, 1)|substring(Country, 3, 2)|
+-------+---------------------+---------------------+------------------------+
|Canada |         [C, n, d,  ]|                    2|                      na|
+-------+---------------------+---------------------+------------------------+
only showing top 1 row



In [9]:
# The five sales with the highest unit counts.
(
    df.select("ProductID", "Revenue", "Units")
    .orderBy("Units", ascending=False)
    .show(n=5)
)

+---------+-------+-----+
|ProductID|Revenue|Units|
+---------+-------+-----+
|      495|43194.1|   77|
|     2091| 6347.7|   41|
|     2091| 6240.1|   41|
|     2091| 3652.7|   24|
|     2091| 3560.9|   23|
+---------+-------+-----+
only showing top 5 rows



In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def bonus(unidades, ventas):
    """Compute the bonus for a sale from its unit count and revenue.

    Args:
        unidades: Number of units sold (the "Units" column when used as a UDF).
        ventas: Revenue of the sale (the "Revenue" column when used as a UDF).

    Returns:
        0.0 for single-unit sales; ``unidades * ventas / 100`` otherwise;
        None when a value needed for the computation is missing.
    """
    # Single-unit sales never earn a bonus, whatever the revenue is.
    if unidades == 1:
        return 0.0
    # Both source columns are nullable; a missing value arrives here as None.
    # Return None (a null result) instead of letting the arithmetic below raise
    # TypeError and abort the whole job.
    if unidades is None or ventas is None:
        return None
    return unidades * ventas / 100
    
# Wrap the plain Python function as a Spark UDF that returns a double,
# for use with the DataFrame API.
udfBonus = udf(f=bonus, returnType=DoubleType())

In [11]:
# The five sales with the highest unit counts, together with the bonus
# computed by the UDF from each row's units and revenue.
(
    df.select(
        "ProductID",
        "Revenue",
        "Units",
        udfBonus(df.Units, df.Revenue),
    )
    .orderBy("Units", ascending=False)
    .show(n=5)
)

+---------+-------+-----+---------------------+
|ProductID|Revenue|Units|bonus(Units, Revenue)|
+---------+-------+-----+---------------------+
|      495|43194.1|   77|   33259.456999999995|
|     2091| 6347.7|   41|             2602.557|
|     2091| 6240.1|   41|   2558.4410000000003|
|     2091| 3652.7|   24|    876.6479999999999|
|     2091| 3560.9|   23|              819.007|
+---------+-------+-----+---------------------+
only showing top 5 rows



In [13]:
# Expose the DataFrame to Spark SQL as the temporary view "ventas".
df.createOrReplaceTempView(name="ventas")

In [18]:
# Register the Python `bonus` function so SQL queries can call it as "udfBonus".
spark.udf.register(name="udfBonus", f=bonus, returnType=DoubleType())

# Top five sales by units with their bonus, expressed as SQL over the "ventas" view.
spark.sql(
    "select ProductID, Revenue, Units,  udfBonus(Units, Revenue) as bonus from ventas order by Units desc"
).show(n=5)

+---------+-------+-----+------------------+
|ProductID|Revenue|Units|             bonus|
+---------+-------+-----+------------------+
|      495|43194.1|   77|33259.456999999995|
|     2091| 6347.7|   41|          2602.557|
|     2091| 6240.1|   41|2558.4410000000003|
|     2091| 3652.7|   24| 876.6479999999999|
|     2091| 3560.9|   23|           819.007|
+---------+-------+-----+------------------+
only showing top 5 rows

