In [18]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("s8a-dataframes-sql")
    .getOrCreate()
)

# Load the sales CSV: fields are separated by ';', the first row holds the
# column names, and column types are inferred from the data.
df = spark.read.csv(
    "pdi_sales_small.csv",
    sep=";",
    header=True,
    inferSchema=True,
)

# Display the inferred column names and types.
df.printSchema()



root
 |-- ProductID: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Units: integer (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Country: string (nullable = true)



                                                                                

In [19]:
from pyspark.sql.functions import col, trim, to_date
# Parse the "Date" column, which is read from the CSV as a string, into a
# real date using the month/day/year pattern below.
# The conversion is guarded so that re-running this cell is harmless: once the
# column is no longer a string it is left untouched, instead of being parsed a
# second time with a pattern that no longer matches its contents.
if dict(df.dtypes)["Date"] == "string":
    df = df.withColumn("Date", to_date(df.Date, "M/d/yyy"))

In [23]:
# Show the rows for Canada. The raw Country values carry trailing whitespace
# (the value is 7 characters long, "Canada" plus one space), so compare the
# trimmed value rather than hard-coding the padding into the literal. This
# matches how the other cells in this notebook filter by country.
df.where(trim(df.Country) == "Canada").show()

+---------+----------+---------------+-----+-------+-------+
|ProductID|      Date|            Zip|Units|Revenue|Country|
+---------+----------+---------------+-----+-------+-------+
|      725|1999-01-15|H1B            |    1|  115.4|Canada |
|     2235|1999-01-15|H1B            |    2|  131.1|Canada |
|      713|1999-01-15|H1B            |    1|  160.1|Canada |
|      574|2002-06-05|H1B            |    1|  869.1|Canada |
|       94|1999-02-15|H1B            |    1|  866.2|Canada |
|      609|1999-02-15|H1B            |    1|  778.8|Canada |
|     2064|1999-03-15|H1B            |    2|  976.4|Canada |
|      714|1999-01-15|H1B            |    1|  160.1|Canada |
|      826|2002-05-31|H1B            |    1|  944.9|Canada |
|     2149|2002-06-06|H1B            |    2|  871.4|Canada |
|      992|1999-02-15|H1B            |    1|  288.7|Canada |
|      726|1999-01-15|M4X            |    1|  115.4|Canada |
|      725|1999-01-15|M4X            |    1|  115.4|Canada |
|      910|1999-03-15|M4

In [21]:
from pyspark.sql.functions import *
# Date helpers applied to the parsed "Date" column, first 3 rows only:
# custom formatting, the next Sunday, the last day of the month, and the
# day-of-month / day-of-year / month / year components.
df.select(
    [
        "Date",
        date_format("Date", "dd-MM-yyy"),
        next_day("Date", "Sun"),
        last_day("Date"),
        dayofmonth("Date"),
        dayofyear("Date"),
        month("Date"),
        year("Date"),
    ]
).show(3)

+----------+----------------------------+-------------------+--------------+----------------+---------------+-----------+----------+
|      Date|date_format(Date, dd-MM-yyy)|next_day(Date, Sun)|last_day(Date)|dayofmonth(Date)|dayofyear(Date)|month(Date)|year(Date)|
+----------+----------------------------+-------------------+--------------+----------------+---------------+-----------+----------+
|1999-01-15|                  15-01-1999|         1999-01-17|    1999-01-31|              15|             15|          1|      1999|
|2002-06-06|                  06-06-2002|         2002-06-09|    2002-06-30|               6|            157|          6|      2002|
|2002-06-06|                  06-06-2002|         2002-06-09|    2002-06-30|               6|            157|          6|      2002|
+----------+----------------------------+-------------------+--------------+----------------+---------------+-----------+----------+
only showing top 3 rows



In [37]:
# Whitespace and case helpers on "Zip", shown for a single Canadian row.
# "l" is the left-trimmed value and "r" the right-trimmed one.
(
    df.filter(trim(df.Country) == "Canada")
    .select(
        "Zip",
        ltrim("Zip").alias("l"),
        rtrim("Zip").alias("r"),
        lower("Zip"),
        upper("Zip"),
    )
    .show(1)
)

+---------------+---------------+---+---------------+---------------+
|            Zip|              l|  r|     lower(Zip)|     upper(Zip)|
+---------------+---------------+---+---------------+---------------+
|H1B            |H1B            |H1B|h1b            |H1B            |
+---------------+---------------+---+---------------+---------------+
only showing top 1 row



In [45]:
# String helpers on "Country", shown for a single Canadian row:
# capitalised form of the lower-cased value, reversed text, length, and a
# character-for-character translation ('n' -> 'p', 'a' -> 'e').
(
    df.filter(trim(df.Country) == "Canada")
    .select(
        "Country",
        initcap(lower("Country")),
        reverse("Country"),
        length("Country"),
        translate("Country", "na", "pe"),
    )
    .show(1)
)

+-------+-----------------------+----------------+---------------+--------------------------+
|Country|initcap(lower(Country))|reverse(Country)|length(Country)|translate(Country, na, pe)|
+-------+-----------------------+----------------+---------------+--------------------------+
|Canada |                Canada |          adanaC|              7|                   Cepede |
+-------+-----------------------+----------------+---------------+--------------------------+
only showing top 1 row



In [48]:
# Splitting and searching "Country", shown for a single Canadian row:
# split on the letter "a", 1-based position of the first "a", and the
# 2-character substring starting at position 3.
(
    df.filter(trim(df.Country) == "Canada")
    .select(
        "Country",
        split("Country", "a"),
        locate("a", "Country"),
        substring("Country", 3, 2),
    )
    .show(1)
)

+-------+---------------------+---------------------+------------------------+
|Country|split(Country, a, -1)|locate(a, Country, 1)|substring(Country, 3, 2)|
+-------+---------------------+---------------------+------------------------+
|Canada |         [C, n, d,  ]|                    2|                      na|
+-------+---------------------+---------------------+------------------------+
only showing top 1 row

