### Dataframes, Spark SQL e Parquet - práctica

In [1]:
# Carga o ficheiro purchases.txt desde o sistema de ficheiros local

In [1]:
# Read the tab-separated purchases file through the file:// scheme (local filesystem).
# No header or schema option is set here; the columns are renamed in a later step.
df = (
    spark.read
    .option("sep", "\t")
    .csv("file:///home/hduser/Escritorio/purchases.txt")
)

In [2]:
# Crea unha táboa sobre o dataframe para poder realizar consultas SQL

In [2]:
# Map the positional CSV column names (_c0.._c5) to descriptive ones.
column_names = {
    "_c0": "fecha",
    "_c1": "hora",
    "_c2": "tenda",
    "_c3": "categoria",
    "_c4": "vendas",
    "_c5": "pago",
}
for default_name, descriptive_name in column_names.items():
    df = df.withColumnRenamed(default_name, descriptive_name)

# Expose the renamed dataframe to Spark SQL under the name 'purchases'.
df.createOrReplaceTempView('purchases')

In [3]:
# Mostra os datos do dataframe

In [3]:
# Print the first rows of the dataframe as a text table
# (20 rows with long values truncated, i.e. the defaults spelled out).
df.show(n=20, truncate=True)

+----------+-----+--------------+--------------------+------+----------+
|     fecha| hora|         tenda|           categoria|vendas|      pago|
+----------+-----+--------------+--------------------+------+----------+
|2012-01-01|09:00|      San Jose|      Men's Clothing|214.05|      Amex|
|2012-01-01|09:00|    Fort Worth|    Women's Clothing|153.57|      Visa|
|2012-01-01|09:00|     San Diego|               Music| 66.08|      Cash|
|2012-01-01|09:00|    Pittsburgh|        Pet Supplies|493.51|  Discover|
|2012-01-01|09:00|         Omaha| Children's Clothing|235.63|MasterCard|
|2012-01-01|09:00|      Stockton|      Men's Clothing|247.18|MasterCard|
|2012-01-01|09:00|        Austin|             Cameras| 379.6|      Visa|
|2012-01-01|09:00|      New York|Consumer Electronics| 296.8|      Cash|
|2012-01-01|09:00|Corpus Christi|                Toys| 25.38|  Discover|
|2012-01-01|09:00|    Fort Worth|                Toys|213.88|      Visa|
|2012-01-01|09:00|     Las Vegas|         Video Gam

In [4]:
# Mostra os datos da táboa cunha consulta SQL

In [4]:
# Same data, this time fetched through the 'purchases' temporary view with SQL.
all_purchases = spark.sql('SELECT * FROM purchases')
all_purchases.show()

+----------+-----+--------------+--------------------+------+----------+
|     fecha| hora|         tenda|           categoria|vendas|      pago|
+----------+-----+--------------+--------------------+------+----------+
|2012-01-01|09:00|      San Jose|      Men's Clothing|214.05|      Amex|
|2012-01-01|09:00|    Fort Worth|    Women's Clothing|153.57|      Visa|
|2012-01-01|09:00|     San Diego|               Music| 66.08|      Cash|
|2012-01-01|09:00|    Pittsburgh|        Pet Supplies|493.51|  Discover|
|2012-01-01|09:00|         Omaha| Children's Clothing|235.63|MasterCard|
|2012-01-01|09:00|      Stockton|      Men's Clothing|247.18|MasterCard|
|2012-01-01|09:00|        Austin|             Cameras| 379.6|      Visa|
|2012-01-01|09:00|      New York|Consumer Electronics| 296.8|      Cash|
|2012-01-01|09:00|Corpus Christi|                Toys| 25.38|  Discover|
|2012-01-01|09:00|    Fort Worth|                Toys|213.88|      Visa|
|2012-01-01|09:00|     Las Vegas|         Video Gam

In [5]:
# Conta o número de vendas por tenda

In [5]:
# Number of purchase rows per store; COUNT(vendas) skips rows where vendas is NULL.
sales_per_store_query = 'SELECT tenda, COUNT(vendas) FROM purchases GROUP BY tenda'
spark.sql(sales_per_store_query).show()



+---------------+-------------+
|          tenda|count(vendas)|
+---------------+-------------+
|North Las Vegas|        40013|
|        Phoenix|        40333|
|          Omaha|        40209|
|      Anchorage|        39806|
|        Anaheim|        40086|
|     Greensboro|        40232|
|         Dallas|        40368|
|        Oakland|        39728|
|         Laredo|        40342|
|     Scottsdale|        40173|
|    San Antonio|        40197|
|    Bakersfield|        40326|
|        Raleigh|        40261|
|    Chula Vista|        40080|
|   Philadelphia|        40748|
|     Louisville|        40099|
|    Los Angeles|        40254|
|       Chandler|        39826|
|     Sacramento|        40561|
|   Indianapolis|        40321|
+---------------+-------------+
only showing top 20 rows



                                                                                

In [6]:
# Fai un total de vendas por categoría

In [6]:
# Number of purchase rows per category.
# NOTE(review): COUNT(vendas) counts rows; if a monetary total per category is
# what is wanted, the aggregate would need to be SUM(vendas) — confirm intent.
sales_per_category_query = 'SELECT categoria, COUNT(vendas) FROM purchases GROUP BY categoria'
spark.sql(sales_per_category_query).show()

[Stage 6:>                                                          (0 + 4) / 4]

+--------------------+-------------+
|           categoria|count(vendas)|
+--------------------+-------------+
| Children's Clothing|       230469|
|      Sporting Goods|       229932|
|                 CDs|       230039|
|           Computers|       229059|
|Consumer Electronics|       229761|
|   Health and Beauty|       229667|
|        Pet Supplies|       229222|
|                DVDs|       230274|
|                Baby|       230293|
|              Crafts|       229749|
|    Women's Clothing|       230050|
|         Video Games|       230237|
|               Books|       229787|
|               Music|       230150|
|      Men's Clothing|       230430|
|             Cameras|       229320|
|              Garden|       230073|
|                Toys|       229964|
+--------------------+-------------+





In [7]:
# Garda o dataframe en HDFS en formato parquet, con particións para as diferentes tendas

In [7]:
# Write the dataframe as parquet, one tenda=<value> subdirectory per store,
# replacing any previous output at the same path.
# NOTE(review): the file:// scheme points at the local filesystem, not HDFS —
# confirm the intended destination.
df.write.parquet(
    "file:///home/hduser/Escritorio/tendas.parquet",
    mode="overwrite",
    partitionBy="tenda",
)

                                                                                

In [8]:
# Carga os datos da tenda de San Jose

In [8]:
# Load only the San Jose partition by pointing the reader at its directory.
# Note that this rebinds df: from here on it holds the San Jose rows only.
df = spark.read.format("parquet").load(
    "file:///home/hduser/Escritorio/tendas.parquet/tenda=San Jose/"
)

In [9]:
# Mostra os datos do dataframe

In [9]:
# Print the first rows of the San Jose dataframe as a text table
# (20 rows with long values truncated, i.e. the defaults spelled out).
df.show(n=20, truncate=True)

+----------+-----+-------------------+------+----------+
|     fecha| hora|          categoria|vendas|      pago|
+----------+-----+-------------------+------+----------+
|2012-05-26|14:03|   Women's Clothing|447.03|      Cash|
|2012-04-04|09:52|  Health and Beauty|360.42|  Discover|
|2012-05-26|14:11|Children's Clothing|121.58|  Discover|
|2012-04-04|09:53|  Health and Beauty| 51.65|      Visa|
|2012-05-26|14:15|     Sporting Goods|  7.11|      Amex|
|2012-04-04|10:00|     Men's Clothing|418.58|  Discover|
|2012-05-26|14:17|        Video Games|453.57|  Discover|
|2012-04-04|10:04|Children's Clothing|386.33|  Discover|
|2012-05-26|14:18|     Men's Clothing|366.41|MasterCard|
|2012-04-04|10:05|             Garden|476.96|      Visa|
|2012-05-26|14:24|  Health and Beauty| 211.2|      Amex|
|2012-04-04|10:05|                CDs|273.51|  Discover|
|2012-05-26|14:25|     Sporting Goods| 215.1|      Amex|
|2012-04-04|10:05|        Video Games|420.93|      Cash|
|2012-05-26|14:25|             

In [10]:
# Crea unha táboa sobre o dataframe dos datos de San Jose

In [10]:
# Register the current dataframe as a temporary SQL view so it can be queried by name.
view_name = 'sanjose'
df.createOrReplaceTempView(view_name)

In [11]:
# Mostra os datos da táboa

In [11]:
# Fetch every row of the 'sanjose' temporary view with SQL and print the first rows.
sanjose_rows = spark.sql('SELECT * FROM sanjose')
sanjose_rows.show()

+----------+-----+-------------------+------+----------+
|     fecha| hora|          categoria|vendas|      pago|
+----------+-----+-------------------+------+----------+
|2012-05-26|14:03|   Women's Clothing|447.03|      Cash|
|2012-04-04|09:52|  Health and Beauty|360.42|  Discover|
|2012-05-26|14:11|Children's Clothing|121.58|  Discover|
|2012-04-04|09:53|  Health and Beauty| 51.65|      Visa|
|2012-05-26|14:15|     Sporting Goods|  7.11|      Amex|
|2012-04-04|10:00|     Men's Clothing|418.58|  Discover|
|2012-05-26|14:17|        Video Games|453.57|  Discover|
|2012-04-04|10:04|Children's Clothing|386.33|  Discover|
|2012-05-26|14:18|     Men's Clothing|366.41|MasterCard|
|2012-04-04|10:05|             Garden|476.96|      Visa|
|2012-05-26|14:24|  Health and Beauty| 211.2|      Amex|
|2012-04-04|10:05|                CDs|273.51|  Discover|
|2012-05-26|14:25|     Sporting Goods| 215.1|      Amex|
|2012-04-04|10:05|        Video Games|420.93|      Cash|
|2012-05-26|14:25|             

In [12]:
# Consulta o total de vendas de San José por tipo de elementos (categoría)

In [12]:
# Number of purchase rows per category in the 'sanjose' view.
# NOTE(review): COUNT(vendas) counts rows; if a monetary total per category is
# what is wanted, the aggregate would need to be SUM(vendas) — confirm intent.
sanjose_per_category_query = 'SELECT categoria, COUNT(vendas) FROM sanjose GROUP BY categoria'
spark.sql(sanjose_per_category_query).show()

+--------------------+-------------+
|           categoria|count(vendas)|
+--------------------+-------------+
| Children's Clothing|         2173|
|      Sporting Goods|         2180|
|                 CDs|         2325|
|           Computers|         2264|
|Consumer Electronics|         2240|
|   Health and Beauty|         2233|
|        Pet Supplies|         2179|
|                DVDs|         2239|
|                Baby|         2268|
|              Crafts|         2208|
|    Women's Clothing|         2199|
|         Video Games|         2298|
|               Books|         2107|
|               Music|         2220|
|      Men's Clothing|         2229|
|              Garden|         2215|
|             Cameras|         2143|
|                Toys|         2178|
+--------------------+-------------+

