### Dataframes, Spark SQL e Parquet - practica

In [1]:
# Carga o ficheiro purchases.txt desde o sistema de ficheiros localabs

In [15]:
compras = spark.read.csv('/user/hduser/purchases.txt',sep='\t')
compras = compras.toDF('date','time','store','category','sales','mp')

In [2]:
# Crea unha táboa sobre o dataframe para poder realizar consultas SQL

In [17]:
compras.createOrReplaceTempView('purchases')

In [3]:
# Mostra os datos do dataframe

In [16]:
compras.show()

+----------+-----+--------------+--------------------+------+----------+
|      date| time|         store|            category| sales|        mp|
+----------+-----+--------------+--------------------+------+----------+
|2012-01-01|09:00|      San Jose|      Men's Clothing|214.05|      Amex|
|2012-01-01|09:00|    Fort Worth|    Women's Clothing|153.57|      Visa|
|2012-01-01|09:00|     San Diego|               Music| 66.08|      Cash|
|2012-01-01|09:00|    Pittsburgh|        Pet Supplies|493.51|  Discover|
|2012-01-01|09:00|         Omaha| Children's Clothing|235.63|MasterCard|
|2012-01-01|09:00|      Stockton|      Men's Clothing|247.18|MasterCard|
|2012-01-01|09:00|        Austin|             Cameras| 379.6|      Visa|
|2012-01-01|09:00|      New York|Consumer Electronics| 296.8|      Cash|
|2012-01-01|09:00|Corpus Christi|                Toys| 25.38|  Discover|
|2012-01-01|09:00|    Fort Worth|                Toys|213.88|      Visa|
|2012-01-01|09:00|     Las Vegas|         Video Gam

In [4]:
# Mostra os datos da táboa cunha consulta SQL

In [18]:
spark.sql('SELECT * FROM purchases').show()

+----------+-----+--------------+--------------------+------+----------+
|      date| time|         store|            category| sales|        mp|
+----------+-----+--------------+--------------------+------+----------+
|2012-01-01|09:00|      San Jose|      Men's Clothing|214.05|      Amex|
|2012-01-01|09:00|    Fort Worth|    Women's Clothing|153.57|      Visa|
|2012-01-01|09:00|     San Diego|               Music| 66.08|      Cash|
|2012-01-01|09:00|    Pittsburgh|        Pet Supplies|493.51|  Discover|
|2012-01-01|09:00|         Omaha| Children's Clothing|235.63|MasterCard|
|2012-01-01|09:00|      Stockton|      Men's Clothing|247.18|MasterCard|
|2012-01-01|09:00|        Austin|             Cameras| 379.6|      Visa|
|2012-01-01|09:00|      New York|Consumer Electronics| 296.8|      Cash|
|2012-01-01|09:00|Corpus Christi|                Toys| 25.38|  Discover|
|2012-01-01|09:00|    Fort Worth|                Toys|213.88|      Visa|
|2012-01-01|09:00|     Las Vegas|         Video Gam

In [5]:
# Conta o número de vendas por tenda

In [22]:
spark.sql('SELECT store,SUM(sales) FROM purchases GROUP BY store').show()



+---------------+--------------------+
|          store|          sum(sales)|
+---------------+--------------------+
|North Las Vegas|1.0029652509999933E7|
|        Phoenix|1.0079076700000003E7|
|          Omaha| 1.002664233999998E7|
|      Anchorage|   9933500.399999995|
|        Anaheim|1.0076416359999968E7|
|     Greensboro|1.0033781390000068E7|
|         Dallas|1.0066548450000022E7|
|        Oakland|   9947292.520000033|
|         Laredo|1.0144604979999993E7|
|     Scottsdale|1.0037929849999957E7|
|    San Antonio|1.0014441700000005E7|
|    Bakersfield|1.0031208919999965E7|
|        Raleigh|1.0061442539999988E7|
|    Chula Vista|   9974951.340000018|
|   Philadelphia|1.0190080259999951E7|
|     Louisville|1.0008566470000014E7|
|    Los Angeles|1.0084576799999982E7|
|       Chandler|   9919559.859999968|
|     Sacramento|1.0123468179999966E7|
|   Indianapolis|1.0090272770000013E7|
+---------------+--------------------+
only showing top 20 rows



                                                                                

In [6]:
# Fai un total de vendas por categoría

In [20]:
spark.sql('SELECT category,SUM(sales) FROM purchases GROUP BY category').show()



+--------------------+--------------------+
|            category|          sum(sales)|
+--------------------+--------------------+
| Children's Clothing| 5.762482093999994E7|
|      Sporting Goods| 5.759908588999996E7|
|                 CDs| 5.741075303999995E7|
|           Computers| 5.731540632000032E7|
|Consumer Electronics|5.7452374130000055E7|
|   Health and Beauty|5.7481589560001045E7|
|        Pet Supplies| 5.719725023999971E7|
|                DVDs| 5.764921214000037E7|
|                Baby|5.7491808440000996E7|
|              Crafts| 5.741815449999973E7|
|    Women's Clothing| 5.743444896999931E7|
|         Video Games| 5.751316558000001E7|
|               Books| 5.745075790999974E7|
|               Music| 5.749548970000029E7|
|      Men's Clothing| 5.762127904000029E7|
|             Cameras| 5.729904664000106E7|
|              Garden| 5.753983310999994E7|
|                Toys| 5.746347710999978E7|
+--------------------+--------------------+



                                                                                

In [7]:
# Garda o dataframe en HDFS en formato parquet, con particións para as diferentes tendas

In [36]:
compras.write.partitionBy("store").mode("overwrite").parquet("purchases2.parquet")

                                                                                

In [8]:
# Carga os datos da tenda de San Jose

In [37]:
df_parquet_partido = spark.read.parquet("purchases2.parquet/store=San Jose")

In [9]:
# Mostra os datos do dataframe

In [38]:
df_parquet_partido.show()

+----------+-----+--------------------+------+----------+
|      date| time|            category| sales|        mp|
+----------+-----+--------------------+------+----------+
|2012-04-16|09:22|                 CDs| 94.52|  Discover|
|2012-01-01|09:00|      Men's Clothing|214.05|      Amex|
|2012-04-16|09:29|Consumer Electronics|381.52|MasterCard|
|2012-01-01|09:00|    Women's Clothing|215.82|      Cash|
|2012-04-16|09:36|      Sporting Goods|145.74|      Amex|
|2012-01-01|09:09|                Toys|337.71|      Cash|
|2012-04-16|09:37|                DVDs|212.34|      Amex|
|2012-01-01|09:17|              Garden|192.82|      Cash|
|2012-04-16|09:51| Children's Clothing|105.58|MasterCard|
|2012-01-01|09:19|             Cameras| 95.81|      Cash|
|2012-04-16|09:53|           Computers|307.14|      Cash|
|2012-01-01|09:19|        Pet Supplies|253.33|  Discover|
|2012-04-16|09:55|                Baby| 141.5|      Visa|
|2012-01-01|09:20|           Computers| 160.6|      Amex|
|2012-04-16|09

In [10]:
# Crea unha táboa sobre o dataframe dos datos de San Jose

In [39]:
df_parquet_partido.createOrReplaceTempView('purchases_SanJose')

In [11]:
# Mostra os datos da táboa

In [41]:
spark.sql('SELECT * FROM purchases_SanJose').show()

+----------+-----+--------------------+------+----------+
|      date| time|            category| sales|        mp|
+----------+-----+--------------------+------+----------+
|2012-04-16|09:22|                 CDs| 94.52|  Discover|
|2012-01-01|09:00|      Men's Clothing|214.05|      Amex|
|2012-04-16|09:29|Consumer Electronics|381.52|MasterCard|
|2012-01-01|09:00|    Women's Clothing|215.82|      Cash|
|2012-04-16|09:36|      Sporting Goods|145.74|      Amex|
|2012-01-01|09:09|                Toys|337.71|      Cash|
|2012-04-16|09:37|                DVDs|212.34|      Amex|
|2012-01-01|09:17|              Garden|192.82|      Cash|
|2012-04-16|09:51| Children's Clothing|105.58|MasterCard|
|2012-01-01|09:19|             Cameras| 95.81|      Cash|
|2012-04-16|09:53|           Computers|307.14|      Cash|
|2012-01-01|09:19|        Pet Supplies|253.33|  Discover|
|2012-04-16|09:55|                Baby| 141.5|      Visa|
|2012-01-01|09:20|           Computers| 160.6|      Amex|
|2012-04-16|09

In [12]:
# Consulta o total de vendas de San José por tipo de elementos (categoría)

In [42]:
spark.sql('SELECT category,SUM(sales) FROM purchases_SanJose GROUP BY category').show()

+--------------------+------------------+
|            category|        sum(sales)|
+--------------------+------------------+
| Children's Clothing| 549454.2400000003|
|      Sporting Goods| 540678.4100000004|
|                 CDs| 586499.6699999999|
|           Computers| 561784.6999999997|
|Consumer Electronics| 556462.0700000002|
|   Health and Beauty| 553600.1900000004|
|        Pet Supplies| 540508.1199999999|
|                DVDs| 569417.1900000004|
|                Baby|         566853.52|
|              Crafts|         548947.01|
|    Women's Clothing| 540482.1599999997|
|         Video Games| 573047.9199999999|
|               Books|522989.79999999993|
|               Music| 550085.7599999993|
|      Men's Clothing| 551148.6200000005|
|              Garden| 559343.9099999995|
|             Cameras|         527568.34|
|                Toys|         537849.78|
+--------------------+------------------+

