## Vamos a crear un dataframe y abrir la sesion

In [2]:
import findspark 
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark: SparkSession = SparkSession.builder \
    .appName('ProyectoFinal_SparkSQL') \
    .config('spark.sql.repl.eagerEval.enabled', True) \
    .config('spark.sql.repl.eagerEval.maxNumRows', 8) \
    .getOrCreate()

In [5]:
spark

In [7]:
# Descargamos el .csv del link proporcionado y tras subir a spark el .csv que acabamos de descargar

## Leer datos

In [6]:
path = 'file:////home/training/Datos/'

In [7]:
df1 = spark.read.csv(path + "Covid", header = True)
df1

                                                                                

dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
14/12/2020,14,12,2020,746,6,Afghanistan,AF,AFG,38041757,Asia,9.01377925
13/12/2020,13,12,2020,298,9,Afghanistan,AF,AFG,38041757,Asia,7.05277624
12/12/2020,12,12,2020,113,11,Afghanistan,AF,AFG,38041757,Asia,6.86876792
11/12/2020,11,12,2020,63,10,Afghanistan,AF,AFG,38041757,Asia,7.13426564
10/12/2020,10,12,2020,202,16,Afghanistan,AF,AFG,38041757,Asia,6.96865815
09/12/2020,9,12,2020,135,13,Afghanistan,AF,AFG,38041757,Asia,6.96340077
08/12/2020,8,12,2020,200,6,Afghanistan,AF,AFG,38041757,Asia,7.09483529
07/12/2020,7,12,2020,210,26,Afghanistan,AF,AFG,38041757,Asia,7.21575505


In [10]:
df1.show(vertical = True, truncate = False)

-RECORD 0-----------------------------------------------------------------
 dateRep                                                    | 14/12/2020  
 day                                                        | 14          
 month                                                      | 12          
 year                                                       | 2020        
 cases                                                      | 746         
 deaths                                                     | 6           
 countriesAndTerritories                                    | Afghanistan 
 geoId                                                      | AF          
 countryterritoryCode                                       | AFG         
 popData2019                                                | 38041757    
 continentExp                                               | Asia        
 Cumulative_number_for_14_days_of_COVID-19_cases_per_100000 | 9.01377925  
-RECORD 1----------------

## ¿Cuántas muertes se produjeron en Afghanistán??

In [20]:
from pyspark.sql.functions import max, count

In [47]:
df1.select('countriesAndTerritories').filter(df1['countriesAndTerritories'] == 'Afghanistan').groupBy('countriesAndTerritories').agg(count("*").alias('total_deaths'))

                                                                                

countriesAndTerritories,total_deaths
Afghanistan,340


## ¿En qué paises se dieron casos de Covid el dia 5 de diciembre de 2020?

In [51]:
df1.select('countriesAndTerritories', 'day', 'month', 'year').filter(df1['day'] == 5).filter(df1['month'] == 12).filter(df1['year'] == 2020)

countriesAndTerritories,day,month,year
Afghanistan,5,12,2020
Albania,5,12,2020
Algeria,5,12,2020
Andorra,5,12,2020
Angola,5,12,2020
Anguilla,5,12,2020
Antigua_and_Barbuda,5,12,2020
Argentina,5,12,2020


## ¿En qué paises se dieron mas muertes por Covid (ordenado de mayor a menor)?

In [54]:
df1.createOrReplaceTempView('PaisesAfectados')

In [86]:
df1 = spark.sql("""
    select countriesAndTerritories as country, sum(deaths) as total_deaths
    from PaisesAfectados
    group by countriesAndTerritories
    order by total_deaths desc
""")
df1

                                                                                

country,total_deaths
United_States_of_...,299177.0
Brazil,181402.0
India,143355.0
Mexico,113953.0
Italy,64520.0
United_Kingdom,64170.0
France,57911.0
Iran,52196.0


### Guardar un Dataframe en Data lake en formato Parquet para leerlo de nuevo

In [87]:
df1.write.format('parquet').mode('overwrite') \
    .save("file:////home/training/output_Data/proyecto")
df1

                                                                                

country,total_deaths
United_States_of_...,299177.0
Brazil,181402.0
India,143355.0
Mexico,113953.0
Italy,64520.0
United_Kingdom,64170.0
France,57911.0
Iran,52196.0
