# SPARK CONTEXT Y SPARKSQL CONTEXT

In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')
sc

<pyspark.context.SparkContext at 0x7f79a16bf278>

In [2]:
sqlContext = pyspark.sql.SQLContext(sc)
sqlContext

<pyspark.sql.context.SQLContext at 0x7f796ce5feb8>

# CARGA DE DATASETS

In [3]:
winter = sqlContext.read.format("com.databricks.spark.csv").option("header", "true")\
                .option("inferschema", "true").option("mode","DROPMALFORMED").option("delimiter", ",")\
                .load("data/winter.csv")

In [4]:
summer = sqlContext.read.format("com.databricks.spark.csv").option("header", "true")\
                .option("inferschema", "true").option("mode","DROPMALFORMED").option("delimiter", ",")\
                .load("data/summer.csv")

In [100]:
countries = sqlContext.read.format("com.databricks.spark.csv").option("header", "true")\
                .option("inferschema", "true").option("mode","DROPMALFORMED").option("delimiter", ",")\
                .load("data/dictionary.csv")

In [6]:
winter.dtypes

[('Year', 'int'),
 ('City', 'string'),
 ('Sport', 'string'),
 ('Discipline', 'string'),
 ('Athlete', 'string'),
 ('Country', 'string'),
 ('Gender', 'string'),
 ('Event', 'string'),
 ('Medal', 'string')]

In [7]:
summer.dtypes

[('Year', 'int'),
 ('City', 'string'),
 ('Sport', 'string'),
 ('Discipline', 'string'),
 ('Athlete', 'string'),
 ('Country', 'string'),
 ('Gender', 'string'),
 ('Event', 'string'),
 ('Medal', 'string')]

In [101]:
countries.dtypes

[('Name', 'string'),
 ('Code', 'string'),
 ('Population', 'int'),
 ('GDP', 'double')]

In [12]:
print(winter.count())
print(summer.count())
print(countries.count())

5770
31165
201


In [58]:
from pyspark.sql.functions import lit
winter = winter.withColumn('type', lit(1))
summer = summer.withColumn('type', lit(2))

# UNIR LOS DOS DATAFRAMES

In [59]:
olympics = winter.union(summer)
olympics.count()

36935

In [60]:
olympics.show()

+----+--------+---------+----------+--------------------+-------+------+---------------+------+----+
|Year|    City|    Sport|Discipline|             Athlete|Country|Gender|          Event| Medal|type|
+----+--------+---------+----------+--------------------+-------+------+---------------+------+----+
|1924|Chamonix| Biathlon|  Biathlon|         BERTHET, G.|    FRA|   Men|Military Patrol|Bronze|   1|
|1924|Chamonix| Biathlon|  Biathlon|      MANDRILLON, C.|    FRA|   Men|Military Patrol|Bronze|   1|
|1924|Chamonix| Biathlon|  Biathlon| MANDRILLON, Maurice|    FRA|   Men|Military Patrol|Bronze|   1|
|1924|Chamonix| Biathlon|  Biathlon|     VANDELLE, André|    FRA|   Men|Military Patrol|Bronze|   1|
|1924|Chamonix| Biathlon|  Biathlon|AUFDENBLATTEN, Adolf|    SUI|   Men|Military Patrol|  Gold|   1|
|1924|Chamonix| Biathlon|  Biathlon|     JULEN, Alphonse|    SUI|   Men|Military Patrol|  Gold|   1|
|1924|Chamonix| Biathlon|  Biathlon|         JULEN, Ant.|    SUI|   Men|Military Patrol|  G

# ANALISIS DE LOS DATOS

# Obtener el total de medallas de Francia en los juegos Olímpicos

In [61]:
olympics.where("Country == 'FRA'").count()

1548

In [62]:
import pandas as pd
ol_pd = olympics.toPandas()

In [63]:
ol_pd.Sport.unique()

array(['Biathlon', 'Bobsleigh', 'Curling', 'Ice Hockey', 'Skating',
       'Skiing', 'Luge', 'Aquatics', 'Athletics', 'Cycling', 'Fencing',
       'Gymnastics', 'Shooting', 'Tennis', 'Weightlifting', 'Wrestling',
       'Archery', 'Basque Pelota', 'Cricket', 'Croquet', 'Equestrian',
       'Football', 'Golf', 'Polo', 'Rowing', 'Rugby', 'Sailing',
       'Tug of War', 'Boxing', 'Lacrosse', 'Roque', 'Hockey',
       'Jeu de paume', 'Rackets', 'Water Motorsports', 'Modern Pentathlon',
       'Basketball', 'Canoe / Kayak', 'Handball', 'Judo', 'Volleyball',
       'Table Tennis', 'Badminton', 'Baseball', 'Softball', 'Taekwondo',
       'Triathlon', 'Canoe'], dtype=object)

# Obtener el total de medallas de Rusia en los juegos Olímpicos separadas por metal

In [64]:
olympics.where("Country == 'RUS'").groupby('Medal').count().show()

+------+-----+
| Medal|count|
+------+-----+
|Silver|  328|
|  Gold|  333|
|Bronze|  370|
+------+-----+



# Obtener el podio masculino de la prueba de 1500M de Barcelona '92

In [65]:
olympics.filter('Year == 1992').filter("Sport == 'Athletics'")\
        .filter("Event == '1500M'").filter("Gender ='Men'").show()

+----+---------+---------+----------+--------------------+-------+------+-----+------+----+
|Year|     City|    Sport|Discipline|             Athlete|Country|Gender|Event| Medal|type|
+----+---------+---------+----------+--------------------+-------+------+-----+------+----+
|1992|Barcelona|Athletics| Athletics|SULAIMAN, Mohamed...|    QAT|   Men|1500M|Bronze|   2|
|1992|Barcelona|Athletics| Athletics|  CACHO RUIZ, Fermin|    ESP|   Men|1500M|  Gold|   2|
|1992|Barcelona|Athletics| Athletics|    EL BASIR, Rachid|    MAR|   Men|1500M|Silver|   2|
+----+---------+---------+----------+--------------------+-------+------+-----+------+----+



# Obtener el número de medallas obtenidas por las competidoras de USA en todas las modalidades de Natación en Londres 2012 por metales

In [66]:
olympics.filter('Year == 2012').filter("Sport == 'Aquatics'")\
        .filter("Gender ='Women'").filter("Country == 'USA'").groupby('Medal').count().show()

+------+-----+
| Medal|count|
+------+-----+
|Silver|    6|
|  Gold|   33|
|Bronze|    8|
+------+-----+



# Obtener el país con mayor número de medallas de cualquier tipo en juegos olímpicos de verano antes de 1980

In [69]:
olympics.filter('Year <= 1980').filter('type == 2').groupby('Country').count().sort('count',ascending=False).show()

+-------+-----+
|Country|count|
+-------+-----+
|    USA| 2498|
|    URS| 1755|
|    GBR| 1204|
|    FRA|  914|
|    SWE|  856|
|    ITA|  810|
|    HUN|  801|
|    GDR|  651|
|    GER|  520|
|    FIN|  412|
|    NOR|  389|
|    BEL|  377|
|    NED|  375|
|    POL|  359|
|    DEN|  359|
|    JPN|  357|
|    AUS|  313|
|    TCH|  311|
|    SUI|  309|
|    CAN|  308|
+-------+-----+
only showing top 20 rows



# Mostrar todas las medallas ganadas por la gimnasta rumana Nadia Comaneci

In [73]:
olympics.filter("Athlete == 'COMANECI, Nadia'").show()

+----+--------+----------+-----------+---------------+-------+------+--------------------+------+----+
|Year|    City|     Sport| Discipline|        Athlete|Country|Gender|               Event| Medal|type|
+----+--------+----------+-----------+---------------+-------+------+--------------------+------+----+
|1976|Montreal|Gymnastics|Artistic G.|COMANECI, Nadia|    ROU| Women|        Balance Beam|  Gold|   2|
|1976|Montreal|Gymnastics|Artistic G.|COMANECI, Nadia|    ROU| Women|     Floor Exercises|Bronze|   2|
|1976|Montreal|Gymnastics|Artistic G.|COMANECI, Nadia|    ROU| Women|Individual All-Round|  Gold|   2|
|1976|Montreal|Gymnastics|Artistic G.|COMANECI, Nadia|    ROU| Women|    Team Competition|Silver|   2|
|1976|Montreal|Gymnastics|Artistic G.|COMANECI, Nadia|    ROU| Women|         Uneven Bars|  Gold|   2|
|1980|  Moscow|Gymnastics|Artistic G.|COMANECI, Nadia|    ROU| Women|        Balance Beam|  Gold|   2|
|1980|  Moscow|Gymnastics|Artistic G.|COMANECI, Nadia|    ROU| Women|    

# Mostrar todas las medallas de oro ganadas por el atleta jamaicano Usain Bolt

In [76]:
olympics.filter("Athlete == 'BOLT, Usain'").filter("Medal == 'Gold'").show()

+----+-------+---------+----------+-----------+-------+------+------------+-----+----+
|Year|   City|    Sport|Discipline|    Athlete|Country|Gender|       Event|Medal|type|
+----+-------+---------+----------+-----------+-------+------+------------+-----+----+
|2008|Beijing|Athletics| Athletics|BOLT, Usain|    JAM|   Men|        100M| Gold|   2|
|2008|Beijing|Athletics| Athletics|BOLT, Usain|    JAM|   Men|        200M| Gold|   2|
|2008|Beijing|Athletics| Athletics|BOLT, Usain|    JAM|   Men|4X100M Relay| Gold|   2|
|2012| London|Athletics| Athletics|BOLT, Usain|    JAM|   Men|        100M| Gold|   2|
|2012| London|Athletics| Athletics|BOLT, Usain|    JAM|   Men|        200M| Gold|   2|
|2012| London|Athletics| Athletics|BOLT, Usain|    JAM|   Men|4X100M Relay| Gold|   2|
+----+-------+---------+----------+-----------+-------+------+------------+-----+----+



# OBTENER LOS PRIMEROS PAÍSES EN RATIO DE MEDALLAS/POBLACIÓN Y MEDALLAS/GDP

In [102]:
medals_by_country = olympics.groupby('Country').count()
medals_by_country.show()

+-------+-----+
|Country|count|
+-------+-----+
|    POL|  538|
|    JAM|  127|
|    BRA|  431|
|    ARM|   11|
|    MOZ|    2|
|    CUB|  410|
|    FRA| 1548|
|    ALG|   15|
|    BRN|    1|
|    VIE|    2|
|    BOT|    1|
|    EUA|  281|
|    RSA|  106|
|    ETH|   45|
|    ITA| 1488|
|    UKR|  184|
|    GHA|   16|
|    CMR|   23|
|    SCG|   14|
|    SEN|    1|
+-------+-----+
only showing top 20 rows



In [105]:
df = medals_by_country.join(countries, medals_by_country.Country == countries.Code)   
df = df.drop('Code').withColumnRenamed('count','total')

In [108]:
df.show()

+-------+-----+--------------------+----------+----------------+
|Country|total|                Name|Population|             GDP|
+-------+-----+--------------------+----------+----------------+
|    POL|  538|              Poland|  37999494|12554.5475536313|
|    JAM|  127|             Jamaica|   2725941|5232.02458271187|
|    BRA|  431|              Brazil| 207847528| 8538.5899749574|
|    ARM|   11|             Armenia|   3017712|3489.12768956995|
|    MOZ|    2|          Mozambique|  27977863|529.242556068943|
|    CUB|  410|                Cuba|  11389562|            null|
|    FRA| 1548|              France|  66808385|36205.5681017036|
|    ALG|   15|             Algeria|  39666519|4206.03123244958|
|    BRN|    1|             Bahrain|   1377237|22600.2140981035|
|    VIE|    2|             Vietnam|  91703800|2111.13802366815|
|    BOT|    1|            Botswana|   2262485|6360.13822018837|
|    RSA|  106|        South Africa|  54956920|5723.97335690212|
|    ETH|   45|          

In [109]:
df = df.withColumn('ratio_GDP', df.total/df.GDP).withColumn('ratio_Pop', df.total/df.Population)

In [113]:
df.dropna().sort('ratio_GDP',ascending=False).show()

+-------+-----+--------------+----------+----------------+--------------------+--------------------+
|Country|total|          Name|Population|             GDP|           ratio_GDP|           ratio_Pop|
+-------+-----+--------------+----------+----------------+--------------------+--------------------+
|    IND|  184|         India|1311050527|1598.25903421916| 0.11512526822030099|1.403454681651640...|
|    RUS| 1031|        Russia| 144096812|9092.58053606884| 0.11338915238750813|7.154911935178691E-6|
|    CHN|  889|         China|1371220000|8027.68381013907| 0.11074178069609335|6.483277665144907E-7|
|    USA| 5238| United States| 321418820|56115.7184261955| 0.09334283061686399|1.629649439942564...|
|    HUN| 1091|       Hungary|   9844686|12363.5434596539|  0.0882433101448847|1.108212085179761E-4|
|    UKR|  184|       Ukraine|  45198200|2114.95471628444| 0.08699949865747095|4.070958577996469E-6|
|    PAK|  121|      Pakistan| 188924874|1434.69666504969| 0.08433838521246527|6.4046622045

In [114]:
df.dropna().sort('ratio_Pop',ascending=False).show()

+-------+-----+--------------+----------+----------------+--------------------+--------------------+
|Country|total|          Name|Population|             GDP|           ratio_GDP|           ratio_Pop|
+-------+-----+--------------+----------+----------------+--------------------+--------------------+
|    NOR| 1011|        Norway|   5195921|74400.3697770928|0.013588642140207182|1.945757065975406...|
|    FIN|  890|       Finland|   5482013|42311.0362306446| 0.02103470109189621|1.623491224847515E-4|
|    SWE| 1477|        Sweden|   9798871|50579.6736486777| 0.02920145373533095|1.507316506156678...|
|    HUN| 1091|       Hungary|   9844686|12363.5434596539|  0.0882433101448847|1.108212085179761E-4|
|    DEN|  512|       Denmark|   5676002|51989.2934712354|0.009848181535363218|9.020433748966262E-5|
|    SUI|  665|   Switzerland|   8286976|80945.0792194742|0.008215446898222453|8.024640109974977E-5|
|    BAH|   27|       Bahamas|    388019|22817.2308572518|0.001183316247660...|6.9584221391