In [2]:
!pip install findspark
!pip install pyspark
!pip install spark



In [3]:
import pyspark
import findspark
import spark

findspark.init()

from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

In [51]:
avg_speed = spark.read.option('header','true').option('inferSchema','true').csv('avg_speed.csv')
prices = spark.read.option('header','true').option('inferSchema','true').csv('prices_2022.csv')
users = spark.read.option('header','true').option('inferSchema','true').csv('users.csv')

In [52]:
def limpiar_columnas(prices):
  nor_simbolos = ['__','.', '(', ')']
  prices_columns = prices.columns
  prices_col_Nsim = [column.lower().replace(' ','_').replace('–','') for column in prices_columns]

  prices_clean = []

  for columna in prices_col_Nsim:
    if columna.endswith('_'):
      new_wrod = columna[:len(columna)-1]
    else:
      new_wrod = columna

    for simbolo in nor_simbolos:
      if simbolo in columna:
        new_wrod = new_wrod.replace(simbolo, '')

    prices_clean.append(new_wrod)

  return prices_clean

In [53]:
avg_speed.printSchema()
speedDF = avg_speed.toDF(*limpiar_columnas(avg_speed))
speedDF.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Avg : double (nullable = true)

root
 |-- country: string (nullable = true)
 |-- avg: double (nullable = true)



In [54]:
pricesDF = prices.toDF(*limpiar_columnas(prices))
pricesDF.printSchema()

root
 |-- country_code: string (nullable = true)
 |-- name: string (nullable = true)
 |-- continental_region: string (nullable = true)
 |-- no_of_internet_plans: integer (nullable = true)
 |-- average_price_of_1gb_usd: string (nullable = true)
 |-- cheapest_1gb_for_30_days_usd: string (nullable = true)
 |-- most_expensive_1gb_usd: string (nullable = true)
 |-- average_price_of_1gb_usdat_the_start_of_2021: string (nullable = true)
 |-- average_price_of_1gb_usdat_start_of_2020: string (nullable = true)



In [55]:
users.printSchema()
userDF = users.toDF(*limpiar_columnas(users))

'''@udf(returnType=IntegerType())
def del_comma(s):
  return int(s.replace(',',''))

userDF = userDF.select(
    'country_or_area',
    'subregion',
    'region',
    del_comma('internet_users').alias('internet_users'),
    del_comma('population').alias('population')
)'''

root
 |-- Country or area: string (nullable = true)
 |-- Subregion: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Internet users: string (nullable = true)
 |-- Population: string (nullable = true)



"@udf(returnType=IntegerType())\ndef del_comma(s):\n  return int(s.replace(',',''))\n\nuserDF = userDF.select(\n    'country_or_area', \n    'subregion', \n    'region', \n    del_comma('internet_users').alias('internet_users'), \n    del_comma('population').alias('population')\n)"

In [46]:
'''
Determine los cinco países con mayor número de usuarios de Internet en la región de América.
La salida debe contener el nombre del país, la región, la subregión y la cantidad de usuarios de Internet.
'''

userDF.select(
    'country_or_area','subregion','region',
    regexp_replace(col('internet_users'), ',', '').cast('int').alias('internet_users')
).select('country_or_area','subregion','region', 'internet_users').\
filter(col('region') == 'Americas').\
orderBy(desc('internet_users')).\
limit(5).\
show()

+---------------+----------------+--------+--------------+
|country_or_area|       subregion|  region|internet_users|
+---------------+----------------+--------+--------------+
|  United States|Northern America|Americas|     312320000|
|         Brazil|   South America|Americas|     160010801|
|         Mexico|Northern America|Americas|      92010000|
|         Canada|Northern America|Americas|      33950632|
|      Argentina|   South America|Americas|      33561876|
+---------------+----------------+--------+--------------+



In [47]:
'''
Obtenga el top tres de las regiones con más usuarios de internet.
'''

userDF.groupBy('region').agg(sum('internet_users').alias('usuarios')).orderBy(desc('usuarios')).limit(3).show()

+--------+----------+
|  region|  usuarios|
+--------+----------+
|    Asia|2992777999|
|Americas| 779914800|
|  Europe| 712066624|
+--------+----------+



In [49]:
from pyspark.sql.window import Window
'''
Obtenga el país con más usuarios de Internet por región y subregión
Por ejemplo, el resultado para la región de las Américas y la subregión Norte América debería ser Estados Unidos.
La salida debe contener el nombre del país con más usuarios de Internet, la región, la subregión y la cantidad de usuarios de Internet.
Además, la salida debe estar ordenada de mayor a menor atendiendo a la cantidad de usuarios de Internet de cada país.
'''

userXregionXsregion = Window.partitionBy('subregion', 'subregion').orderBy(desc('internet_users'))

top_paises = userDF.withColumn('rn', row_number().over(userXregionXsregion)).\
select(
    'country_or_area',
    'region',
    'subregion',
    'internet_users',
).\
filter(col('rn') == 1).\
orderBy(desc('internet_users'))

In [151]:
!pip install avro

Collecting avro
  Downloading avro-1.11.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m61.4/85.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.8/85.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: avro
  Building wheel for avro (pyproject.toml) ... [?25l[?25hdone
  Created wheel for avro: filename=avro-1.11.2-py2.py3-none-any.whl size=119738 sha256=dd66329c8545f05e0e5f520c86f0c1a723ddbe340ec958cd8aef52112d0bdddf
  Stored in directory: /root/.cache/pip/wheels/1f/8b/f2/c2659be6a948e76dd2f89adad8ae4541710dce63ac9969177a
Successfully built avro
In

In [None]:
top_paises.repartition(3).write.mode('overwrite').partitionBy('region').parquet('FileStore/ProyectoFinal/salida')

In [57]:
userDF.join(speedDF, speedDF['country'] == userDF['country_or_area']).\
select(
    'country',
    'region',
    'subregion',
    'population',
    regexp_replace(col('internet_users'), ',', '').cast('int').alias('internet_users'),
    'avg'
).\
select(
    'country',
    'region',
    'subregion',
    'population',
    'internet_users',
    'avg'
).\
filter(col('population').isNotNull()).\
filter(col('internet_users').isNotNull()).\
filter(col('avg').isNotNull()).\
orderBy(desc('avg')).\
limit(10).\
show()

+--------------------+------+---------------+----------+--------------+------+
|             country|region|      subregion|population|internet_users|   avg|
+--------------------+------+---------------+----------+--------------+------+
|United Arab Emirates|  Asia|   Western Asia| 9,630,959|       8913217|135.35|
|              Norway|Europe|Northern Europe| 5,337,962|       5120225|134.73|
|               Qatar|  Asia|   Western Asia| 2,781,682|       2532059|120.69|
|         South Korea|  Asia|   Eastern Asia|51,171,706|      49421084|117.95|
|         Netherlands|Europe| Western Europe|17,059,560|      15877494|108.33|
|             Denmark|Europe|Northern Europe| 5,752,126|       5407278|105.65|
|        Saudi Arabia|  Asia|   Western Asia|33,702,756|      27048861|102.79|
|              Kuwait|  Asia|   Western Asia| 4,137,312|       4053797| 96.23|
|            Bulgaria|Europe| Eastern Europe| 7,051,608|       4492326| 87.51|
|             Croatia|Europe|Southern Europe| 4,156,

In [195]:
pricesDF.printSchema()

root
 |-- country_code: string (nullable = true)
 |-- name: string (nullable = true)
 |-- continental_region: string (nullable = true)
 |-- no_of_internet_plans: integer (nullable = true)
 |-- average_price_of_1gb_usd: string (nullable = true)
 |-- cheapest_1gb_for_30_days_usd: string (nullable = true)
 |-- most_expensive_1gb_usd: string (nullable = true)
 |-- average_price_of_1gb_usdat_the_start_of_2021: string (nullable = true)
 |-- average_price_of_1gb_usdat_start_of_2020: string (nullable = true)



In [58]:
pricesDF.join(userDF, userDF.country_or_area == pricesDF.name).\
select(
    'region',
    regexp_extract('average_price_of_1gb_usdat_the_start_of_2021', r'([0-9]+).([0-9]+)', 0).alias('costo_prom_1_gb')
).\
select(
    'region',
    'costo_prom_1_gb',
    when(col('region').startswith('A').cast(StringType()) == 'true', 'region_a').\
    when(col('region').startswith('E').cast(StringType()) == 'true', 'region_e').\
    otherwise('region_por_defecto').\
    alias('grupo_region')
).\
orderBy(desc('costo_prom_1_gb')).\
limit(10).\
show()

+--------+---------------+------------------+
|  region|costo_prom_1_gb|      grupo_region|
+--------+---------------+------------------+
|Americas|           9.56|          region_a|
|  Europe|           9.54|          region_e|
|Americas|           9.32|          region_a|
|  Africa|           8.81|          region_a|
| Oceania|           8.53|region_por_defecto|
|  Europe|           8.38|          region_e|
|Americas|           8.33|          region_a|
|  Africa|           8.25|          region_a|
|Americas|           8.00|          region_a|
|  Europe|           7.95|          region_e|
+--------+---------------+------------------+

