In [1]:
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Exemplo Básico") \
    .getOrCreate()

In [2]:
spark

In [105]:
path = "df2.csv"

from pyspark.sql.types import (StringType, DoubleType, 
IntegerType, StructType, StructField, FloatType)

columns = StructType([
    StructField('index',IntegerType(),True),
    StructField('appname',StringType(),True),
    StructField('appid',StringType(),True),
    StructField('category',StringType(),True),
    StructField('rat_x',FloatType(),True),
    StructField('installs',StringType(),True),
    StructField('free',StringType(),True),
    StructField('review',StringType(),True),
    StructField('rat_y',FloatType(),True),
    StructField('score',StringType(),True),
    StructField('country',StringType(),True)

])

df = spark.read.csv(path, columns, sep='\t')

In [106]:
df

DataFrame[index: int, appname: string, appid: string, category: string, rat_x: float, installs: string, free: string, review: string, rat_y: float, score: string, country: string]

In [107]:
#1. Número total de valoracións
df.count()

                                                                                

3129137

In [108]:
#2. Número total de valoracións por categoría
ex2 = df.groupby('category').count()
ex2.show()



+---------------+------+
|       category| count|
+---------------+------+
|  Music & Audio|154928|
|      Education|241090|
|         Trivia| 11795|
|Auto & Vehicles| 18280|
|             cl|  4846|
|  Entertainment|151910|
|      Adventure| 23203|
|         Arcade| 60882|
|         Sports| 47483|
| Travel & Local| 67288|
|   Food & Drink| 74382|
|           null| 22904|
|   Role Playing| 10034|
|        Finance|137037|
|Personalization| 89277|
|         Racing| 10362|
|          Tools|170296|
|             ec|  3156|
|    Educational| 21308|
|             co|  3814|
+---------------+------+
only showing top 20 rows



                                                                                

In [110]:
#Número total de valoracións por categoría en LA

df2 = (df
               .where(df.country != 'null')
               .select(df.category)
               )
df2.groupby('category').count().show()



+--------------------+------+
|            category| count|
+--------------------+------+
|       Music & Audio|    23|
|       Entertainment| 12248|
|        Food & Drink|   407|
|             Finance| 65035|
|              Social|242328|
|            Shopping|  7432|
|        Productivity|  5549|
|           Lifestyle|  5793|
|              Casual| 16065|
|       Communication|212750|
|            Strategy|  8146|
|              Action|   332|
|              Arcade|  6348|
|               Tools| 24464|
|            Business| 25551|
|Video Players & E...| 57859|
|              Puzzle|   171|
|   Maps & Navigation|  1848|
|     Personalization|    66|
|    Health & Fitness|   254|
+--------------------+------+
only showing top 20 rows



                                                                                

In [111]:
#Que país comenta máis en LA

df2 = (df
               .where(df.country != 'null')
               .select(df.country)
               )

df3 = df2.groupby('country').count().show()



+-------+------+
|country| count|
+-------+------+
|     cl| 58538|
|     ec| 72915|
|     co| 81799|
|     pe|100664|
|     ar| 39328|
|     bo| 11265|
|     do| 56025|
|     mx|272600|
+-------+------+



                                                                                

In [124]:
#Cantas valoracións con máis de 4.5 hai?

df2 = (df
               .where(df.rat_y > 4.5)
               .select(df.country)
               )
df2.count()

                                                                                

277251

In [137]:
#Imprime a media das valoracións en LA por aplicación, co total delas

df2 = (df
               .where(df.country != 'null')
               .select(df.appname,df.rat_y)
               )

df2.groupby('appname').count().show()
df2.groupby('appname').avg().show()

                                                                                

+--------------------+------+
|             appname| count|
+--------------------+------+
|         HSBC México|  3010|
|  WhatsApp Messenger| 61930|
|    Mi Movistar Perú|  1250|
|           Messenger|318360|
|    Candy Crush Saga| 16065|
|              TikTok| 33072|
|                BNA+|   503|
|         Onboard BdP|   247|
|           Instagram| 80443|
|     Starbucks Chile|   407|
|Crash Bandicoot: ...|   193|
|      Walmart México|  3238|
|                Yape|   891|
|Bancolombia A la ...|   717|
|     Puntos Colombia|    19|
|          Cuenta DNI|  1038|
|           Pinterest|  5793|
|ZOL FM Republica ...|    23|
|    Cinemark Bolivia|    23|
|             Netflix|  1922|
+--------------------+------+
only showing top 20 rows





+--------------------+------------------+
|             appname|        avg(rat_y)|
+--------------------+------------------+
|         HSBC México|2.2883720930232556|
|  WhatsApp Messenger|3.5629258840626514|
|    Mi Movistar Perú|            2.0144|
|           Messenger|2.8544101017715793|
|    Candy Crush Saga| 4.662060379707438|
|              TikTok| 4.766448959845186|
|                BNA+|3.6481113320079523|
|         Onboard BdP|2.7611336032388665|
|           Instagram|  3.59048021580498|
|     Starbucks Chile|3.3144963144963144|
|Crash Bandicoot: ...| 4.632124352331606|
|      Walmart México|3.4014823965410748|
|                Yape|2.8226711560044895|
|Bancolombia A la ...|1.8744769874476988|
|     Puntos Colombia|2.0526315789473686|
|          Cuenta DNI| 2.674373795761079|
|           Pinterest| 4.638701881581219|
|ZOL FM Republica ...|3.5652173913043477|
|    Cinemark Bolivia|3.3043478260869565|
|             Netflix|2.5416233090530698|
+--------------------+------------

                                                                                

In [138]:
#Compara a media das valoracións mundiales coas de Latinoamérica
from pyspark.sql.functions import avg

df.select(avg('rat_x')).show()


df2 = (df
               .where(df.country != 'null')
               .select(df.rat_x)
               )

df2.select(avg('rat_x')).show()


                                                                                

+------------------+
|        avg(rat_x)|
+------------------+
|2.5163021553947864|
+------------------+





+------------------+
|        avg(rat_x)|
+------------------+
|3.5123932847232804|
+------------------+



                                                                                

In [179]:
#As tres aplicacións máis comentadas
from pyspark.sql.functions import *

df2 = (df
               .where(df.country != 'null')
               .select(df.appname,df.review)
               )

df3 = df2.groupby('appname').count()
df4 = df3.sort(desc('count')).collect()

for i in range(0,3,1):
    print(df4[i])



Row(appname='Messenger', count=318360)
Row(appname='Instagram', count=80443)
Row(appname='WhatsApp Messenger', count=61930)


                                                                                