In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql.types import *

# Start (or reuse) a Spark session that runs locally with two worker threads.
spark = (
    SparkSession.builder
    .master('local[2]')
    .getOrCreate()
)

In [2]:


# Load the Bronze-layer CSV. The file is semicolon-delimited and its first
# line holds the column names. No schema is supplied and inference is not
# requested, so every column is read as a string.
socialnetworkusersinselectedcountries = (
    spark.read
    .option("header", True)
    .option("sep", ";")
    .csv("hdfs://hdfs-nn:9000/user/Projeto TABD/Bronze/SocialNetworkUsersInSelectedCountriesIn2020And2025.csv")
)


In [3]:
# Inspect the freshly loaded frame three ways:
#   1. printSchema() - column names and types (all strings at this point),
#   2. show()        - plain-text preview of the rows,
#   3. toPandas()    - collects the rows to the driver so the notebook renders
#                      them as a pandas table (fine here; the dataset is tiny).
socialnetworkusersinselectedcountries.printSchema()
socialnetworkusersinselectedcountries.show()
socialnetworkusersinselectedcountries.toPandas()

root
 |-- Countries: string (nullable = true)
 |-- 2020: string (nullable = true)
 |-- 2025*: string (nullable = true)

+----------------+------+--------+
|       Countries|  2020|   2025*|
+----------------+------+--------+
|China (mainland)|926,84|1 135,13|
|           India|349,97|   490,3|
|       Indonesia|198,96|  256,11|
|   United States|223,03|  243,42|
|          Brazil|141,45|  157,85|
|      Bangladesh| 58,77|    99,3|
|          Mexico| 80,88|   95,22|
|         Vietnam| 73,56|   93,68|
|     Philippines| 79,58|   90,04|
|           Japan| 86,06|   89,08|
|          Russia| 72,81|   75,48|
|          Turkey| 54,34|   71,44|
|        Thailand| 52,72|   61,77|
|  United Kingdom| 48,63|   50,89|
|     South Korea| 44,47|   45,53|
|         Germany| 44,48|   45,41|
|         Nigeria| 28,15|   44,63|
|          France| 36,92|   38,51|
|           Italy| 34,02|   36,09|
|          Canada| 25,35|   32,07|
+----------------+------+--------+



Unnamed: 0,Countries,2020,2025*
0,China (mainland),92684,"1 135,13"
1,India,34997,4903
2,Indonesia,19896,25611
3,United States,22303,24342
4,Brazil,14145,15785
5,Bangladesh,5877,993
6,Mexico,8088,9522
7,Vietnam,7356,9368
8,Philippines,7958,9004
9,Japan,8606,8908


In [4]:
# Give the three source columns query-friendly names. The numeric-looking
# headers are backtick-quoted so they are resolved as column names.
socialnetworkusersinselectedcountries = socialnetworkusersinselectedcountries.select(
    col("Countries").alias("country"),
    col("`2020`").alias("users2020"),
    col("`2025*`").alias("users2025"),
)

In [5]:
# Check the renamed columns (country, users2020, users2025).
socialnetworkusersinselectedcountries.toPandas()

Unnamed: 0,country,users2020,users2025
0,China (mainland),92684,"1 135,13"
1,India,34997,4903
2,Indonesia,19896,25611
3,United States,22303,24342
4,Brazil,14145,15785
5,Bangladesh,5877,993
6,Mexico,8088,9522
7,Vietnam,7356,9368
8,Philippines,7958,9004
9,Japan,8606,8908


In [6]:
# Unpivot (wide -> long): each country row becomes two rows, one per year
# column. `year` temporarily holds the source column label
# ('users2020' / 'users2025') and `social_media_users` holds its value.
socialnetworkusersinselectedcountries = socialnetworkusersinselectedcountries.selectExpr(
    "country",
    "stack(2, 'users2020', `users2020`, 'users2025', `users2025`) as (year,social_media_users)",
)

In [7]:
# Check the long-format result: one row per (country, year label).
socialnetworkusersinselectedcountries.toPandas()

Unnamed: 0,country,year,social_media_users
0,China (mainland),users2020,92684
1,China (mainland),users2025,"1 135,13"
2,India,users2020,34997
3,India,users2025,4903
4,Indonesia,users2020,19896
5,Indonesia,users2025,25611
6,United States,users2020,22303
7,United States,users2025,24342
8,Brazil,users2020,14145
9,Brazil,users2025,15785


In [8]:
# Normalise the two unpivoted columns in a single chained pass.
#
#   year               'users2020' / 'users2025' -> 2020 / 2025 (integer).
#   social_media_users European-formatted text such as '1 135,13' -> 1135.13:
#                      strip the thousands separator, then swap the decimal
#                      comma for a dot, then cast.
#
# The thousands separator is matched as any whitespace plus the no-break
# (U+00A0) and narrow no-break (U+202F) spaces. Matching only an ASCII space
# would leave those separators in place and the cast would silently yield null.
#
# FloatType is kept so the written schema stays the same; being single
# precision, a value like 926.84 is displayed as 926.840027.
socialnetworkusersinselectedcountries = (
    socialnetworkusersinselectedcountries
    .withColumn(
        "year",
        # Drop the 'users' prefix left over from the unpivot labels.
        f.regexp_replace(col("year"), "^users", "").cast(IntegerType()),
    )
    .withColumn(
        "social_media_users",
        f.regexp_replace(
            f.regexp_replace(col("social_media_users"), r"[\s\u00A0\u202F]", ""),
            ",",
            ".",
        ).cast(FloatType()),
    )
)


In [9]:
# Confirm that year and social_media_users now render as numeric values.
socialnetworkusersinselectedcountries.toPandas()

Unnamed: 0,country,year,social_media_users
0,China (mainland),2020,926.840027
1,China (mainland),2025,1135.130005
2,India,2020,349.970001
3,India,2025,490.299988
4,Indonesia,2020,198.960007
5,Indonesia,2025,256.109985
6,United States,2020,223.029999
7,United States,2025,243.419998
8,Brazil,2020,141.449997
9,Brazil,2025,157.850006


In [10]:
# Verify the final column types (string, integer, float) before writing.
socialnetworkusersinselectedcountries.printSchema()

root
 |-- country: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- social_media_users: float (nullable = true)



In [11]:
# Persist the cleaned table as Parquet in the warehouse location, replacing
# whatever an earlier run left there.
(
    socialnetworkusersinselectedcountries
    .write
    .mode("overwrite")
    .parquet("hdfs://hdfs-nn:9000/warehouse/tabd.db/SocialNetworkUsersInSelectedCountriesIn2020And2025/")
)