In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql.types import *


# Obtain the notebook's Spark session (reusing an existing one if present),
# with the master set to local mode using two worker threads.
spark = (
    SparkSession.builder
    .master('local[2]')
    .getOrCreate()
)

In [47]:
# Load the raw (Bronze) CSV file into a DataFrame. No schema inference is
# requested, so every column is read as a string.
# Other supported data sources (e.g., json, orc, parquet) are described at
# https://spark.apache.org/docs/2.2.0/sql-programming-guide.html#data-sources

# Parents' knowledge of the online platforms used to cyberbully children, by region.
# The file is semicolon-delimited and its first row is used as the header.
data_file="hdfs://hdfs-nn:9000/user/Projeto TABD/Bronze/KnowledgeOfWhichTypesOnOnlinePlatformsAreUsedToCyberBullyChildrenAccordingToParentsWorldwide.csv"
ParentAwarenessOfCyberBullyingPlatforms = spark.read.csv(
    path=data_file,
    sep=";",
    header=True,
)

In [48]:
# Inspect the freshly loaded data before transforming it.
# The schema shows every column typed as string, plus two auto-named columns:
# `_c0` (holds the region names) and `_c8` (holds the unit text "in %").
# NOTE(review): presumably the CSV header has no name for those two columns - confirm against the file.
ParentAwarenessOfCyberBullyingPlatforms.printSchema()
# Plain-text preview of the rows.
ParentAwarenessOfCyberBullyingPlatforms.show()
# Collect to pandas so the notebook renders the table as the cell result.
ParentAwarenessOfCyberBullyingPlatforms.toPandas()

root
 |-- _c0: string (nullable = true)
 |-- Social networks: string (nullable = true)
 |-- Mobile: string (nullable = true)
 |-- Online messaging: string (nullable = true)
 |-- Online chat room: string (nullable = true)
 |-- E-mail: string (nullable = true)
 |-- Other websites: string (nullable = true)
 |-- Other forms of technoloy: string (nullable = true)
 |-- _c8: string (nullable = true)

+--------------------+---------------+------+----------------+----------------+------+--------------+------------------------+----+
|                 _c0|Social networks|Mobile|Online messaging|Online chat room|E-mail|Other websites|Other forms of technoloy| _c8|
+--------------------+---------------+------+----------------+----------------+------+--------------+------------------------+----+
|                APAC|             53|    46|              48|              45|    29|            10|                       3|in %|
|              Europe|             65|    40|              33|             

Unnamed: 0,_c0,Social networks,Mobile,Online messaging,Online chat room,E-mail,Other websites,Other forms of technoloy,_c8
0,APAC,53,46,48,45,29,10,3,in %
1,Europe,65,40,33,29,13,9,4,in %
2,LATAM,76,50,37,35,17,23,11,in %
3,Middle East / Africa,61,42,36,33,18,13,6,in %
4,North America,67,41,41,29,23,8,3,in %


In [50]:
# Rename every column to an identifier-safe name (no spaces or hyphens).
# `_c8` is not listed, so the select drops it; it only carried the unit
# text "in %" in the rows displayed by the inspection cell.
rename_exprs = [
    "_c0 as region",
    "`Social networks` as Social_Networks",
    "Mobile as Mobile",
    "`Online messaging` as Online_Messaging",
    "`Online chat room` as Online_chat_room",
    "`E-mail` as Email",
    "`Other websites` as Other_Websites",
    # The misspelling "technoloy" comes from the source header and is kept as-is.
    "`Other forms of technoloy` as Other_forms_of_technoloy",
]
ParentAwarenessOfCyberBullyingPlatforms = ParentAwarenessOfCyberBullyingPlatforms.selectExpr(*rename_exprs)
ParentAwarenessOfCyberBullyingPlatforms.toPandas()

Unnamed: 0,region,Social_Networks,Mobile,Online_Messaging,Online_chat_room,Email,Other_Websites,Other_forms_of_technoloy
0,APAC,53,46,48,45,29,10,3
1,Europe,65,40,33,29,13,9,4
2,LATAM,76,50,37,35,17,23,11
3,Middle East / Africa,61,42,36,33,18,13,6
4,North America,67,41,41,29,23,8,3


In [51]:
# Unpivot (wide -> long): each platform column becomes its own row, so every
# region yields one (platform, percentage_cyberbullying) pair per platform.
platform_columns = [
    "Social_Networks",
    "Mobile",
    "Online_Messaging",
    "Online_chat_room",
    "Email",
    "Other_Websites",
    "Other_forms_of_technoloy",  # spelling follows the existing column name
]

# stack(n, 'label1', `col1`, 'label2', `col2`, ...) emits n rows per input row.
# The count and the label/column pairs are both derived from the list above,
# so they cannot drift apart.
label_value_pairs = ", ".join(f"'{name}', `{name}`" for name in platform_columns)
unpivot_expr = (
    f"stack({len(platform_columns)}, {label_value_pairs})"
    " as (platform,percentage_cyberbullying)"
)

ParentAwarenessOfCyberBullyingPlatforms = ParentAwarenessOfCyberBullyingPlatforms.select(
    F.col("region"),
    F.expr(unpivot_expr),
)

ParentAwarenessOfCyberBullyingPlatforms.toPandas()

Unnamed: 0,region,platform,percentage_cyberbullying
0,APAC,Social_Networks,53
1,APAC,Mobile,46
2,APAC,Online_Messaging,48
3,APAC,Online_chat_room,45
4,APAC,Email,29
5,APAC,Other_Websites,10
6,APAC,Other_forms_of_technoloy,3
7,Europe,Social_Networks,65
8,Europe,Mobile,40
9,Europe,Online_Messaging,33


In [52]:
# The unpivoted values are still strings (the CSV was read without schema
# inference), so convert the percentage column to an integer type.
ParentAwarenessOfCyberBullyingPlatforms = ParentAwarenessOfCyberBullyingPlatforms.withColumn(
    "percentage_cyberbullying",
    F.col("percentage_cyberbullying").cast(IntegerType()),
)


In [53]:
# Verify the cast: percentage_cyberbullying should now be reported as integer,
# while region and platform remain strings.
ParentAwarenessOfCyberBullyingPlatforms.printSchema()

root
 |-- region: string (nullable = true)
 |-- platform: string (nullable = true)
 |-- percentage_cyberbullying: integer (nullable = true)



In [54]:
# Persist the long-format table as Parquet under the warehouse path.
# "overwrite" replaces whatever already exists at that location, so the
# cell can be re-run without failing on existing output.
(
    ParentAwarenessOfCyberBullyingPlatforms.write
    .mode("overwrite")
    .format("parquet")
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/P_ParentAwarenessOfCyberBullyingPlatforms/")
)