# PySpark: Basic Functions Guide

## 1. Schemas

In [0]:
from pyspark.sql.types import StructType,StructField ,StringType, IntegerType
# Sample rows: (firstname, middlename, lastname, id, gender, salary).
# Empty strings stand in for missing name/id parts.
data = [
    ("pepe", "", "fernandez", "36636", "M", 3000),
    ("juanjo", "tebar", "", "40288", "M", 4000),
    ("sara", "", "jimenez", "42114", "M", 4000),
    ("Maria", "del Carmen", "ramirez", "39192", "F", 4000),
    ("jose", "Maria", "perez", "", "F", -1),
]

# Explicit schema: every column is nullable; only `salary` is numeric.
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

# Build the DataFrame from the in-memory rows, then show its schema and contents.
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)

## 2. Functions

In [0]:
# Load the exercise CSV, treating the first row as the header and letting
# Spark infer each column's type from the data.
dfSpark = spark.read.load(
    "/FileStore/tables/ejercicio1.csv",
    format="csv",
    sep=",",
    inferSchema=True,
    header=True,
)

# printSchema() prints the schema tree itself and returns None, so it is
# called directly instead of being wrapped in display().
dfSpark.printSchema()

In [0]:
# (row count, column count) of the loaded DataFrame.
(dfSpark.count(), len(dfSpark.columns))

In [0]:
# Project only the enrollee id and city columns.
display(dfSpark.select("enrollee_id", "city"))

enrollee_id,city
32403,city_41
9858,city_103
31806,city_21
27385,city_13
27724,city_103
217,city_23
21465,city_21
27302,city_160
12994,city_173
16287,city_21


In [0]:
# Keep only the rows with more than 3 training hours.
display(dfSpark.where(dfSpark.training_hours > 3))

enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
32403,city_41,0.8270000000000001,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21
9858,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98
31806,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15
27385,city_13,0.8270000000000001,Male,Has relevent experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39
27724,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72
217,city_23,0.899,Male,No relevent experience,Part time course,Masters,STEM,10,,,2,12
21465,city_21,0.624,,Has relevent experience,no_enrollment,Graduate,STEM,<1,100-500,Pvt Ltd,1,11
27302,city_160,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,>4,81
16287,city_21,0.624,Male,Has relevent experience,Full time course,Graduate,,3,50-99,Funded Startup,1,4
10856,city_103,0.92,Male,Has relevent experience,no_enrollment,Masters,Other,>20,,,>4,196


In [0]:
# Number of rows per gender value (null gender forms its own group).
display(dfSpark.groupby("gender").count())

gender,count
,508
Female,137
Other,24
Male,1460


In [0]:
# Average training hours per gender value.
display(dfSpark.groupBy("gender").agg({"training_hours": "avg"}))

gender,avg(training_hours)
,62.79330708661417
Female,61.76642335766423
Other,80.66666666666667
Male,65.78904109589041


In [0]:
# Sort all rows by gender in descending order.
display(dfSpark.sort(dfSpark.gender.desc()))

enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
15158,city_16,0.91,Other,Has relevent experience,no_enrollment,Graduate,Humanities,5,1000-4999,Pvt Ltd,>4,134
20910,city_71,0.884,Other,No relevent experience,Full time course,Graduate,STEM,2,,,1,68
17504,city_160,0.92,Other,No relevent experience,no_enrollment,Graduate,STEM,<1,,,never,3
27425,city_75,0.939,Other,Has relevent experience,no_enrollment,Graduate,STEM,16,,,never,34
5605,city_76,0.698,Other,Has relevent experience,no_enrollment,Graduate,STEM,9,10000+,Public Sector,3,194
4063,city_16,0.91,Other,No relevent experience,no_enrollment,Masters,STEM,>20,500-999,Other,>4,148
146,city_103,0.92,Other,Has relevent experience,no_enrollment,Graduate,STEM,9,50-99,Pvt Ltd,2,4
26576,city_103,0.92,Other,No relevent experience,no_enrollment,Graduate,STEM,4,500-999,Pvt Ltd,3,122
7148,city_160,0.92,Other,No relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Public Sector,>4,30
28251,city_165,0.903,Other,Has relevent experience,no_enrollment,Graduate,Humanities,3,<10,Pvt Ltd,1,33


In [0]:
from pyspark.sql.functions import countDistinct
# Count the distinct city values; the single result column is named "ciudad".
dfSpark.agg(countDistinct("city").alias("ciudad")).show()

In [0]:
# Append a derived column holding twice the training hours.
display(
    dfSpark.withColumn("training_hours_doble", dfSpark.training_hours * 2)
)

enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,training_hours_doble
32403,city_41,0.8270000000000001,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21,42
9858,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98,196
31806,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15,30
27385,city_13,0.8270000000000001,Male,Has relevent experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39,78
27724,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72,144
217,city_23,0.899,Male,No relevent experience,Part time course,Masters,STEM,10,,,2,12,24
21465,city_21,0.624,,Has relevent experience,no_enrollment,Graduate,STEM,<1,100-500,Pvt Ltd,1,11,22
27302,city_160,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,>4,81,162
12994,city_173,0.878,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,2,4
16287,city_21,0.624,Male,Has relevent experience,Full time course,Graduate,,3,50-99,Funded Startup,1,4,8
