## Functions: select(), groupBy()

## Types

In [18]:
# Initialize findspark first, then import pyspark.
# The order is intentional: findspark.init() runs before the pyspark import.

import findspark

findspark.init()

import pyspark

In [19]:
# Build the SparkSession used by the rest of the notebook
# (getOrCreate() returns the existing session if there already is one).

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("CSV to Dataset")
    .getOrCreate()
)

In [20]:
# Create a DataFrame from a CSV file using the csv() method of DataFrameReader.
# The first line of the file is used as the header; no schema inference is
# requested, so every column is read as a string (see the schema printed below).

csv_path = 'C:/Users/usuario/Documents/Blanca/Spark/data/airport_codes.csv'
reader = spark.read.option('header', 'True')
data = reader.csv(csv_path)
data.printSchema()

root
 |-- ident: string (nullable = true)
 |-- category: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude_deg: string (nullable = true)
 |-- longitude_deg: string (nullable = true)
 |-- elevation_ft: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)



### select()

In [21]:
# Select a single column and display its first 10 rows

names = data.select('name')
names.show(n=10)

+--------------------+
|                name|
+--------------------+
|   Total Rf Heliport|
|        Lowell Field|
|        Epps Airpark|
|Newport Hospital ...|
|      Cordes Airport|
|Goldstone /Gts/ A...|
|          Cass Field|
| Grass Patch Airport|
|  Ringhaver Heliport|
|   River Oak Airport|
+--------------------+
only showing top 10 rows



In [22]:
# Select two or more columns by passing each column name to select()

selected_columns = ['ident', 'name', 'category']
data.select(*selected_columns).show(n=10)

+-----+--------------------+-------------+
|ident|                name|     category|
+-----+--------------------+-------------+
|  00A|   Total Rf Heliport|     heliport|
| 00AK|        Lowell Field|small_airport|
| 00AL|        Epps Airpark|small_airport|
| 00AR|Newport Hospital ...|     heliport|
| 00AZ|      Cordes Airport|small_airport|
| 00CA|Goldstone /Gts/ A...|small_airport|
| 00CO|          Cass Field|small_airport|
| 00FA| Grass Patch Airport|small_airport|
| 00FD|  Ringhaver Heliport|     heliport|
| 00FL|   River Oak Airport|small_airport|
+-----+--------------------+-------------+
only showing top 10 rows



### groupBy()

In [8]:
# Group by category and region, then count the rows in each group.
# truncate=False prints the full cell values instead of shortening them.

counts_by_category_region = data.groupBy('category', 'iso_region').count()
counts_by_category_region.show(truncate=False)

+--------------+----------+-----+
|category      |iso_region|count|
+--------------+----------+-----+
|small_airport |BG-09     |1    |
|small_airport |BF-YAT    |2    |
|heliport      |NO-18     |1    |
|small_airport |SE-M      |9    |
|small_airport |LV-TA     |1    |
|small_airport |LT-TE     |2    |
|small_airport |ZW-MC     |7    |
|small_airport |GN-NZ     |1    |
|small_airport |EG-DK     |1    |
|large_airport |ID-KI     |1    |
|small_airport |IR-28     |2    |
|heliport      |JO-AZ     |1    |
|heliport      |ES-GA     |1    |
|small_airport |ES-PM     |2    |
|medium_airport|FR-Q      |3    |
|medium_airport|MX-BCS    |4    |
|small_airport |CR-SJ     |2    |
|small_airport |SV-SS     |2    |
|medium_airport|CU-02     |2    |
|small_airport |AF-FYB    |1    |
+--------------+----------+-----+
only showing top 20 rows



In [9]:
# Count rows per category and list the categories in alphabetical order
(
    data
    .groupBy('category')
    .count()
    .orderBy('category')
    .show(truncate=False)
)

+--------------+-----+
|category      |count|
+--------------+-----+
|balloonport   |17   |
|closed        |1429 |
|heliport      |8985 |
|large_airport |566  |
|medium_airport|4531 |
|seaplane_base |909  |
|small_airport |29798|
+--------------+-----+



### Change column type

In [13]:
# Column type: DataFrame.dtypes returns a list of (column name, type) pairs.
# The previous attribute name `dtypesColun` was a typo and raised AttributeError.

data.select('elevation_ft').dtypes

[('elevation_ft', 'string')]

In [17]:
# Convert elevation_ft from string to integer.
# withColumn() with an existing column name replaces that column, so the
# result is kept in a new DataFrame (data_new) and `data` is left unchanged.

from pyspark.sql.types import IntegerType

elevation_as_int = data['elevation_ft'].cast(IntegerType())
data_new = data.withColumn('elevation_ft', elevation_as_int)
data_new.printSchema()


root
 |-- ident: string (nullable = true)
 |-- category: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude_deg: string (nullable = true)
 |-- longitude_deg: string (nullable = true)
 |-- elevation_ft: integer (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)

