**Instalar as dependências e configurar a sessão**

In [4]:
# Install the dependencies: a headless Java 8 JDK, the Spark 2.4.4 binaries
# (the build for Hadoop 2.7) and findspark.
# apt-get runs with -qq and its stdout is discarded to keep the cell output clean.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark tarball from the Apache archive (-q silences wget's output).
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
# Unpack the tarball in the current working directory.
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
# findspark is used later in the notebook to make pyspark importable.
!pip install -q findspark

In [5]:
# Configure the environment variables Spark relies on: the location of the
# JVM and the directory where the Spark distribution was unpacked.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# Make pyspark importable.
# Pass the absolute SPARK_HOME set above rather than a relative directory
# name, so this cell does not depend on the notebook's current working
# directory and the Spark path is written in exactly one place.
import findspark
findspark.init(os.environ["SPARK_HOME"])

**Coding**

In [9]:
# Start (or reuse) a local Spark session; 'local[*]' runs Spark in-process.
# NOTE: despite its name, `sc` holds a SparkSession, not a SparkContext.
from pyspark.sql import SparkSession

sc = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

In [12]:
# Load the cities CSV using the reader's default options.
df_spark = sc.read.csv(path="/content/worldcities.csv")

# Print each column's name and data type.
df_spark.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)



In [13]:
# Preview the first rows; the file was read without header=True, so the CSV's
# header line shows up as an ordinary data row under the _c0.._c10 columns.
df_spark.show()

+------------+------------+--------+--------+-------------+----+----+--------------------+-------+----------+----------+
|         _c0|         _c1|     _c2|     _c3|          _c4| _c5| _c6|                 _c7|    _c8|       _c9|      _c10|
+------------+------------+--------+--------+-------------+----+----+--------------------+-------+----------+----------+
|        city|  city_ascii|     lat|     lng|      country|iso2|iso3|          admin_name|capital|population|        id|
|       Tokyo|       Tokyo| 35.6897|139.6922|        Japan|  JP| JPN|               Tōkyō|primary|  37977000|1392685764|
|     Jakarta|     Jakarta| -6.2146|106.8451|    Indonesia|  ID| IDN|             Jakarta|primary|  34540000|1360771077|
|       Delhi|       Delhi| 28.6600| 77.2300|        India|  IN| IND|               Delhi|  admin|  29617000|1356872604|
|      Mumbai|      Mumbai| 18.9667| 72.8333|        India|  IN| IND|         Mahārāshtra|  admin|  23355000|1356226629|
|      Manila|      Manila| 14.6

Usando `header=True` para definir corretamente os nomes das colunas a partir da primeira linha do arquivo

In [14]:
# Reload the CSV, this time treating its first line as the column names.
df_spark1 = sc.read.csv(
    path="/content/worldcities.csv",
    header=True,
)

In [15]:
# Preview the first rows; the column names now come from the file's header line.
df_spark1.show()

+------------+------------+--------+--------+-------------+----+----+--------------------+-------+----------+----------+
|        city|  city_ascii|     lat|     lng|      country|iso2|iso3|          admin_name|capital|population|        id|
+------------+------------+--------+--------+-------------+----+----+--------------------+-------+----------+----------+
|       Tokyo|       Tokyo| 35.6897|139.6922|        Japan|  JP| JPN|               Tōkyō|primary|  37977000|1392685764|
|     Jakarta|     Jakarta| -6.2146|106.8451|    Indonesia|  ID| IDN|             Jakarta|primary|  34540000|1360771077|
|       Delhi|       Delhi| 28.6600| 77.2300|        India|  IN| IND|               Delhi|  admin|  29617000|1356872604|
|      Mumbai|      Mumbai| 18.9667| 72.8333|        India|  IN| IND|         Mahārāshtra|  admin|  23355000|1356226629|
|      Manila|      Manila| 14.6000|120.9833|  Philippines|  PH| PHL|              Manila|primary|  23088000|1608618140|
|    Shanghai|    Shanghai| 31.1

In [19]:
# Display the DataFrame's repr, which lists every column together with its type.
df_spark1

DataFrame[city: string, city_ascii: string, lat: string, lng: string, country: string, iso2: string, iso3: string, admin_name: string, capital: string, population: string, id: string]

Como se pode ver acima, todas as colunas são do tipo string; vamos corrigir isso usando `inferSchema=True`

In [18]:
# Reload the CSV with the header line as column names and with type inference
# enabled, so numeric columns are no longer read as strings.
df_spark2 = sc.read.csv(
    "/content/worldcities.csv",
    header=True,
    inferSchema=True,
)

In [20]:
# Display the repr to check the column types obtained with inferSchema=True.
df_spark2

DataFrame[city: string, city_ascii: string, lat: double, lng: double, country: string, iso2: string, iso3: string, admin_name: string, capital: string, population: double, id: int]

In [17]:
# Preview the first rows of the DataFrame that was loaded with inferSchema=True.
df_spark2.show()

+------------+------------+--------+--------+-------------+----+----+--------------------+-------+----------+----------+
|        city|  city_ascii|     lat|     lng|      country|iso2|iso3|          admin_name|capital|population|        id|
+------------+------------+--------+--------+-------------+----+----+--------------------+-------+----------+----------+
|       Tokyo|       Tokyo| 35.6897|139.6922|        Japan|  JP| JPN|               Tōkyō|primary|  3.7977E7|1392685764|
|     Jakarta|     Jakarta| -6.2146|106.8451|    Indonesia|  ID| IDN|             Jakarta|primary|   3.454E7|1360771077|
|       Delhi|       Delhi|   28.66|   77.23|        India|  IN| IND|               Delhi|  admin|  2.9617E7|1356872604|
|      Mumbai|      Mumbai| 18.9667| 72.8333|        India|  IN| IND|         Mahārāshtra|  admin|  2.3355E7|1356226629|
|      Manila|      Manila|    14.6|120.9833|  Philippines|  PH| PHL|              Manila|primary|  2.3088E7|1608618140|
|    Shanghai|    Shanghai| 31.1

In [22]:
# Print the schema as a tree to confirm the inferred column types.
df_spark2.printSchema()

root
 |-- city: string (nullable = true)
 |-- city_ascii: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- country: string (nullable = true)
 |-- iso2: string (nullable = true)
 |-- iso3: string (nullable = true)
 |-- admin_name: string (nullable = true)
 |-- capital: string (nullable = true)
 |-- population: double (nullable = true)
 |-- id: integer (nullable = true)



In [21]:
# Check the Python type of the object returned by the CSV reader.
type(df_spark2)

pyspark.sql.dataframe.DataFrame