In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [2]:
spark = SparkSession.builder.appName('GDP').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/09 10:39:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.csv('countries of the world.csv', inferSchema = True, header=True)

In [4]:
df.show(5)

+---------------+--------------------+----------+--------------+--------------------------+----------------------------+-------------+----------------------------------+------------------+------------+-----------------+----------+---------+---------+-------+---------+---------+-----------+--------+-------+
|        Country|              Region|Population|Area (sq. mi.)|Pop. Density (per sq. mi.)|Coastline (coast/area ratio)|Net migration|Infant mortality (per 1000 births)|GDP ($ per capita)|Literacy (%)|Phones (per 1000)|Arable (%)|Crops (%)|Other (%)|Climate|Birthrate|Deathrate|Agriculture|Industry|Service|
+---------------+--------------------+----------+--------------+--------------------------+----------------------------+-------------+----------------------------------+------------------+------------+-----------------+----------+---------+---------+-------+---------+---------+-----------+--------+-------+
|   Afghanistan |ASIA (EX. NEAR EA...|  31056997|        647500|            

In [5]:
# Schema and basic details
df.printSchema()
df.describe().show()

root
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Population: integer (nullable = true)
 |-- Area (sq. mi.): integer (nullable = true)
 |-- Pop. Density (per sq. mi.): string (nullable = true)
 |-- Coastline (coast/area ratio): string (nullable = true)
 |-- Net migration: string (nullable = true)
 |-- Infant mortality (per 1000 births): string (nullable = true)
 |-- GDP ($ per capita): integer (nullable = true)
 |-- Literacy (%): string (nullable = true)
 |-- Phones (per 1000): string (nullable = true)
 |-- Arable (%): string (nullable = true)
 |-- Crops (%): string (nullable = true)
 |-- Other (%): string (nullable = true)
 |-- Climate: string (nullable = true)
 |-- Birthrate: string (nullable = true)
 |-- Deathrate: string (nullable = true)
 |-- Agriculture: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Service: string (nullable = true)



                                                                                

+-------+------------+--------------------+--------------------+------------------+--------------------------+----------------------------+--------------------+----------------------------------+------------------+------------+-----------------+------------------+------------------+------------------+------------------+------------------+---------+-----------+--------+-------+
|summary|     Country|              Region|          Population|    Area (sq. mi.)|Pop. Density (per sq. mi.)|Coastline (coast/area ratio)|       Net migration|Infant mortality (per 1000 births)|GDP ($ per capita)|Literacy (%)|Phones (per 1000)|        Arable (%)|         Crops (%)|         Other (%)|           Climate|         Birthrate|Deathrate|Agriculture|Industry|Service|
+-------+------------+--------------------+--------------------+------------------+--------------------------+----------------------------+--------------------+----------------------------------+------------------+------------+-------------

In [6]:
df.na.drop(subset='Phones (per 1000)').count()
df.count()

227

In [7]:
df.select(['Country', 'Phones (per 1000)']).show(5)

+---------------+-----------------+
|        Country|Phones (per 1000)|
+---------------+-----------------+
|   Afghanistan |              3,2|
|       Albania |             71,2|
|       Algeria |             78,1|
|American Samoa |            259,5|
|       Andorra |            497,2|
+---------------+-----------------+
only showing top 5 rows



In [8]:
# Select the distinct Region
df.select(['Region']).distinct().show()

+--------------------+
|              Region|
+--------------------+
|BALTICS          ...|
|C.W. OF IND. STATES |
|ASIA (EX. NEAR EA...|
|WESTERN EUROPE   ...|
|NORTHERN AMERICA ...|
|NEAR EAST        ...|
|EASTERN EUROPE   ...|
|OCEANIA          ...|
|SUB-SAHARAN AFRIC...|
|NORTHERN AFRICA  ...|
|LATIN AMER. & CAR...|
+--------------------+



In [9]:
# Sum of Population by Region
df.groupBy('Region').count().show()

+--------------------+-----+
|              Region|count|
+--------------------+-----+
|BALTICS          ...|    3|
|C.W. OF IND. STATES |   12|
|ASIA (EX. NEAR EA...|   28|
|WESTERN EUROPE   ...|   28|
|NORTHERN AMERICA ...|    5|
|NEAR EAST        ...|   16|
|EASTERN EUROPE   ...|   12|
|OCEANIA          ...|   21|
|SUB-SAHARAN AFRIC...|   51|
|NORTHERN AFRICA  ...|    6|
|LATIN AMER. & CAR...|   45|
+--------------------+-----+



In [10]:
groupCols = ['Region']
gdpByRegion = df.groupBy(groupCols).agg(F.sum('Population').alias('Total population'), 
                                        F.avg('Population').alias('Average Population'), 
                                        F.avg('GDP ($ per capita)').alias('Avg Gdp per capita'))
df2 = gdpByRegion.orderBy(['Avg Gdp per capita', 'Total population'], ascending=[0, 1])
df2.show()

+--------------------+----------------+--------------------+------------------+
|              Region|Total population|  Average Population|Avg Gdp per capita|
+--------------------+----------------+--------------------+------------------+
|WESTERN EUROPE   ...|       396339998| 1.415499992857143E7|27046.428571428572|
|NORTHERN AMERICA ...|       331672307|        6.63344614E7|           26100.0|
|BALTICS          ...|         7184974|  2394991.3333333335|           11300.0|
|NEAR EAST        ...|       195068377|     1.21917735625E7|          10456.25|
|EASTERN EUROPE   ...|       119914717|   9992893.083333334| 9808.333333333334|
|LATIN AMER. & CAR...|       561824599| 1.248499108888889E7| 8682.222222222223|
|OCEANIA          ...|        33131662|  1577698.1904761905| 8247.619047619048|
|ASIA (EX. NEAR EA...|      3687982236|1.3171365128571428E8| 8053.571428571428|
|NORTHERN AFRICA  ...|       161407133|2.6901188833333332E7|            5460.0|
|C.W. OF IND. STATES |       280081548| 

In [19]:

# df3 = df2.withColumn("lit_value2", when(col("Salary") >=40000 & col("Salary") <= 50000,lit("100")).otherwise(lit("200")))
# df3.show(truncate=False)


avg_world_population = df.agg(F.mean('Population')).collect()[0][0]

df2.withColumn('If_col', F.when(F.col('Total Population') > avg_world_population, F.lit('More than World Average')).otherwise(F.lit('Less than world average'))).show()

+--------------------+----------------+--------------------+------------------+--------------------+
|              Region|Total population|  Average Population|Avg Gdp per capita|              If_col|
+--------------------+----------------+--------------------+------------------+--------------------+
|WESTERN EUROPE   ...|       396339998| 1.415499992857143E7|27046.428571428572|More than World A...|
|NORTHERN AMERICA ...|       331672307|        6.63344614E7|           26100.0|More than World A...|
|BALTICS          ...|         7184974|  2394991.3333333335|           11300.0|Less than world a...|
|NEAR EAST        ...|       195068377|     1.21917735625E7|          10456.25|More than World A...|
|EASTERN EUROPE   ...|       119914717|   9992893.083333334| 9808.333333333334|More than World A...|
|LATIN AMER. & CAR...|       561824599| 1.248499108888889E7| 8682.222222222223|More than World A...|
|OCEANIA          ...|        33131662|  1577698.1904761905| 8247.619047619048|More than Wo