In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# The earthquake file is tab-delimited text with a header row. Column types
# are inferred by Spark, and only the six columns used later are kept.
eqDF = (
    spark.read
    .option('header', 'true')
    .option('delimiter', '\t')
    .option('inferSchema', 'true')
    .csv('datasets/EarthquakeDataset-latest.txt')
    .select('Date', 'Origin Time', 'Latitude', 'Longitude', 'xM', 'Location')
)
eqDF.show()

+----------+-----------+--------+---------+---+--------------------+
|      Date|Origin Time|Latitude|Longitude| xM|            Location|
+----------+-----------+--------+---------+---+--------------------+
|2022.02.28|09:44:21.35|  37.875|  26.9258|3.6|KUSADASI KORFEZI ...|
|2022.02.28|06:38:19.10|  35.592|  26.3002|4.1|             AKDENIZ|
|2022.02.27|05:04:26.08| 35.6017|    26.28|4.2|             AKDENIZ|
|2022.02.26|12:44:18.96| 35.0165|  26.8602|3.7|GIRIT ADASI ACIKL...|
|2022.02.25|01:03:59.78|  37.077|  28.5588|3.5|BALCILAR-KOYCEGIZ...|
|2022.02.24|14:50:35.13|  37.069|  28.3785|3.9|GOKOVA-ULA (MUGLA...|
|2022.02.24|09:51:32.59|  41.148|   34.438|3.8|HALILAR-KARGI (CO...|
|2022.02.20|20:20:10.99|  40.728|  27.4025|4.3|GUZELKOY ACIKLARI...|
|2022.02.20|19:47:40.71|  36.226|  27.4677|3.8|DATCA ACIKLARI-MU...|
|2022.02.19|07:29:38.26| 35.6007|  26.2162|3.9|             AKDENIZ|
|2022.02.19|04:21:05.49| 38.4295|  40.7145|3.6|SIGINAK-LICE (DIY...|
|2022.02.13|18:28:45.92| 41.0983| 

In [None]:
# Register the earthquake frame as a temporary view so it can be queried
# by name from Spark SQL.
eqDF.createOrReplaceTempView('earthquake')

# Keep only the rows whose xM value is 7 or higher
# (xM is presumably the magnitude column -- confirm against the dataset docs).
bigEarthquakesDF = spark.sql(
    'select * from earthquake where xM>=7'
)
bigEarthquakesDF.show()

+----------+-----------+--------+---------+---+--------------------+
|      Date|Origin Time|Latitude|Longitude| xM|            Location|
+----------+-----------+--------+---------+---+--------------------+
|2011.10.23|10:41:21.01| 38.7212|   43.411|7.2|YEMLICE- (VAN) [N...|
|1999.11.12|16:57:20.80|   40.74|    31.21|7.2|UGUR- (DUZCE) [No...|
|1999.08.17|00:01:37.60|   40.76|    29.97|7.4|BASISKELE (KOCAEL...|
|1976.11.24|12:22:16.00|   39.05|    44.04|7.5|YENIYAKA-CALDIRAN...|
|1970.03.28|21:02:23.50|   39.21|    29.51|7.0|KIZIK-CAVDARHISAR...|
|1964.10.06|14:31:23.00|    40.3|    28.23|7.0|OKCULAR-KARACABEY...|
|1957.05.26|06:33:35.10|   40.67|     31.0|7.1|GUZELDERE-GOLYAKA...|
|1957.04.25|02:25:45.10|   36.42|    28.68|7.1|             AKDENIZ|
|1953.03.18|19:06:16.10|   39.99|    27.36|7.2|SOGUCAK-YENICE (Ç...|
|1948.02.09|12:58:18.60|   35.41|     27.2|7.2|             AKDENIZ|
|1944.02.01|03:22:39.90| 40.7871|  31.8723|7.2|    ULUMESCIT-(BOLU)|
|1943.11.26|22:20:40.80|   41.05| 

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType

# getOrCreate() returns the session that is already running, if there is one.
spark = SparkSession.builder.getOrCreate()

# The file is read without a header option, so the columns arrive with
# Spark's positional default names (_c0, _c1, ...).
worldDF = (
    spark.read
    .option('inferSchema', 'true')
    .csv('datasets/world.txt')
)

# Map each positional column name to a meaningful one, applied in order.
renamedColumns = {
    '_c0': 'Country',
    '_c1': 'Continent',
    '_c2': 'Capital',
    '_c3': 'Population',
}
for positionalName, readableName in renamedColumns.items():
    worldDF = worldDF.withColumnRenamed(positionalName, readableName)

# Make the renamed frame queryable from Spark SQL as 'world'.
worldDF.createOrReplaceTempView('world')
worldDF.show()

+-------------------+-------------+----------------+----------+
|            Country|    Continent|         Capital|Population|
+-------------------+-------------+----------------+----------+
|        Afghanistan|         Asia|           Kabul|  29863010|
|            Albania|       Europe|          Tirana|   3129678|
|            Algeria|       Africa|         Algiers|  32853800|
|            Andorra|       Europe|Andorra la Vella|     67151|
|             Angola|       Africa|          Luanda|  15941390|
|Antigua and Barbuda|North America|      St. John's|     81479|
|          Argentina|South America|    Buenos Aires|  38747150|
|            Armenia|         Asia|         Yerevan|   3016312|
|          Australia|       Oceana|        Canberra|  20155130|
|            Austria|       Europe|          Vienna|   8189444|
|         Azerbaijan|         Asia|            Baku|   8410801|
|            Bahamas|North America|          Nassau|    323063|
|            Bahrain|         Asia|     

In [None]:
# Total population per continent, computed with a SQL aggregate over the
# 'world' temporary view.
continentDf = spark.sql(
    'select Continent,sum(Population) as Total from world group by Continent'
)
continentDf.show()

+-------------+----------+
|    Continent|     Total|
+-------------+----------+
|       Europe| 662292883|
|       Africa| 908406750|
|North America| 511265886|
|South America| 374997087|
|       Oceana|  32018861|
|         Asia|3982554409|
+-------------+----------+



In [None]:
# Count the countries in each continent with SQL, then order the result
# through the DataFrame API (largest count first) before displaying it.
(
    spark
    .sql('select Continent,count(*) as NumCountries from world group by Continent')
    .sort('NumCountries', ascending=False)
    .show()
)

+-------------+------------+
|    Continent|NumCountries|
+-------------+------------+
|       Africa|          55|
|         Asia|          49|
|       Europe|          46|
|North America|          23|
|       Oceana|          13|
|South America|          12|
+-------------+------------+



In [None]:
# Same per-continent country count, but with the descending ordering
# expressed inside the SQL statement itself.
(
    spark
    .sql('select Continent,count(*) as NumCountries from world group by Continent order by NumCountries desc')
    .show()
)

+-------------+------------+
|    Continent|NumCountries|
+-------------+------------+
|       Africa|          55|
|         Asia|          49|
|       Europe|          46|
|North America|          23|
|       Oceana|          13|
|South America|          12|
+-------------+------------+

