In [1]:
import os
import findspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import Column as c
from pyspark.sql.window import Window


findspark.init()
findspark.find()

os.environ["PYSPARK_DRIVER_PYTHON"] = "python3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "pyspark-shell"

# Start session
http://localhost:4040

In [2]:
sp = SparkSession.builder.master('local[*]').config("spark.driver.memory", "15g").appName('spark_test_tables').getOrCreate()

In [3]:
sp.conf.set("spark.sql.adaptive.enabled",True)
sp.conf.set("spark.sql.adaptive.skewJoin.enabled",True)
sp.conf.set("spark.sql.adaptive.coalescePartitions.enabled",True)

# Preprocessing
Create spark datasets from all CSV and JSON files

In [4]:
# Open CSV file in spark dataframe
def spark_csv_open(file):
    path = "data/" + file
    
    # open file
    df = sp.read.format("csv") \
            .option("mode", "FAILFAST") \
            .option("inferSchema", "true") \
            .option("header","true") \
            .option("escapeQuotes", "true") \
            .option("path", path) \
            .load()
    
    # return spark dataframe
    return df

# Open JSON file with format {key:value} in pandas dataframe (key = 'country_code'), transform to spark dataframe
def spark_json_open(file,colname):
    path = "data/" + file
    
    # open file - pandas dataframe
    sf = pd.read_json(path, typ='series')
    df = pd.DataFrame({'country_code':sf.index, colname:sf.values})

    # create spark dataframe from pandas dataframe
    df = sp.createDataFrame(df)
    
    # return spark dataframe
    return df


# Create spark datasets from files
column1 = 'country_code'
df_nobel_orig = spark_csv_open('nobel_upd.csv')
df_countries_orig = spark_csv_open('countries of the world.csv')
df_cities_orig = spark_csv_open('worldcitiespop.csv')
df_continents_orig = spark_json_open('continent.json','continent_code')
df_names_orig = spark_json_open('names.json','country_name')

Use the result of analysis done for the 'snowflake' dataframes.<br>
Prepare Countries, Cities, Organizations information.

In [5]:
# Count of records in the original Nobel Laureates dataset
df_nobel_orig.count()

969

## Countries
Update country names for join country information

In [6]:
# Fix the names in the Countries dataset
df_countries_orig = df_countries_orig.withColumn("Country", \
                                   f.when(f.trim(f.col("Country")) == 'Congo, Repub. of the', 'Republic of the Congo') \
                                    .when(f.trim(f.col("Country")) == 'Korea, North', 'North Korea') \
                                    .when(f.trim(f.col("Country")) == 'Congo, Dem. Rep.', 'Democratic Republic of the Congo') \
                                    .when(f.trim(f.col("Country")) == 'Burma', 'Myanmar') \
                                    .when(f.trim(f.col("Country")) == 'Central African Rep.', 'Central African Republic') \
                                    .when(f.trim(f.col("Country")) == 'Macau', 'Macao') \
                                    .when(f.trim(f.col("Country")) == 'Micronesia, Fed. St.', 'Micronesia') \
                                    .when(f.trim(f.col("Country")) == 'Virgin Islands', 'U.S. Virgin Islands') \
                                    .when(f.trim(f.col("Country")) == 'N. Mariana Islands', 'Northern Mariana Islands') \
                                    .when(f.trim(f.col("Country")) == 'Gambia, The', 'Gambia') \
                                    .when(f.trim(f.col("Country")) == 'Cote d\'Ivoire', 'Ivory Coast') \
                                    .when(f.trim(f.col("Country")) == 'St Pierre & Miquelon', 'Saint Pierre and Miquelon') \
                                    .when(f.trim(f.col("Country")) == 'British Virgin Is.', 'British Virgin Islands') \
                                    .when(f.trim(f.col("Country")) == 'Korea, South', 'South Korea') \
                                    .when(f.trim(f.col("Country")) == 'Bahamas, The', 'Bahamas') \
                                    .when(f.trim(f.col("Country")) == 'Trinidad & Tobago', 'Trinidad and Tobago') \
                                    .when(f.trim(f.col("Country")) == 'Bosnia & Herzegovina', 'Bosnia and Herzegovina') \
                                    .when(f.trim(f.col("Country")) == 'Antigua & Barbuda', 'Antigua and Barbuda') \
                                    .when(f.trim(f.col("Country")) == 'Sao Tome & Principe', 'Sao Tome and Principe') \
                                    .when(f.trim(f.col("Country")) == 'Saint Kitts & Nevis', 'Saint Kitts and Nevis') \
                                    .when(f.trim(f.col("Country")) == 'Turks & Caicos Is', 'Turks and Caicos Islands') \
                                    .when(f.trim(f.col("Country")) == 'United States', 'United States of America') \
                                    .when(f.trim(f.col("Country")) == 'Guadeloupe', 'Guadeloupe Island') \
                                    .otherwise(f.col("Country")))

# Fix the names in the ISO2 dataset
df_names_orig = df_names_orig.withColumn("country_name", \
                                        f.when(f.trim(f.col("country_name")) == 'United States', 'United States of America') \
                                         .when(f.trim(f.col("country_name")) == 'Guadeloupe', 'Guadeloupe Island') \
                                         .otherwise(f.col("country_name")))

# Join ISO2, Countries and Continents dataset - for adding continents
join_expr = f.trim(df_countries_orig["Country"]) == f.trim(df_names_orig["country_name"])
df_countries_orig = df_countries_orig.join(df_names_orig, join_expr, "left_outer") \
                                     .withColumnRenamed("country_code", "cc") \
                                     .drop("country_name")

join_expr = f.trim(df_countries_orig["cc"]) == f.trim(df_continents_orig["country_code"])
df_countries_orig = df_countries_orig.join(df_continents_orig, join_expr, "left_outer") \
                                     .withColumn("continent_name",  f.when(f.col("continent_code") == 'NA', 'NORTH AMERICA') \
                                        .when(f.col("continent_code") == 'SA', 'SOUTH AMERICA') \
                                        .when(f.col("continent_code") == 'AS', 'ASIA') \
                                        .when(f.col("continent_code") == 'AN', 'ANTARCTICA') \
                                        .when(f.col("continent_code") == 'OC', 'AUSTRALIA/OCEANIA') \
                                        .when(f.col("continent_code") == 'EU', 'EUROPE') \
                                        .when(f.col("continent_code") == 'AF', 'AFRICA') \
                                        .otherwise('')) \
                                     .withColumn("country_code", f.lower(f.col("country_code"))) \
                                     .drop("continent_code", "cc")

# Fix Country names in the Nobel Prizes dataset
df_nobel_orig = df_nobel_orig.withColumn("Birth Country", \
                                 f.when(f.col("Birth Country") == 'W&uuml;rttemberg (Germany)', 'Württemberg (Germany)') \
                                  .otherwise(f.col("Birth Country"))) \
                             .withColumn("Organization Country", \
                                 f.when(f.col("Organization Country") == 'United States', 'United States of America') \
                                  .otherwise(f.col("Organization Country")))

## Parent Countries
Create a dataframe with matches 'Country Name' - 'Parent Country Name'.<br>
If a city changed a country, specify the historical country.

In [7]:
data = [{"c_name": "British Protectorate of Palestine (Israel)", "p_c_name": "Israel"},
{"c_name": "Czechoslovakia (Czech Republic)", "p_c_name": "Czech Republic"},
{"c_name": "Russian Empire (Latvia)", "p_c_name": "Latvia"},
{"c_name": "Trinidad", "p_c_name": "Trinidad and Tobago"},
{"c_name": "Austria-Hungary (Poland)", "p_c_name": "Poland"},
{"c_name": "Schleswig (Germany)", "p_c_name": "Germany"},
{"c_name": "Russian Empire (Poland)", "p_c_name": "Poland"},
{"c_name": "British India (India)", "p_c_name": "India"},
{"c_name": "British West Indies (Saint Lucia)", "p_c_name": "Saint Lucia"},
{"c_name": "Tuscany (Italy)", "p_c_name": "Italy"},
{"c_name": "Bosnia (Bosnia and Herzegovina)", "p_c_name": "Bosnia and Herzegovina"},
{"c_name": "Austria-Hungary (Ukraine)", "p_c_name": "Ukraine"},
{"c_name": "Russian Empire (Lithuania)", "p_c_name": "Lithuania"},
{"c_name": "Austria-Hungary (Slovenia)", "p_c_name": "Slovenia"},
{"c_name": "Austria-Hungary (Hungary)", "p_c_name": "Hungary"},
{"c_name": "German-occupied Poland (Poland)", "p_c_name": "Poland"},
{"c_name": "Austria-Hungary (Bosnia and Herzegovina)", "p_c_name": "Bosnia and Herzegovina"},
{"c_name": "Faroe Islands (Denmark)", "p_c_name": "Denmark"},
{"c_name": "Austrian Empire (Czech Republic)", "p_c_name": "Czech Republic"},
{"c_name": "Korea (South Korea)", "p_c_name": "South Korea"},
{"c_name": "French Algeria (Algeria)", "p_c_name": "Algeria"},
{"c_name": "Ottoman Empire (Turkey)", "p_c_name": "Turkey"},
{"c_name": "West Germany (Germany)", "p_c_name": "Germany"},
{"c_name": "Austrian Empire (Italy)", "p_c_name": "Italy"},
{"c_name": "Prussia (Germany)", "p_c_name": "Germany"},
{"c_name": "Russian Empire (Ukraine)", "p_c_name": "Ukraine"},
{"c_name": "British Mandate of Palestine (Israel)", "p_c_name": "Israel"},
{"c_name": "Mecklenburg (Germany)", "p_c_name": "Germany"},
{"c_name": "Persia (Iran)", "p_c_name": "Iran"},
{"c_name": "Crete (Greece)", "p_c_name": "Greece"},
{"c_name": "Gold Coast (Ghana)", "p_c_name": "Ghana"},
{"c_name": "Tibet (People's Republic of China)", "p_c_name": "China"},
{"c_name": "Austria-Hungary (Czech Republic)", "p_c_name": "Czech Republic"},
{"c_name": "Prussia (Poland)", "p_c_name": "Poland"},
{"c_name": "Russian Empire (Belarus)", "p_c_name": "Belarus"},
{"c_name": "Russian Empire (Azerbaijan)", "p_c_name": "Azerbaijan"},
{"c_name": "Union of Soviet Socialist Republics (Belarus)", "p_c_name": "Belarus"},
{"c_name": "Southern Rhodesia (Zimbabwe)", "p_c_name": "Zimbabwe"},
{"c_name": "Burma (Myanmar)", "p_c_name": "Myanmar"},
{"c_name": "Russian Empire (Russia)", "p_c_name": "Russia"},
{"c_name": "Hesse-Kassel (Germany)", "p_c_name": "Germany"},
{"c_name": "Austria-Hungary (Croatia)", "p_c_name": "Croatia"},
{"c_name": "Württemberg (Germany)", "p_c_name": "Germany"},
{"c_name": "Free City of Danzig (Poland)", "p_c_name": "Poland"},
{"c_name": "Prussia (Russia)", "p_c_name": "Russia"},
{"c_name": "British India (Bangladesh)", "p_c_name": "Bangladesh"},
{"c_name": "Ottoman Empire (Republic of Macedonia)", "p_c_name": "Macedonia"},
{"c_name": "Austria-Hungary (Austria)", "p_c_name": "Austria"},
{"c_name": "East Friesland (Germany)", "p_c_name": "Germany"},
{"c_name": "Union of Soviet Socialist Republics (Russia)", "p_c_name": "Russia"},
{"c_name": "Java, Dutch East Indies (Indonesia)", "p_c_name": "Indonesia"},
{"c_name": "Russian Empire (Finland)", "p_c_name": "Finland"},
{"c_name": "Scotland", "p_c_name": "United Kingdom"},
{"c_name": "Northern Ireland", "p_c_name": "United Kingdom"},
{"c_name": "Bavaria (Germany)", "p_c_name": "Germany"},
{"c_name": "Austrian Empire (Austria)", "p_c_name": "Austria"},
{"c_name": "Union of Soviet Socialist Republics", "p_c_name": "Russia"},
{"c_name": "Northern Rhodesia (Zambia)", "p_c_name": "Zambia"},
{"c_name": "Czechoslovakia", "p_c_name": "Czech Republic"},
{"c_name": "Yugoslavia (Serbia)", "p_c_name": "Serbia"},
{"c_name": "East Germany", "p_c_name": "Germany"},
{"c_name": "Federal Republic of Germany", "p_c_name": "Germany"},
{"c_name": "Alsace (then Germany, now France)", "p_c_name": "France"},
{"c_name": "Germany (Poland)", "p_c_name": "Germany"},
{"c_name": "Germany (France)", "p_c_name": "Germany"},
{"c_name": "Germany (Russia)", "p_c_name": "Germany"},
{"c_name": "Poland (Ukraine)", "p_c_name": "Poland"},
{"c_name": "Poland (Lithuania)", "p_c_name": "Poland"},
{"c_name": "Poland (Belarus)", "p_c_name": "Poland"},
{"c_name": "Hungary (Slovakia)", "p_c_name": "Hungary"},
{"c_name": "India (Pakistan)", "p_c_name": "India"}]

df_mis_cntr = sp.createDataFrame(data)

## Organizations
Update null City and Country for organizations

In [8]:
df_nobel_orig = df_nobel_orig.withColumn("Organization City", \
                                f.when(f.trim(f.col("Organization Name")) == 'Howard Hughes Medical Institute', 'Durham, NC') \
                                 .when(f.trim(f.col("Organization Name")) == 'University of Delaware', 'Newark, DE') \
                                 .otherwise(f.col("Organization City"))) \
                             .withColumn("Organization Country", \
                                f.when(f.trim(f.col("Organization Name")) == 'Howard Hughes Medical Institute',\
                                       'United States of America') \
                                 .when((f.trim(f.col("Organization Name")) == 'Institut Pasteur') \
                                       & (f.trim(f.col("Organization City")) == 'Tunis'), \
                                       'Tunisia') \
                                 .otherwise(f.col("Organization Country")))

## Parent Organizations
Create a dataframe with matches 'Organization Name' - 'Parent Organization Name'.

In [9]:
data = [{"o_name": "Bell Telephone Laboratories", "p_o_name": "Bell Laboratories"},
{"o_name": "Columbia University Division, Cardio-Pulmonary Laboratory, Bellevue Hospital", "p_o_name": "Columbia University"},
{"o_name": "Duke University Medical Center", "p_o_name": "Duke University"},
{"o_name": "Duke University School of Medicine", "p_o_name": "Duke University"},
{"o_name": "Max-Planck-Institut für Chemie", "p_o_name": "Max Planck Society"},
{"o_name": "Kaiser-Wilhelm-Institut (now Fritz-Haber-Institut) für physikalische Chemie und Electrochemie", "p_o_name": "Max Planck Society"},
{"o_name": "Kaiser-Wilhelm-Institut (now Max-Planck Institut) für Chemie", "p_o_name": "Max Planck Society"},
{"o_name": "Kaiser-Wilhelm-Institut (now Max-Planck-Institut) für Biochemie", "p_o_name": "Max Planck Society"},
{"o_name": "Kaiser-Wilhelm-Institut (now Max-Planck-Institut) für Biologie", "p_o_name": "Max Planck Society"},
{"o_name": "Fritz-Haber-Institut der Max-Planck-Gesellschaft", "p_o_name": "Max Planck Society"},
{"o_name": "Kaiser-Wilhelm-Institut (now Max-Planck-Institut) für Physik", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für Biochemie ", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für Kohlenforschung (Max-Planck-Institute for Carbon Research)  ", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für Quantenoptik", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für Biophysik", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für Festkörperforschung", "p_o_name": "Max Planck Society"},
{"o_name": "Kaiser-Wilhelm-Institut (now Max-Planck Institut) für Medizinische Forschung", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für medizinische Forschung", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für Entwicklungsbiologie", "p_o_name": "Max Planck Society"},
{"o_name": "Max Planck Institute for Biophysical Chemistry", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für Biophysikalische Chemie", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für Physikalische Chemie", "p_o_name": "Max Planck Society"},
{"o_name": "Max-Planck-Institut für Zellchemie", "p_o_name": "Max Planck Society"},
{"o_name": "Harvard University, Biological Laboratories", "p_o_name": "Harvard University"},
{"o_name": "Harvard University, Lyman Laboratory", "p_o_name": "Harvard University"},
{"o_name": "Imperial Cancer Research Fund Laboratory", "p_o_name": "Imperial Cancer Research Fund"},
{"o_name": "Johns Hopkins University School of Medicine", "p_o_name": "Johns Hopkins University"},
{"o_name": "Karolinska Institutet, Nobel Medical Institute", "p_o_name": "Karolinska Institutet"},
{"o_name": "London University, King's College Hospital Medical School", "p_o_name": "London University"},
{"o_name": "Massachusetts Institute of Technology (MIT), Center for Cancer Research", "p_o_name": "Massachusetts Institute of Technology (MIT)"},
{"o_name": "New York University, College of Medicine", "p_o_name": "New York University"},
{"o_name": "Rockefeller Institute for Medical Research", "p_o_name": "Rockefeller University"},
{"o_name": "Sorbonne University, Institut Henri Poincaré", "p_o_name": "Sorbonne University"},
{"o_name": "Stanford University School of Medicine", "p_o_name": "Stanford University"},
{"o_name": "University of California School of Medicine", "p_o_name": "University of California"},
{"o_name": "University of California, Kavli Institute for Theoretical Physics", "p_o_name": "University of California"},
{"o_name": "University of Chicago, Ben May Laboratory for Cancer Research", "p_o_name": "University of Chicago"},
{"o_name": "University of Colorado, JILA", "p_o_name": "University of Colorado"},
{"o_name": "University of Freiburg", "p_o_name": "University of Freiburg im Breisgau"},
{"o_name": "University of Oxford, Royal Society", "p_o_name": "University of Oxford"},
{"o_name": "University of Texas Medical School at Houston", "p_o_name": "ttttttt"},
{"o_name": "University of Texas Southwestern Medical Center at Dallas", "p_o_name": "University of Texas"},
{"o_name": "University of Zurich, Institute of Experimental Immunology", "p_o_name": "University of Zurich"},
{"o_name": "Vanderbilt University School of Medicine", "p_o_name": "Vanderbilt University"},
{"o_name": "Yale University, School of Medicine", "p_o_name": "Yale University"}]

df_par_org = sp.createDataFrame(data)

## Cities
Add CityUpd column with US Cities in a format 'City, Region' for joining to the dataset.<br>
Create a dataframe with mismatches in Cities and Nobel Laureate dataframes.

In [10]:
# CityUpd column
df_cities_orig = df_cities_orig.withColumn("CityUpd", \
                              f.when(f.col("Country") == 'us', f.concat_ws(", ","AccentCity",'Region')) \
                               .otherwise(f.col("AccentCity")))

In [11]:
# Dataframe with mismatches. Columns: 
# 'mis_city_name' - city name in the Nobel Laureates dataframe.
# 'orig_city_name', 'orig_region', 'orig_country' - city, region, country names in the Cities dataframe.
data = [{"mis_city_name": "Casteldàwson", "orig_city_name": "Castledawson", "orig_region": "S7", "orig_country": "gb"},
{"mis_city_name": "Grand Valley, CO", "orig_city_name": "Green Valley Acres", "orig_region": "CO", "orig_country": "us"},
{"mis_city_name": "Yamanashi Prefecture", "orig_city_name": "Yamanashi", "orig_region": "37", "orig_country": "jp"},
{"mis_city_name": "Amherst, NS", "orig_city_name": "Amherst", "orig_region": "07", "orig_country": "ca"},
{"mis_city_name": "Champaign-Urbana, IL", "orig_city_name": "Champaign", "orig_region": "IL", "orig_country": "us"},
{"mis_city_name": "Danzig (Gdansk)", "orig_city_name": "Danzig", "orig_region": "82", "orig_country": "pl"},
{"mis_city_name": "Fleräng", "orig_city_name": "Uppsala", "orig_region": "21", "orig_country": "se"},
{"mis_city_name": "Kattowitz (Katowice)", "orig_city_name": "Kattowitz", "orig_region": "83", "orig_country": "pl"},
{"mis_city_name": "Hobart, Tasmania", "orig_city_name": "Hobart", "orig_region": "06", "orig_country": "au"},
{"mis_city_name": "Leningrad (Saint Petersburg)", "orig_city_name": "Leningrad", "orig_region": "66", "orig_country": "ru"},
{"mis_city_name": "Gaffken (Parusnoye)", "orig_city_name": "Gaffken", "orig_region": "23", "orig_country": "ru"},
{"mis_city_name": "Rufford, near Chesterfield", "orig_city_name": "Rufford", "orig_region": "H2", "orig_country": "gb"},
{"mis_city_name": "Goldschmieden, near Breslau", "orig_city_name": "Breslau", "orig_region": "72", "orig_country": "pl"},
{"mis_city_name": "Kibbutz Sde-Nahum", "orig_city_name": "Sede Nahum", "orig_region": "03", "orig_country": "il"},
{"mis_city_name": "Hansdorf (Lawice)", "orig_city_name": "Hansdorf", "orig_region": "85", "orig_country": "pl"},
{"mis_city_name": "Toyama City", "orig_city_name": "Toyama", "orig_region": "08", "orig_country": "jp"},
{"mis_city_name": "St. Petersburg", "orig_city_name": "Sankt-Peterburg", "orig_region": "66", "orig_country": "ru"},
{"mis_city_name": "Aldea Chimel", "orig_city_name": "Chimel", "orig_region": "01", "orig_country": "gt"},
{"mis_city_name": "Vicuña", "orig_city_name": "Vicuna", "orig_region": "07", "orig_country": "cl"},
{"mis_city_name": "Dippenhall", "orig_city_name": "Farnham", "orig_region": "E4", "orig_country": "gb"},
{"mis_city_name": "Königshütte (Chorzów)", "orig_city_name": "Königshütte", "orig_region": "14", "orig_country": "de"},
{"mis_city_name": "Skedsmo", "orig_city_name": "Skedsmokorset", "orig_region": "01", "orig_country": "no"},
{"mis_city_name": "Kingston, ON", "orig_city_name": "Kingston", "orig_region": "08", "orig_country": "ca"},
{"mis_city_name": "Leggiuno-Sangiano", "orig_city_name": "Varese", "orig_region": "09", "orig_country": "it"},
{"mis_city_name": "Taktser", "orig_city_name": "Qinghaihu", "orig_region": "06", "orig_country": "cn"},
{"mis_city_name": "Gränichen", "orig_city_name": "Granichen", "orig_region": "01", "orig_country": "ch"},
{"mis_city_name": "Neisse (Nysa)", "orig_city_name": "Neisse", "orig_region": "79", "orig_country": "pl"},
{"mis_city_name": "St. Columb Minor", "orig_city_name": "Saint Columb Minor", "orig_region": "C6", "orig_country": "gb"},
{"mis_city_name": "Zelvas", "orig_city_name": "Zelva", "orig_region": "60", "orig_country": "lt"},
{"mis_city_name": "Olympus, TN", "orig_city_name": "Byrdstown", "orig_region": "TN", "orig_country": "us"},
{"mis_city_name": "Sorau (Zory)", "orig_city_name": "Sorau in Nieder Lausitz", "orig_region": "76", "orig_country": "pl"},
{"mis_city_name": "Gjesdal", "orig_city_name": "Ålgård", "orig_region": "14", "orig_country": "no"},
{"mis_city_name": "Viipuri (Vyborg)", "orig_city_name": "Viipuri", "orig_region": "42", "orig_country": "ru"},
{"mis_city_name": "Kvikne", "orig_city_name": "Kviknepladsen", "orig_region": "06", "orig_country": "no"},
{"mis_city_name": "Nuoro, Sardinia", "orig_city_name": "Nuoro", "orig_region": "14", "orig_country": "it"},
{"mis_city_name": "Priluka (Nova Pryluka)", "orig_city_name": "Priluka Novaya", "orig_region": "23", "orig_country": "ua"},
{"mis_city_name": "Laibach (Ljubljana)", "orig_city_name": "Laibach", "orig_region": "04", "orig_country": "si"},
{"mis_city_name": "Smyrna (Izmir)", "orig_city_name": "Smyrna", "orig_region": "35", "orig_country": "tr"},
{"mis_city_name": "Mexico City", "orig_city_name": "Mexico", "orig_region": "09", "orig_country": "mx"},
{"mis_city_name": "Timmins, ON", "orig_city_name": "Timmins", "orig_region": "08", "orig_country": "ca"},
{"mis_city_name": "San José, CA", "orig_city_name": "San Jose", "orig_region": "CA", "orig_country": "us"},
{"mis_city_name": "Jamaica Plain, MA (Boston)", "orig_city_name": "Jamaica Plain", "orig_region": "MA", "orig_country": "us"},
{"mis_city_name": "Nitzkydorf, Banat", "orig_city_name": "Nitzkydorf", "orig_region": "36", "orig_country": "ro"},
{"mis_city_name": "Waltersdorf (Niegoslawice)", "orig_city_name": "Waltersdorf", "orig_region": "76", "orig_country": "pl"},
{"mis_city_name": "Agrigento, Sicily", "orig_city_name": "Agrigento", "orig_region": "15", "orig_country": "it"},
{"mis_city_name": "Medicine Hat, Alberta", "orig_city_name": "Medicine Hat", "orig_region": "01", "orig_country": "ca"},
{"mis_city_name": "&#346;eteniai", "orig_city_name": "Kedainiai", "orig_region": "60", "orig_country": "lt"},
{"mis_city_name": "Kharkov (Kharkiv)", "orig_city_name": "Kharkov", "orig_region": "07", "orig_country": "ua"},
{"mis_city_name": "Wailacama", "orig_city_name": "Dukuh Kupang Timur", "orig_region": "08", "orig_country": "id"},
{"mis_city_name": "Pressburg (Bratislava)", "orig_city_name": "Pressburg", "orig_region": "02", "orig_country": "sk"},
{"mis_city_name": "Lennep (Remscheid)", "orig_city_name": "Lennep", "orig_region": "07", "orig_country": "de"},
{"mis_city_name": "Higashimatsuyama", "orig_city_name": "Higashi-Matsuyama", "orig_region": "34", "orig_country": "jp"},
{"mis_city_name": "Nam Ha province", "orig_city_name": "Phu Ly", "orig_region": "80", "orig_country": "vn"},
{"mis_city_name": "Cheetham Hill, near Manchester  ", "orig_city_name": "Manchester", "orig_region": "I2", "orig_country": "gb"},
{"mis_city_name": "Kingston Hill", "orig_city_name": "London", "orig_region": "H9", "orig_country": "gb"},
{"mis_city_name": "Hofei, Anhwei", "orig_city_name": "Hefei", "orig_region": "01", "orig_country": "cn"},
{"mis_city_name": "St. Louis, MO", "orig_city_name": "Saint Louis", "orig_region": "MO", "orig_country": "us"},
{"mis_city_name": "Dabrovica", "orig_city_name": "Dabrowica", "orig_region": "77", "orig_country": "pl"},
{"mis_city_name": "Mount Verno, NY", "orig_city_name": "Mount Vernon", "orig_region": "NY", "orig_country": "us"},
{"mis_city_name": "Ta'izz", "orig_city_name": "Taiz", "orig_region": "11", "orig_country": "ye"},
{"mis_city_name": "Mit Abu al-Kawm", "orig_city_name": "Mit Abu al Kawm", "orig_region": "09", "orig_country": "eg"},
{"mis_city_name": "Chidambaram, Tamil Nadu", "orig_city_name": "Chidambaram", "orig_region": "25", "orig_country": "in"},
{"mis_city_name": "Tananarive (Antananarivo)", "orig_city_name": "Tananarive", "orig_region": "05", "orig_country": "mg"},
{"mis_city_name": "Frankfurt-on-the-Main", "orig_city_name": "Frankfurt am Main", "orig_region": "05", "orig_country": "de"},
{"mis_city_name": "Hoechst", "orig_city_name": "Frankfurt am Main", "orig_region": "05", "orig_country": "de"},
{"mis_city_name": "Jhang Maghi&#257;na", "orig_city_name": "Jhang", "orig_region": "12", "orig_country": "in"},
{"mis_city_name": "Zhejiang Ningbo", "orig_city_name": "Ningbo", "orig_region": "14", "orig_country": "cn"},
{"mis_city_name": "Wilno (Vilnius)", "orig_city_name": "Wilno", "orig_region": "65", "orig_country": "lt"},
{"mis_city_name": "Langford Grove, Maldon, Essex", "orig_city_name": "Langford", "orig_region": "E4  ", "orig_country": "gb"},
{"mis_city_name": "Clausthal (Clausthal-Zellerfeld)", "orig_city_name": "Clausthal", "orig_region": "06", "orig_country": "de"},
{"mis_city_name": "Strassburg (Strasbourg)", "orig_city_name": "Strassburg", "orig_region": "C1  ", "orig_country": "fr"},
{"mis_city_name": "Rangoon (Yangon)", "orig_city_name": "Rangoon", "orig_region": "17", "orig_country": "mm"},
{"mis_city_name": "Strehlen (Strzelin)", "orig_city_name": "Strehlen", "orig_region": "72", "orig_country": "pl"},
{"mis_city_name": "Buczacz (Buchach)", "orig_city_name": "Buczacz", "orig_region": "22", "orig_country": "ua"},
{"mis_city_name": "Kalgoorlie", "orig_city_name": "Boulder", "orig_region": "08", "orig_country": "au"},
{"mis_city_name": "Nizhny Tagil", "orig_city_name": "Nizhnii Tagil", "orig_region": "71", "orig_country": "ru"},
{"mis_city_name": "Bremerhaven-Lehe", "orig_city_name": "Bremerhaven", "orig_region": "03", "orig_country": "de"},
{"mis_city_name": "Corteno", "orig_city_name": "Brescia", "orig_region": "09", "orig_country": "it"},
{"mis_city_name": "Windsor, ON", "orig_city_name": "Windsor", "orig_region": "08", "orig_country": "ca"},
{"mis_city_name": "Bnin (Kórnik)", "orig_city_name": "Bnin", "orig_region": "86", "orig_country": "pl"},
{"mis_city_name": "Iria Flavia", "orig_city_name": "Iria", "orig_region": "58", "orig_country": "es"},
{"mis_city_name": "Breslau (Wroclaw)", "orig_city_name": "Breslau", "orig_region": "72", "orig_country": "pl"},
{"mis_city_name": "Uskup (Skopje)", "orig_city_name": "Üsküp", "orig_region": "39", "orig_country": "tr"},
{"mis_city_name": "Koenigsberg (Kaliningrad)", "orig_city_name": "Königsberg", "orig_region": "23", "orig_country": "ru"},
{"mis_city_name": "Reykjavik", "orig_city_name": "Reykjavík", "orig_region": "10", "orig_country": "is"},
{"mis_city_name": "Petilla de Aragó", "orig_city_name": "Petilla de Aragón", "orig_region": "32", "orig_country": "es"},
{"mis_city_name": "'s Graveland", "orig_city_name": "'s-Graveland", "orig_region": "07", "orig_country": "nl"},
{"mis_city_name": "Zloczov", "orig_city_name": "Zloczow", "orig_region": "15", "orig_country": "ua"},
{"mis_city_name": "Vishneva", "orig_city_name": "Wisniowa", "orig_region": "77", "orig_country": "pl"},
{"mis_city_name": "St. Paul, MN", "orig_city_name": "Saint Paul", "orig_region": "MN", "orig_country": "us"},
{"mis_city_name": "Kristiania (Oslo)", "orig_city_name": "Kristiania", "orig_region": "12", "orig_country": "no"},
{"mis_city_name": "Rendcombe", "orig_city_name": "Cirencester", "orig_region": "E6", "orig_country": "gb"},
{"mis_city_name": "Vitebsk, Belorussia", "orig_city_name": "Vitebsk", "orig_region": "07", "orig_country": "by"},
{"mis_city_name": "Mürzzuschlag", "orig_city_name": "Murzzuschlag", "orig_region": "06", "orig_country": "at"},
{"mis_city_name": "Val di Castello", "orig_city_name": "Pietrasanta", "orig_region": "16", "orig_country": "it"},
{"mis_city_name": "Glencorse", "orig_city_name": "Midlothian", "orig_region": "V5", "orig_country": "gb"},
{"mis_city_name": "Strelno (Strzelno)", "orig_city_name": "Strelno", "orig_region": "73", "orig_country": "pl"},
{"mis_city_name": "Lochfield", "orig_city_name": "Lichfield", "orig_region": "N1", "orig_country": "gb"},
{"mis_city_name": "the Hague", "orig_city_name": "The Hague", "orig_region": "11", "orig_country": "nl"},
{"mis_city_name": "Ivano-Frankivsk", "orig_city_name": "IvanoFrankivsk", "orig_region": "06", "orig_country": "ua"},
{"mis_city_name": "Victoria, BC", "orig_city_name": "Victoria", "orig_region": "02", "orig_country": "ca"},
{"mis_city_name": "Lethbridge, Alberta", "orig_city_name": "Lethbridge", "orig_region": "01", "orig_country": "ca"},
{"mis_city_name": "Wickenberg, AZ", "orig_city_name": "Wickenburg", "orig_region": "AZ", "orig_country": "us"},
{"mis_city_name": "Grantchester", "orig_city_name": "Cambridgeshire", "orig_region": "C3", "orig_country": "gb"},
{"mis_city_name": "Wakulla Springs State Park, FL  ", "orig_city_name": "Wakulla Springs", "orig_region": "FL", "orig_country": "us"},
{"mis_city_name": "Presqu'île-de-Giens", "orig_city_name": "Giens", "orig_region": "B8", "orig_country": "fr"},
{"mis_city_name": "Newfoundland", "orig_city_name": "Saint John's", "orig_region": "05", "orig_country": "ca"},
{"mis_city_name": "New Jersey, NJ", "orig_city_name": "Jersey City", "orig_region": "NJ", "orig_country": "us"},
{"mis_city_name": "Penrhyndeudraeth", "orig_city_name": "Penrhyndeudreath", "orig_region": "Y2", "orig_country": "gb"},
{"mis_city_name": "Perranarworthal", "orig_city_name": "County of Cornwall", "orig_region": "C6", "orig_country": "gb"},
{"mis_city_name": "Llangarron", "orig_city_name": "Ross on Wye", "orig_region": "F7", "orig_country": "gb"},
{"mis_city_name": "Kraków", "orig_city_name": "Krakow", "orig_region": "77", "orig_country": "pl"},
{"mis_city_name": "Hamilton, Ontario", "orig_city_name": "Hamilton", "orig_region": "08", "orig_country": "ca"},
{"mis_city_name": "Putney Heath", "orig_city_name": "London", "orig_region": "H9", "orig_country": "gb"},
{"mis_city_name": "Shipston-on-Stour", "orig_city_name": "Shipston on Stour", "orig_region": "P3", "orig_country": "gb"},
{"mis_city_name": "Brive-Corrèze", "orig_city_name": "Brive-la-Gaillarde", "orig_region": "B1", "orig_country": "fr"},
{"mis_city_name": "Övralid", "orig_city_name": "Motala", "orig_region": "16", "orig_country": "se"},
{"mis_city_name": "Bornheim-Merten", "orig_city_name": "Merten", "orig_region": "07", "orig_country": "de"},
{"mis_city_name": "Truro, NS", "orig_city_name": "Truro", "orig_region": "07", "orig_country": "ca"},
{"mis_city_name": "Waterford, CT", "orig_city_name": "New London", "orig_region": "CT", "orig_country": "us"},
{"mis_city_name": "West Berlin", "orig_city_name": "Berlin", "orig_region": "16", "orig_country": "de"},
{"mis_city_name": "Palma, Majorca", "orig_city_name": "Palma de Mallorca", "orig_region": "07", "orig_country": "es"},
{"mis_city_name": "Ayot St. Lawrence", "orig_city_name": "Ayot Saint Lawrence", "orig_region": "F8", "orig_country": "gb"},
{"mis_city_name": "Moffett Field, CA", "orig_city_name": "Santa Clara", "orig_region": "CA", "orig_country": "us"},
{"mis_city_name": "Lewes, East Sussex", "orig_city_name": "Lewes", "orig_region": "E2", "orig_country": "gb"},
{"mis_city_name": "Agnetendorf (Jagniatków)", "orig_city_name": "Jelenia Gora", "orig_region": "72", "orig_country": "pl"},
{"mis_city_name": "Research Triangle Park, NC", "orig_city_name": "Raleigh", "orig_region": "NC", "orig_country": "us"},
{"mis_city_name": "Bucksburn (Scotland)", "orig_city_name": "Bucksburn", "orig_region": "T5", "orig_country": "gb"},
{"mis_city_name": "Lidingö-Stockholm", "orig_city_name": "Lidingö", "orig_region": "26", "orig_country": "se"},
{"mis_city_name": "Breisgau", "orig_city_name": "Freiburg im Breisgau", "orig_region": "01", "orig_country": "de"},
{"mis_city_name": "J&uuml;lich", "orig_city_name": "Jülich", "orig_region": "07", "orig_country": "de"},
{"mis_city_name": "Mannheim-Rheinau", "orig_city_name": "Rheinau", "orig_region": "01", "orig_country": "de"},
{"mis_city_name": "Sapporo", "orig_city_name": "Sapporo-shi", "orig_region": "12", "orig_country": "jp"},
{"mis_city_name": "Berlin-Dahlem", "orig_city_name": "Dahlem", "orig_region": "16", "orig_country": "de"},
{"mis_city_name": "Long Island, New York, NY", "orig_city_name": "Long Island City", "orig_region": "NY", "orig_country": "us"},
{"mis_city_name": "Harwell, Berkshire", "orig_city_name": "Harwell", "orig_region": "K2", "orig_country": "gb"},
{"mis_city_name": "Argonne, IL", "orig_city_name": "Lemont", "orig_region": "IL", "orig_country": "us"},
{"mis_city_name": "Altenberg; Grünau im Almtal", "orig_city_name": "Altenberg", "orig_region": "04", "orig_country": "at"},
{"mis_city_name": "Massachusetts, MA", "orig_city_name": "Boston", "orig_region": "MA", "orig_country": "us"},
{"mis_city_name": "Mülheim/Ruhr", "orig_city_name": "Mülheim an der Ruhr", "orig_region": "07", "orig_country": "de"},
{"mis_city_name": "Rüschlikon", "orig_city_name": "Zurich", "orig_region": "25", "orig_country": "ch"},    
{"mis_city_name": "Kyoto", "orig_city_name": "Kyoto-shi", "orig_region": "22", "orig_country": "jp"},
{"mis_city_name": "Guebwiller", "orig_city_name": "Guebwiller", "orig_region": "C1", "orig_country": "fr"},
{"mis_city_name": "Lahore", "orig_city_name": "Lahore", "orig_region": "02", "orig_country": "pk"},
{"mis_city_name": "Thorshavn", "orig_city_name": "Thorshavn", "orig_region": "00", "orig_country": "fo"},
{"mis_city_name": "Cluny", "orig_city_name": "Offord Cluny", "orig_region": "C3", "orig_country": "gb"},
{"mis_city_name": "Lagow", "orig_city_name": "Lagow", "orig_region": "72", "orig_country": "pl"},
{"mis_city_name": "Neuchâtel", "orig_city_name": "Neuchâtel", "orig_region": "A6", "orig_country": "fr"},
{"mis_city_name": "Kaysersberg", "orig_city_name": "Kaysersberg", "orig_region": "C1", "orig_country": "fr"},
{"mis_city_name": "Dili", "orig_city_name": "Dukuh Kupang Timur", "orig_region": "08", "orig_country": "id"},
{"mis_city_name": "Bad Salzbrunn", "orig_city_name": "Bad Salzbrunn", "orig_region": "72", "orig_country": "pl"},
{"mis_city_name": "San Juan", "orig_city_name": "San Juan", "orig_region": "PR", "orig_country": "us"},
{"mis_city_name": "Lanzarote", "orig_city_name": "Arrecife", "orig_region": "53", "orig_country": "es"},
{"mis_city_name": "Hong Kong", "orig_city_name": "Hong Kong", "orig_region": "00", "orig_country": "hk"}]

df_tmp_mis_cities = sp.createDataFrame(data)

# Create dataset for data analysis
Collect data into one dataframe for the future analysis. Contains information about Nobel Prizes with all attributes: countries and cities information, organizations and laureate persons, country codes, etc.

## Use Nobel Prize dataframe as main

In [12]:
# id is the internal column for grouping, remove it later
df = df_nobel_orig.sort("Year", "Laureate ID") \
                  .withColumn("id", f.monotonically_increasing_id()) \
                  .withColumnRenamed("Year", "year") \
                  .withColumnRenamed("Category", "category") \
                  .withColumnRenamed("Prize", "prize") \
                  .withColumnRenamed("Motivation", "motivation") \
                  .withColumnRenamed("Prize Share", "prize_share") \
                  .withColumnRenamed("Laureate ID", "laureate_id") \
                  .withColumnRenamed("Laureate Type", "laureate_type") \
                  .withColumnRenamed("Full Name", "full_name") \
                  .withColumnRenamed("Birth Date", "birth_date") \
                  .withColumnRenamed("Birth City", "birth_city_name") \
                  .withColumnRenamed("Birth Country", "birth_country_name") \
                  .withColumnRenamed("Sex", "gender") \
                  .withColumnRenamed("Organization Name", "organization_name") \
                  .withColumnRenamed("Organization City", "organization_city_name") \
                  .withColumnRenamed("Organization Country", "organization_country_name") \
                  .withColumnRenamed("Death Date", "death_date") \
                  .withColumnRenamed("Death City", "death_city_name") \
                  .withColumnRenamed("Death Country", "death_country_name") \
                  .withColumn("laureate_person_name", \
                       f.when(f.col("laureate_type") == 'Individual', f.col("full_name")) \
                        .otherwise(f.lit(None))) \
                  .withColumn("society_name", \
                       f.when(f.col("laureate_type") == 'Organization', f.col("full_name")) \
                        .otherwise(f.lit(None))) \
                  .select("id", "laureate_id", "year", "category", "prize", "prize_share", "laureate_type", "motivation", \
                          "laureate_person_name", "gender", "society_name", 
                          "birth_date", "birth_city_name", "birth_country_name", \
                          "death_date", "death_city_name", "death_country_name", \
                          "organization_name", "organization_city_name", "organization_country_name")

In [13]:
# Check the count of records in the dataset
df.count()

969

## Add Parent Countries and Organizations
Parent country name added for historical names of country (or if specified a name of country part). If country has a modern name, parent name = country name.<br>
Parent Organizarion name added for parent names of organizations. If organization doesn't have a parent organizarion, parent name = country name.<br>
In a result, *parent_name* columns contain a modern/parent name, *name* columns contain a historical/child name.<br>
This gives more opportunities to analyze data by country and organization.

In [14]:
# Join Birth Country Parent Name
join_expr = f.trim(df["birth_country_name"]) == f.trim(df_mis_cntr["c_name"])
df = df.join(df_mis_cntr, join_expr, "left_outer") \
       .withColumn("birth_country_parent_name", \
            f.when(c.isNull(f.col("p_c_name")), f.col("birth_country_name")) \
             .otherwise(f.col("p_c_name"))) \
       .drop("c_name", "p_c_name")

# Join Death Country Parent Name
join_expr = f.trim(df["death_country_name"]) == f.trim(df_mis_cntr["c_name"])
df = df.join(df_mis_cntr, join_expr, "left_outer") \
       .withColumn("death_country_parent_name", \
            f.when(c.isNull(f.col("p_c_name")), f.col("death_country_name")) \
             .otherwise(f.col("p_c_name"))) \
       .drop("c_name", "p_c_name")

# Join Organization Country Parent Name
join_expr = f.trim(df["organization_country_name"]) == f.trim(df_mis_cntr["c_name"])
df = df.join(df_mis_cntr, join_expr, "left_outer") \
       .withColumn("organization_country_parent_name", \
            f.when(c.isNull(f.col("p_c_name")), f.col("organization_country_name")) \
             .otherwise(f.col("p_c_name"))) \
       .drop("c_name", "p_c_name")

In [15]:
# Check null values
df.where("(birth_country_parent_name is null and birth_country_name is not null) \
       or (death_country_parent_name is null and death_country_name is not null) \
       or (organization_country_parent_name is null and organization_country_name is not null)").count()

0

Parent organization name added for child organizations. if there is no Parent Organization, parent organization = Organization name.

In [16]:
#Join Organization Parent Name
join_expr = f.trim(df["organization_name"]) == f.trim(df_par_org["o_name"])
df = df.join(df_par_org, join_expr, "left_outer") \
       .withColumn("organization_parent_name", \
            f.when(c.isNull(f.col("p_o_name")), f.col("organization_name")) \
             .otherwise(f.col("p_o_name"))) \
       .drop("o_name", "p_o_name")

In [17]:
# Check null values
df.where("organization_parent_name is null and organization_name is not null").count()

0

## Add Country information
Join Country information by Parent Country Name columns that contain modern country names.

### Birth Country
Add all information about birth place for analysis according to the conditions of the birth of the laureate

In [18]:
join_expr = f.trim(df["birth_country_parent_name"]) == f.trim(df_countries_orig["Country"])
df = df.join(df_countries_orig, join_expr, "left_outer") \
       .withColumnRenamed("continent_name", "birth_continent_name") \
       .withColumnRenamed("Population", "birth_country_population") \
       .withColumnRenamed("Region", "birth_region_name") \
       .withColumnRenamed("Area (sq. mi.)", "birth_country_area_sq_miles") \
       .withColumnRenamed("Pop. Density (per sq. mi.)", "birth_country_pop_dencity_per_sq_mile") \
       .withColumnRenamed("Coastline (coast/area ratio)", "birth_country_coastline") \
       .withColumnRenamed("Net migration", "birth_country_net_migration") \
       .withColumnRenamed("Infant mortality (per 1000 births)", "birth_country_infant_mortality_per_1000") \
       .withColumnRenamed("GDP ($ per capita)", "birth_country_gdb") \
       .withColumnRenamed("Literacy (%)", "birth_country_percent_literacy") \
       .withColumnRenamed("Phones (per 1000)", "birth_country_phones_per_1000") \
       .withColumnRenamed("Arable (%)", "birth_country_percent_arable") \
       .withColumnRenamed("Crops (%)", "birth_country_percent_crops") \
       .withColumnRenamed("Other (%)", "birth_country_percent_other") \
       .withColumnRenamed("Climate", "birth_country_climate") \
       .withColumnRenamed("Birthrate", "birth_country_birthrate") \
       .withColumnRenamed("Deathrate", "birth_country_deathrate") \
       .withColumnRenamed("Agriculture", "birth_country_agriculture") \
       .withColumnRenamed("Industry", "birth_country_industry") \
       .withColumnRenamed("Service", "birth_country_service") \
       .withColumnRenamed("country_code", "birth_country_code") \
       .drop("Country")

In [19]:
# Check null values
df.where("birth_country_parent_name is not null and birth_continent_name is null").count()

0

### Death Country
Add only main and geographical information about death place for analysis according to the conditions of the end of life of the laureate

In [20]:
join_expr = f.trim(df["death_country_parent_name"]) == f.trim(df_countries_orig["Country"])
df = df.join(df_countries_orig, join_expr, "left_outer") \
       .withColumnRenamed("continent_name", "death_continent_name") \
       .withColumnRenamed("Population", "death_country_population") \
       .withColumnRenamed("Region", "death_region_name") \
       .withColumnRenamed("Coastline (coast/area ratio)", "death_country_coastline") \
       .withColumnRenamed("GDP ($ per capita)", "death_country_gdb") \
       .withColumnRenamed("Climate", "death_country_climate") \
       .withColumnRenamed("country_code", "death_country_code") \
       .drop("Country", "Area (sq. mi.)", "Pop. Density (per sq. mi.)", "Net migration", "Infant mortality (per 1000 births)", \
             "Literacy (%)", "Phones (per 1000)", "Arable (%)", "Crops (%)", "Other (%)", "Birthrate", "Deathrate", \
             "Agriculture", "Industry", "Service")

In [21]:
# Check null values
df.where("death_country_parent_name is not null and death_continent_name is null").count()

0

### Organization Country
Add only main information about organization place for analysis according to the conditions of the work of the laureate

In [22]:
join_expr = f.trim(df["organization_country_parent_name"]) == f.trim(df_countries_orig["Country"])
df = df.join(df_countries_orig, join_expr, "left_outer") \
       .withColumnRenamed("continent_name", "organization_continent_name") \
       .withColumnRenamed("Population", "organization_country_population") \
       .withColumnRenamed("Region", "organization_region_name") \
       .withColumnRenamed("GDP ($ per capita)", "organization_country_gdb") \
       .withColumnRenamed("country_code", "organization_country_code") \
       .drop("Country", "Area (sq. mi.)", "Pop. Density (per sq. mi.)", "Net migration", "Infant mortality (per 1000 births)", \
             "Literacy (%)", "Phones (per 1000)", "Arable (%)", "Crops (%)", "Other (%)", "Birthrate", "Deathrate", \
             "Agriculture", "Industry", "Service", "Coastline (coast/area ratio)", "Climate")

In [23]:
# Check null values
df.where("organization_country_parent_name is not null and organization_continent_name is null").count()

0

In [24]:
# Check the count of records in the dataset
df.count()

969

## Add City information

### Birth City

Join the dataframe with mismatches df_tmp_mis_cities to the main dataframe

In [25]:
join_expr = f.trim(df["birth_city_name"]) == f.trim(df_tmp_mis_cities["mis_city_name"])
df = df.join(df_tmp_mis_cities, join_expr, "left_outer").drop("mis_city_name")

Join the dataframe Cities for mismatches by the columns in the dataframe df_tmp_mis_cities

In [26]:
join_expr = (f.trim(df["orig_city_name"]) == f.trim(df_cities_orig["AccentCity"])) \
            & (f.trim(df["orig_region"]) == f.trim(df_cities_orig["Region"])) \
            & (f.trim(df["orig_country"]) == f.trim(df_cities_orig["Country"]))

df = df.join(df_cities_orig, join_expr, "left_outer") \
       .withColumnRenamed("Region", "city_region_tmp") \
       .withColumnRenamed("Population", "city_population_tmp") \
       .withColumnRenamed("Latitude", "city_latitude_tmp") \
       .withColumnRenamed("Longitude", "city_longitude_tmp") \
       .drop("orig_city_name", "orig_country", "orig_region", "Country", "City", "AccentCity", "CityUpd")

Join the dataframe Cities for matches by the columns CityUpd and Country

In [27]:
join_expr = (f.trim(df["birth_city_name"]) == f.trim(df_cities_orig["CityUpd"])) \
            & (f.trim(df["birth_country_code"]) == f.trim(df_cities_orig["Country"]))

# City columns: if a value from the df_tmp_mis_cities is not null, then take it, else take a value from the df_cities_orig
df = df.join(df_cities_orig, join_expr, "left_outer") \
       .withColumn("birth_city_region", \
            f.when(c.isNotNull(f.col("city_region_tmp")), f.col("city_region_tmp")) \
             .otherwise(f.col("Region"))) \
       .withColumn("birth_city_population", \
            f.when(c.isNotNull(f.col("city_population_tmp")), f.col("city_population_tmp")) \
             .otherwise(f.col("Population"))) \
       .withColumn("birth_city_latitude", \
            f.when(c.isNotNull(f.col("city_latitude_tmp")), f.col("city_latitude_tmp")) \
             .otherwise(f.col("Latitude"))) \
       .withColumn("birth_city_longitude", \
            f.when(c.isNotNull(f.col("city_longitude_tmp")), f.col("city_longitude_tmp")) \
             .otherwise(f.col("Longitude"))) \
       .drop("birth_country_code", "city_region_tmp", "city_population_tmp", "city_latitude_tmp", "city_longitude_tmp", \
             "Country", "City", "AccentCity", "Region", "Population", "Latitude", "Longitude", "CityUpd")

# Some cities are not unique by "City Name" + "Country". Remove duplicates from the dataframe, use the first number of City ID
w = Window.partitionBy("id").orderBy(f.col("id"))
df = df.withColumn("row", f.row_number().over(w)).filter(f.col("row") == 1).drop("row")

In [28]:
# Check null values
df.where("birth_city_name is not null and birth_city_region is null").count()

0

### Death City

Join the dataframe with mismatches df_tmp_mis_cities to the main dataframe

In [29]:
join_expr = f.trim(df["death_city_name"]) == f.trim(df_tmp_mis_cities["mis_city_name"])
df = df.join(df_tmp_mis_cities, join_expr, "left_outer").drop("mis_city_name")

Join the dataframe Cities for mismatches by the columns in the dataframe df_tmp_mis_cities

In [30]:
join_expr = (f.trim(df["orig_city_name"]) == f.trim(df_cities_orig["AccentCity"])) \
            & (f.trim(df["orig_region"]) == f.trim(df_cities_orig["Region"])) \
            & (f.trim(df["orig_country"]) == f.trim(df_cities_orig["Country"]))

df = df.join(df_cities_orig, join_expr, "left_outer") \
       .withColumnRenamed("Region", "city_region_tmp") \
       .withColumnRenamed("Population", "city_population_tmp") \
       .withColumnRenamed("Latitude", "city_latitude_tmp") \
       .withColumnRenamed("Longitude", "city_longitude_tmp") \
       .drop("orig_city_name", "orig_country", "orig_region", "Country", "City", "AccentCity", "CityUpd")

Join the dataframe Cities for matches by the columns CityUpd and Country

In [31]:
join_expr = (f.trim(df["death_city_name"]) == f.trim(df_cities_orig["CityUpd"])) \
            & (f.trim(df["death_country_code"]) == f.trim(df_cities_orig["Country"]))

# City columns: if a value from the df_tmp_mis_cities is not null, then take it, else take a value from the df_cities_orig
df = df.join(df_cities_orig, join_expr, "left_outer") \
       .withColumn("death_city_region", \
            f.when(c.isNotNull(f.col("city_region_tmp")), f.col("city_region_tmp")) \
             .otherwise(f.col("Region"))) \
       .withColumn("death_city_population", \
            f.when(c.isNotNull(f.col("city_population_tmp")), f.col("city_population_tmp")) \
             .otherwise(f.col("Population"))) \
       .withColumn("death_city_latitude", \
            f.when(c.isNotNull(f.col("city_latitude_tmp")), f.col("city_latitude_tmp")) \
             .otherwise(f.col("Latitude"))) \
       .withColumn("death_city_longitude", \
            f.when(c.isNotNull(f.col("city_longitude_tmp")), f.col("city_longitude_tmp")) \
             .otherwise(f.col("Longitude"))) \
       .drop("death_country_code", "city_region_tmp", "city_population_tmp", "city_latitude_tmp", "city_longitude_tmp", \
             "Country", "City", "AccentCity", "Region", "Population", "Latitude", "Longitude", "CityUpd")

# Some cities are not unique by "City Name" + "Country". Remove duplicates from the dataframe, use the first number of City ID
w = Window.partitionBy("id").orderBy(f.col("id"))
df = df.withColumn("row", f.row_number().over(w)).filter(f.col("row") == 1).drop("row")

In [32]:
# Check null values
df.where("death_city_name is not null and death_city_region is null").count()

0

### Organization City

Join the dataframe with mismatches df_tmp_mis_cities to the main dataframe

In [33]:
join_expr = f.trim(df["organization_city_name"]) == f.trim(df_tmp_mis_cities["mis_city_name"])
df = df.join(df_tmp_mis_cities, join_expr, "left_outer").drop("mis_city_name")

Join the dataframe Cities for mismatches by the columns in the dataframe df_tmp_mis_cities

In [34]:
join_expr = (f.trim(df["orig_city_name"]) == f.trim(df_cities_orig["AccentCity"])) \
            & (f.trim(df["orig_region"]) == f.trim(df_cities_orig["Region"])) \
            & (f.trim(df["orig_country"]) == f.trim(df_cities_orig["Country"]))

df = df.join(df_cities_orig, join_expr, "left_outer") \
       .withColumnRenamed("Region", "city_region_tmp") \
       .withColumnRenamed("Population", "city_population_tmp") \
       .withColumnRenamed("Latitude", "city_latitude_tmp") \
       .withColumnRenamed("Longitude", "city_longitude_tmp") \
       .drop("orig_city_name", "orig_country", "orig_region", "Country", "City", "AccentCity", "CityUpd")

Join the dataframe Cities for matches by the columns CityUpd and Country

In [35]:
join_expr = (f.trim(df["organization_city_name"]) == f.trim(df_cities_orig["CityUpd"])) \
            & (f.trim(df["organization_country_code"]) == f.trim(df_cities_orig["Country"]))

# City columns: if a value from the df_tmp_mis_cities is not null, then take it, else take a value from the df_cities_orig
df = df.join(df_cities_orig, join_expr, "left_outer") \
       .withColumn("organization_city_region", \
            f.when(c.isNotNull(f.col("city_region_tmp")), f.col("city_region_tmp")) \
             .otherwise(f.col("Region"))) \
       .withColumn("organization_city_population", \
            f.when(c.isNotNull(f.col("city_population_tmp")), f.col("city_population_tmp")) \
             .otherwise(f.col("Population"))) \
       .withColumn("organization_city_latitude", \
            f.when(c.isNotNull(f.col("city_latitude_tmp")), f.col("city_latitude_tmp")) \
             .otherwise(f.col("Latitude"))) \
       .withColumn("organization_city_longitude", \
            f.when(c.isNotNull(f.col("city_longitude_tmp")), f.col("city_longitude_tmp")) \
             .otherwise(f.col("Longitude"))) \
       .drop("organization_country_code", "city_region_tmp", "city_population_tmp", "city_latitude_tmp", "city_longitude_tmp", \
             "Country", "City", "AccentCity", "Region", "Population", "Latitude", "Longitude", "CityUpd")

# Some cities are not unique by "City Name" + "Country". Remove duplicates from the dataframe, use the first number of City ID
w = Window.partitionBy("id").orderBy(f.col("id"))
df = df.withColumn("row", f.row_number().over(w)).filter(f.col("row") == 1).drop("row")

In [36]:
# Check null values
df.where("organization_city_name is not null and organization_city_region is null").count()

0

In [37]:
# Check the count of records in the dataset
df.count()

969

## Create Organization columns
1 laureate can work in more than 1 organization. Review the max count of organizations, create a set of columns for this max count

In [38]:
df.groupBy("laureate_id").count().select(f.max("count")).show()

+----------+
|max(count)|
+----------+
|         3|
+----------+



In [39]:
# Count of unique laureates
df.select("laureate_id").distinct().count()

904

In the final dataset, the laureate_id field must be unique so that the subsequent data analysis is more accurate (i.e., information about each laureate was included in the statistics 1 time, not 2 or 3). The dataset should have 904 records.

Max count of organizations for 1 laureate = 3. Add the row number of organization for every laureate.

In [40]:
w = Window.partitionBy("laureate_id").orderBy(f.col("laureate_id"))
df = df.withColumn("row_org", f.row_number().over(w))

Create 3 sets of organization's columns.

In [41]:
df = df.withColumn("organization_name1", \
            f.when(f.col("row_org") == 1, f.col("organization_name")).otherwise(f.lit(None))) \
       .withColumn("organization_name2", \
            f.when(f.col("row_org") == 2, f.col("organization_name")).otherwise(f.lit(None))) \
       .withColumn("organization_name3", \
            f.when(f.col("row_org") == 3, f.col("organization_name")).otherwise(f.lit(None))) \
       .withColumn("organization_city_name1", \
            f.when(f.col("row_org") == 1, f.col("organization_city_name")).otherwise(f.lit(None))) \
       .withColumn("organization_city_name2", \
            f.when(f.col("row_org") == 2, f.col("organization_city_name")).otherwise(f.lit(None))) \
       .withColumn("organization_city_name3", \
            f.when(f.col("row_org") == 3, f.col("organization_city_name")).otherwise(f.lit(None))) \
       .withColumn("organization_country_name1", \
            f.when(f.col("row_org") == 1, f.col("organization_country_name")).otherwise(f.lit(None))) \
       .withColumn("organization_country_name2", \
            f.when(f.col("row_org") == 2, f.col("organization_country_name")).otherwise(f.lit(None))) \
       .withColumn("organization_country_name3", \
            f.when(f.col("row_org") == 3, f.col("organization_country_name")).otherwise(f.lit(None))) \
       .withColumn("organization_country_parent_name1", \
            f.when(f.col("row_org") == 1, f.col("organization_country_parent_name")).otherwise(f.lit(None))) \
       .withColumn("organization_country_parent_name2", \
            f.when(f.col("row_org") == 2, f.col("organization_country_parent_name")).otherwise(f.lit(None))) \
       .withColumn("organization_country_parent_name3", \
            f.when(f.col("row_org") == 3, f.col("organization_country_parent_name")).otherwise(f.lit(None))) \
       .withColumn("organization_parent_name1", \
            f.when(f.col("row_org") == 1, f.col("organization_parent_name")).otherwise(f.lit(None))) \
       .withColumn("organization_parent_name2", \
            f.when(f.col("row_org") == 2, f.col("organization_parent_name")).otherwise(f.lit(None))) \
       .withColumn("organization_parent_name3", \
            f.when(f.col("row_org") == 3, f.col("organization_parent_name")).otherwise(f.lit(None))) \
       .withColumn("organization_region_name1", \
            f.when(f.col("row_org") == 1, f.col("organization_region_name")).otherwise(f.lit(None))) \
       .withColumn("organization_region_name2", \
            f.when(f.col("row_org") == 2, f.col("organization_region_name")).otherwise(f.lit(None))) \
       .withColumn("organization_region_name3", \
            f.when(f.col("row_org") == 3, f.col("organization_region_name")).otherwise(f.lit(None))) \
       .withColumn("organization_country_population1", \
            f.when(f.col("row_org") == 1, f.col("organization_country_population")).otherwise(f.lit(None))) \
       .withColumn("organization_country_population2", \
            f.when(f.col("row_org") == 2, f.col("organization_country_population")).otherwise(f.lit(None))) \
       .withColumn("organization_country_population3", \
            f.when(f.col("row_org") == 3, f.col("organization_country_population")).otherwise(f.lit(None))) \
       .withColumn("organization_country_gdb1", \
            f.when(f.col("row_org") == 1, f.col("organization_country_gdb")).otherwise(f.lit(None))) \
       .withColumn("organization_country_gdb2", \
            f.when(f.col("row_org") == 2, f.col("organization_country_gdb")).otherwise(f.lit(None))) \
       .withColumn("organization_country_gdb3", \
            f.when(f.col("row_org") == 3, f.col("organization_country_gdb")).otherwise(f.lit(None))) \
       .withColumn("organization_continent_name1", \
            f.when(f.col("row_org") == 1, f.col("organization_continent_name")).otherwise(f.lit(None))) \
       .withColumn("organization_continent_name2", \
            f.when(f.col("row_org") == 2, f.col("organization_continent_name")).otherwise(f.lit(None))) \
       .withColumn("organization_continent_name3", \
            f.when(f.col("row_org") == 3, f.col("organization_continent_name")).otherwise(f.lit(None))) \
       .withColumn("organization_city_region1", \
            f.when(f.col("row_org") == 1, f.col("organization_city_region")).otherwise(f.lit(None))) \
       .withColumn("organization_city_region2", \
            f.when(f.col("row_org") == 2, f.col("organization_city_region")).otherwise(f.lit(None))) \
       .withColumn("organization_city_region3", \
            f.when(f.col("row_org") == 3, f.col("organization_city_region")).otherwise(f.lit(None))) \
       .withColumn("organization_city_population1", \
            f.when(f.col("row_org") == 1, f.col("organization_city_population")).otherwise(f.lit(None))) \
       .withColumn("organization_city_population2", \
            f.when(f.col("row_org") == 2, f.col("organization_city_population")).otherwise(f.lit(None))) \
       .withColumn("organization_city_population3", \
            f.when(f.col("row_org") == 3, f.col("organization_city_population")).otherwise(f.lit(None))) \
       .withColumn("organization_city_latitude1", \
            f.when(f.col("row_org") == 1, f.col("organization_city_latitude")).otherwise(f.lit(None))) \
       .withColumn("organization_city_latitude2", \
            f.when(f.col("row_org") == 2, f.col("organization_city_latitude")).otherwise(f.lit(None))) \
       .withColumn("organization_city_latitude3", \
            f.when(f.col("row_org") == 3, f.col("organization_city_latitude")).otherwise(f.lit(None))) \
       .withColumn("organization_city_longitude1", \
            f.when(f.col("row_org") == 1, f.col("organization_city_longitude")).otherwise(f.lit(None))) \
       .withColumn("organization_city_longitude2", \
            f.when(f.col("row_org") == 2, f.col("organization_city_longitude")).otherwise(f.lit(None))) \
       .withColumn("organization_city_longitude3", \
            f.when(f.col("row_org") == 3, f.col("organization_city_longitude")).otherwise(f.lit(None))) \
       .drop("organization_name", "organization_city_name", "organization_country_name", "organization_country_parent_name", \
             "organization_parent_name", "organization_region_name", "organization_country_population", \
             "organization_country_gdb", "organization_continent_name", "organization_city_region", \
             "organization_city_population", "organization_city_latitude", "organization_city_longitude", "id", "row_org")      

Group values by laureate_id. If organization 1 column is not null, take values of 1 set of organization columns. If organization 2 column is not null, take values of 2 set of organization columns. Else take values of 3 set of organization columns.

In [42]:
df = df.groupBy("laureate_id").agg(
    f.min('year').alias('year'), \
    f.min('category').alias('category'), \
    f.min('prize').alias('prize'), \
    f.min('prize_share').alias('prize_share'), \
    f.min('laureate_type').alias('laureate_type'), \
    f.min('motivation').alias('motivation'), \
    f.min('laureate_person_name').alias('laureate_person_name'), \
    f.min('gender').alias('gender'), \
    f.min('society_name').alias('society_name'), \
    f.min('birth_date').alias('birth_date'), \
    f.min('birth_city_name').alias('birth_city_name'), \
    f.min('birth_city_region').alias('birth_city_region'), \
    f.min('birth_city_population').alias('birth_city_population'), \
    f.min('birth_city_latitude').alias('birth_city_latitude'), \
    f.min('birth_city_longitude').alias('birth_city_longitude'), \
    f.min('birth_country_name').alias('birth_country_name'), \
    f.min('birth_country_parent_name').alias('birth_country_parent_name'), \
    f.min('birth_region_name').alias('birth_region_name'), \
    f.min('birth_continent_name').alias('birth_continent_name'), \
    f.min('birth_country_population').alias('birth_country_population'), \
    f.min('birth_country_area_sq_miles').alias('birth_country_area_sq_miles'), \
    f.min('birth_country_pop_dencity_per_sq_mile').alias('birth_country_pop_dencity_per_sq_mile'), \
    f.min('birth_country_coastline').alias('birth_country_coastline'), \
    f.min('birth_country_net_migration').alias('birth_country_net_migration'), \
    f.min('birth_country_infant_mortality_per_1000').alias('birth_country_infant_mortality_per_1000'), \
    f.min('birth_country_gdb').alias('birth_country_gdb'), \
    f.min('birth_country_percent_literacy').alias('birth_country_percent_literacy'), \
    f.min('birth_country_phones_per_1000').alias('birth_country_phones_per_1000'), \
    f.min('birth_country_percent_arable').alias('birth_country_percent_arable'), \
    f.min('birth_country_percent_crops').alias('birth_country_percent_crops'), \
    f.min('birth_country_percent_other').alias('birth_country_percent_other'), \
    f.min('birth_country_climate').alias('birth_country_climate'), \
    f.min('birth_country_birthrate').alias('birth_country_birthrate'), \
    f.min('birth_country_deathrate').alias('birth_country_deathrate'), \
    f.min('birth_country_agriculture').alias('birth_country_agriculture'), \
    f.min('birth_country_industry').alias('birth_country_industry'), \
    f.min('birth_country_service').alias('birth_country_service'), \
    f.min('death_date').alias('death_date'), \
    f.min('death_city_name').alias('death_city_name'), \
    f.min('death_city_region').alias('death_city_region'), \
    f.min('death_city_population').alias('death_city_population'), \
    f.min('death_city_latitude').alias('death_city_latitude'), \
    f.min('death_city_longitude').alias('death_city_longitude'), \
    f.min('death_country_name').alias('death_country_name'), \
    f.min('death_country_parent_name').alias('death_country_parent_name'), \
    f.min('death_region_name').alias('death_region_name'), \
    f.min('death_continent_name').alias('death_continent_name'), \
    f.min('death_country_population').alias('death_country_population'), \
    f.min('death_country_gdb').alias('death_country_gdb'), \
    f.min('death_country_coastline').alias('death_country_coastline'), \
    f.min('death_country_climate').alias('death_country_climate'), \
    f.min('organization_name1').alias('organization_name1'), \
    f.min('organization_parent_name1').alias('organization_parent_name1'), \
    f.min('organization_city_name1').alias('organization_city_name1'), \
    f.min('organization_city_region1').alias('organization_city_region1'), \
    f.min('organization_city_population1').alias('organization_city_population1'), \
    f.min('organization_city_latitude1').alias('organization_city_latitude1'), \
    f.min('organization_city_longitude1').alias('organization_city_longitude1'), \
    f.min('organization_country_name1').alias('organization_country_name1'), \
    f.min('organization_country_parent_name1').alias('organization_country_parent_name1'), \
    f.min('organization_region_name1').alias('organization_region_name1'), \
    f.min('organization_continent_name1').alias('organization_continent_name1'), \
    f.min('organization_country_population1').alias('organization_country_population1'), \
    f.min('organization_country_gdb1').alias('organization_country_gdb1'), \
    f.min('organization_name2').alias('organization_name2'), \
    f.min('organization_parent_name2').alias('organization_parent_name2'), \
    f.min('organization_city_name2').alias('organization_city_name2'), \
    f.min('organization_city_region2').alias('organization_city_region2'), \
    f.min('organization_city_population2').alias('organization_city_population2'), \
    f.min('organization_city_latitude2').alias('organization_city_latitude2'), \
    f.min('organization_city_longitude2').alias('organization_city_longitude2'), \
    f.min('organization_country_name2').alias('organization_country_name2'), \
    f.min('organization_country_parent_name2').alias('organization_country_parent_name2'), \
    f.min('organization_region_name2').alias('organization_region_name2'), \
    f.min('organization_continent_name2').alias('organization_continent_name2'), \
    f.min('organization_country_population2').alias('organization_country_population2'), \
    f.min('organization_country_gdb2').alias('organization_country_gdb2'), \
    f.min('organization_name3').alias('organization_name3'), \
    f.min('organization_parent_name3').alias('organization_parent_name3'), \
    f.min('organization_city_name3').alias('organization_city_name3'), \
    f.min('organization_city_region3').alias('organization_city_region3'), \
    f.min('organization_city_population3').alias('organization_city_population3'), \
    f.min('organization_city_latitude3').alias('organization_city_latitude3'), \
    f.min('organization_city_longitude3').alias('organization_city_longitude3'), \
    f.min('organization_country_name3').alias('organization_country_name3'), \
    f.min('organization_country_parent_name3').alias('organization_country_parent_name3'), \
    f.min('organization_region_name3').alias('organization_region_name3'), \
    f.min('organization_continent_name3').alias('organization_continent_name3'), \
    f.min('organization_country_population3').alias('organization_country_population3'), \
    f.min('organization_country_gdb3').alias('organization_country_gdb3'))

In [43]:
# Check Organization columns for a laureate with 3 organizations
df.where("laureate_id = 837").select("laureate_id", "organization_name1", "organization_name2", "organization_name3").show()

+-----------+--------------------+--------------------+--------------------+
|laureate_id|  organization_name1|  organization_name2|  organization_name3|
+-----------+--------------------+--------------------+--------------------+
|        837|Harvard Medical S...|Massachusetts Gen...|Howard Hughes Med...|
+-----------+--------------------+--------------------+--------------------+



In [44]:
# Check the count of records in the dataset
df.count()

904

Dataset is ready.

# Check the dataset

In [45]:
# Structure
df.printSchema()

root
 |-- laureate_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- prize: string (nullable = true)
 |-- prize_share: string (nullable = true)
 |-- laureate_type: string (nullable = true)
 |-- motivation: string (nullable = true)
 |-- laureate_person_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- society_name: string (nullable = true)
 |-- birth_date: string (nullable = true)
 |-- birth_city_name: string (nullable = true)
 |-- birth_city_region: string (nullable = true)
 |-- birth_city_population: double (nullable = true)
 |-- birth_city_latitude: double (nullable = true)
 |-- birth_city_longitude: double (nullable = true)
 |-- birth_country_name: string (nullable = true)
 |-- birth_country_parent_name: string (nullable = true)
 |-- birth_region_name: string (nullable = true)
 |-- birth_continent_name: string (nullable = true)
 |-- birth_country_population: integer (nullable = true)
 |-- birth_cou

In [46]:
df.select("laureate_id", "year", "category", "prize", "laureate_type", "motivation").sample(0.005).show()

+-----------+----+---------+--------------------+-------------+--------------------+
|laureate_id|year| category|               prize|laureate_type|          motivation|
+-----------+----+---------+--------------------+-------------+--------------------+
|        112|1978|  Physics|The Nobel Prize i...|   Individual|for their discove...|
|        688|1976|Economics|The Sveriges Riks...|   Individual|for his achieveme...|
|        758|2002|Chemistry|The Nobel Prize i...|   Individual|for his developme...|
|        896|2013|Economics|The Sveriges Riks...|   Individual|for their empiric...|
|        919|2015|  Physics|The Nobel Prize i...|   Individual|for the discovery...|
+-----------+----+---------+--------------------+-------------+--------------------+



In [47]:
df.select("laureate_id", "laureate_type", "laureate_person_name", "gender", "society_name").sample(0.005).show()

+-----------+-------------+--------------------+------+------------+
|laureate_id|laureate_type|laureate_person_name|gender|society_name|
+-----------+-------------+--------------------+------+------------+
|        120|   Individual|     Kai M. Siegbahn|  Male|        null|
|        203|   Individual|Artturi Ilmari Vi...|  Male|        null|
|        271|   Individual|        Robert Huber|  Male|        null|
|        456|   Individual| Rolf M. Zinkernagel|  Male|        null|
|        538|   Individual|Mohamed Anwar al-...|  Male|        null|
|        562|   Individual|Carlos Filipe Xim...|  Male|        null|
|        811|   Individual|   Mario R. Capecchi|  Male|        null|
+-----------+-------------+--------------------+------+------------+



In [48]:
df.where("laureate_type = 'Individual' and birth_date is not null") \
  .select("laureate_id", "birth_date", "birth_city_name", "birth_city_region", "birth_city_population", \
          "birth_city_latitude", "birth_city_longitude") \
  .sample(0.01).show()

+-----------+----------+----------------+-----------------+---------------------+-------------------+--------------------+
|laureate_id|birth_date| birth_city_name|birth_city_region|birth_city_population|birth_city_latitude|birth_city_longitude|
+-----------+----------+----------------+-----------------+---------------------+-------------------+--------------------+
|         14|1850-06-06|           Fulda|               05|              63851.0|              50.55|            9.666667|
|        105|1931-03-22|    Brooklyn, NY|               NY|                 null|              40.65|              -73.95|
|        160|1852-08-30|       Rotterdam|               11|             603851.0|          51.916667|                 4.5|
|        179|1877-09-02|      Eastbourne|               E2|             112906.0|               50.8|                0.25|
|        237|1918-09-08|       Gravesend|               G5|              54264.0| 51.433333000000005|            0.366667|
|        352|189

In [49]:
df.where("laureate_type = 'Individual' and birth_country_name is not null") \
  .select("laureate_id", "birth_country_name", "birth_country_parent_name", "birth_region_name", "birth_continent_name") \
  .sample(0.01).show()

+-----------+--------------------+-------------------------+--------------------+--------------------+
|laureate_id|  birth_country_name|birth_country_parent_name|   birth_region_name|birth_continent_name|
+-----------+--------------------+-------------------------+--------------------+--------------------+
|        214|      United Kingdom|           United Kingdom|WESTERN EUROPE   ...|              EUROPE|
|        227|      United Kingdom|           United Kingdom|WESTERN EUROPE   ...|              EUROPE|
|        265|              Taiwan|                   Taiwan|ASIA (EX. NEAR EA...|                ASIA|
|        295|Faroe Islands (De...|                  Denmark|WESTERN EUROPE   ...|              EUROPE|
|        296|              Russia|                   Russia|C.W. OF IND. STATES |              EUROPE|
|        421|United States of ...|     United States of ...|NORTHERN AMERICA ...|       NORTH AMERICA|
|        427|      United Kingdom|           United Kingdom|WESTERN EUROP

In [50]:
df.where("laureate_type = 'Individual' and death_date is not null") \
  .select("laureate_id", "death_date", "death_city_name", "death_city_region", "death_city_population", \
          "death_city_latitude", "death_city_longitude") \
  .sample(0.01).show()

+-----------+----------+-------------------+-----------------+---------------------+-------------------+--------------------+
|laureate_id|death_date|    death_city_name|death_city_region|death_city_population|death_city_latitude|death_city_longitude|
+-----------+----------+-------------------+-----------------+---------------------+-------------------+--------------------+
|         44|1958-02-01|Charlottesville, VA|               VA|              34703.0| 38.029166700000005|  -78.47694440000001|
|        218|1978-12-11|   White Plains, NY|               NY|              57260.0| 41.033888899999994|         -73.7633333|
|        244|2007-07-23|             Munich|               02|            1246133.0|              48.15|             11.5833|
|        363|1992-04-08|               Rome|               07|            2643736.0|               41.9|           12.483333|
|        405|2008-10-07|        Del Mar, CA|               CA|                 null| 32.959444399999995| -117.26444440

In [51]:
df.where("laureate_type = 'Individual' and death_country_name is not null") \
  .select("laureate_id", "death_country_name", "death_country_parent_name", "death_region_name", "death_continent_name") \
  .sample(0.01).show()

+-----------+--------------------+-------------------------+--------------------+--------------------+
|laureate_id|  death_country_name|death_country_parent_name|   death_region_name|death_continent_name|
+-----------+--------------------+-------------------------+--------------------+--------------------+
|          1|             Germany|                  Germany|WESTERN EUROPE   ...|              EUROPE|
|         41|      United Kingdom|           United Kingdom|WESTERN EUROPE   ...|              EUROPE|
|        137|             Germany|                  Germany|WESTERN EUROPE   ...|              EUROPE|
|        201|West Germany (Ger...|                  Germany|WESTERN EUROPE   ...|              EUROPE|
|        208|              Sweden|                   Sweden|WESTERN EUROPE   ...|              EUROPE|
|        442|United States of ...|     United States of ...|NORTHERN AMERICA ...|       NORTH AMERICA|
|        519|        South Africa|             South Africa|SUB-SAHARAN A

In [52]:
df.where("laureate_type = 'Individual' and organization_name2 is not null") \
  .select("laureate_id", "organization_name1", "organization_region_name1", "organization_name2", \
          "organization_name2", "organization_city_population2", "organization_name3", "organization_continent_name3") \
  .sample(0.15).show()

+-----------+--------------------+-------------------------+--------------------+--------------------+-----------------------------+------------------+----------------------------+
|laureate_id|  organization_name1|organization_region_name1|  organization_name2|  organization_name2|organization_city_population2|organization_name3|organization_continent_name3|
+-----------+--------------------+-------------------------+--------------------+--------------------+-----------------------------+------------------+----------------------------+
|        189|University of Hei...|     WESTERN EUROPE   ...|I.G. Farbenindust...|I.G. Farbenindust...|                     144703.0|              null|                        null|
|        216|University of Fre...|     WESTERN EUROPE   ...|Staatliches Insti...|Staatliches Insti...|                         null|              null|                        null|
|        789|NHMRC Helicobacte...|     OCEANIA          ...|University of Wes...|University of 

In [None]:
# Save to Hive
# df.write.format("parquet").mode("overwrite").saveAsTable("whdb.dataset")

In [53]:
sp.stop()