In [15]:
## Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.functions import col, avg, max, min, countDistinct, sum, round
import geopandas as gpd

## Set up PySpark environment
from pyspark.sql import SparkSession

In [2]:
## Build (or reuse) the notebook's SparkSession
# "PySpark project" is the application name handed to appName().
# getOrCreate() returns the already-running session when one exists,
# otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("PySpark project")
    .getOrCreate()
)

## Display the session object to confirm it was created
spark

25/04/18 10:23:41 WARN Utils: Your hostname, Zipcoders-MacBook-Pro-5.local resolves to a loopback address: 127.0.0.1; using 192.168.3.155 instead (on interface en0)
25/04/18 10:23:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/18 10:23:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Look at Population by State

In [7]:
# Load the ZIP-code demographics CSV into a Spark DataFrame.
# header=True takes the column names from the first row; inferSchema=True
# asks Spark to infer each column's type from the data.
try:
    zip_df = spark.read.csv('../data/zip_code_demographics.csv', header=True, inferSchema=True)
except Exception as e:
    print(f'Error loading data: {e}')
    # Re-raise so execution stops at this cell. Swallowing the error would
    # leave `zip_df` undefined (or stale from an earlier run) and every
    # later cell would fail, or silently use old data, for a hidden reason.
    raise
else:
    print('Data loaded successfully')

Data loaded successfully


In [9]:
# Preview the data: print the first 20 rows, truncating long cell values.
# These are show()'s default settings, written out explicitly.
zip_df.show(n=20, truncate=True)

+----+--------+---------+---------------+--------+-------------+----------+-------+-----------+------+------------------+-------------------+--------------------+----------------+------------------+---------------------+-------------------+-----------------+
| zip|     lat|      lng|           city|state_id|   state_name|population|density|county_name|po_box|      dist_highway|dist2_large_airport|dist2_medium_airport|   dist_to_shore|number_of_business|adjusted_gross_income|total_income_amount|number_of_returns|
+----+--------+---------+---------------+--------+-------------+----------+-------+-----------+------+------------------+-------------------+--------------------+----------------+------------------+---------------------+-------------------+-----------------+
|1001|42.06262|-72.62521|         Agawam|      MA|Massachusetts|     16088|  550.1|    Hampden|     0|  1.38703489958776|   106.145765467407|    12.9462123500772|93.0492507357679|               438|               598807|   

In [10]:
# Print zip_df's schema as a tree: each column's name, type and nullability.
zip_df.printSchema()

root
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- city: string (nullable = true)
 |-- state_id: string (nullable = true)
 |-- state_name: string (nullable = true)
 |-- population: integer (nullable = true)
 |-- density: double (nullable = true)
 |-- county_name: string (nullable = true)
 |-- po_box: integer (nullable = true)
 |-- dist_highway: double (nullable = true)
 |-- dist2_large_airport: double (nullable = true)
 |-- dist2_medium_airport: double (nullable = true)
 |-- dist_to_shore: double (nullable = true)
 |-- number_of_business: integer (nullable = true)
 |-- adjusted_gross_income: integer (nullable = true)
 |-- total_income_amount: double (nullable = true)
 |-- number_of_returns: integer (nullable = true)



In [20]:
# Total population per state: add up `population` over the rows of zip_df
# that share the same (state_id, state_name) pair.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), not Python's builtin sum.
zip_pop_df = (
    zip_df
    .select('state_id', 'state_name', 'population')
    .groupBy('state_id', 'state_name')
    .agg(sum('population').alias("total_population"))
)

# Print the aggregated rows (no explicit ordering is applied).
zip_pop_df.show()

+--------+--------------------+----------------+
|state_id|          state_name|total_population|
+--------+--------------------+----------------+
|      MT|             Montana|         1511394|
|      NC|      North Carolina|        12419438|
|      MD|            Maryland|         7224056|
|      CO|            Colorado|         6617159|
|      CT|         Connecticut|         3819343|
|      IL|            Illinois|        13819331|
|      WY|             Wyoming|         1182453|
|      NJ|          New Jersey|         9823423|
|      DE|            Delaware|         1372445|
|      DC|District of Columbia|         1207283|
|      TN|           Tennessee|         7388286|
|      LA|           Louisiana|         5596850|
|      AR|            Arkansas|         3672387|
|      CA|          California|        44183935|
|      NM|          New Mexico|         3150812|
|      UT|                Utah|         3783545|
|      MI|            Michigan|        10768003|
|      NY|          

In [16]:
# Convert the per-state totals from a Spark DataFrame to a pandas DataFrame.
# toPandas() collects every row to the driver.
zip_pop_pd = zip_pop_df.toPandas()