# Sample Queries

In [1]:
import pyspark.sql.functions as F
import etl

## Run Demographics Pipeline

In [3]:
# Instantiate the demographics pipeline from the local `etl` module.
dem_etl = etl.DemographicsPipeline()
# Restrict Spark logging to ERROR before the run so that lower-severity
# log lines do not end up in the cell output.
dem_etl.spark.sparkContext.setLogLevel("ERROR")
# Run the pipeline; the result is kept in `dem` because the join cell
# further down reads it (it joins on `state_code`).
dem = dem_etl.run()
# Print the column names and types of the result for reference.
dem.printSchema()



root
 |-- state: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- num_cities: long (nullable = false)
 |-- total_pop: double (nullable = true)
 |-- amind_pop: long (nullable = true)
 |-- asian_pop: long (nullable = true)
 |-- afram_pop: long (nullable = true)
 |-- hispl_pop: long (nullable = true)
 |-- white_pop: long (nullable = true)





## Run Immigration Pipeline

In [4]:
# Instantiate the immigration pipeline from the local `etl` module.
imm_etl = etl.ImmigrationPipeline()
# Restrict Spark logging to ERROR before the run so that lower-severity
# log lines do not end up in the cell output.
imm_etl.spark.sparkContext.setLogLevel("ERROR")
# Run the pipeline; the result is kept in `imm` because the per-state
# aggregation cell further down groups it by `state` and `i94addr`.
imm = imm_etl.run()
# Print the column names and types of the result for reference.
imm.printSchema()



root
 |-- cicid: integer (nullable = true)
 |-- i94yr: integer (nullable = true)
 |-- i94mon: integer (nullable = true)
 |-- i94cit: string (nullable = true)
 |-- i94res: string (nullable = true)
 |-- arrdate: date (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: date (nullable = true)
 |-- dtadfile: date (nullable = true)
 |-- i94bir: integer (nullable = true)
 |-- biryear: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- count: integer (nullable = true)
 |-- dtaddto: date (nullable = true)
 |-- i94visa: string (nullable = true)
 |-- visatype: string (nullable = true)
 |-- admnum: integer (nullable = true)
 |-- state: string (nullable = true)





## Number of visitors per state

In [16]:
# One row per (state, i94addr) pair. `i94visa_count` is the number of
# immigration records in that group whose `i94visa` is not null
# (F.count skips nulls). Rows are sorted ascending by state name.
state_visa_agg = (
    imm.groupBy("state", "i94addr")
    .agg(F.count(F.col("i94visa")).alias("i94visa_count"))
    .sort("state")
)

# Preview the first 20 rows.
state_visa_agg.show(n=20)



+-----------------+-------+-------------+
|            state|i94addr|i94visa_count|
+-----------------+-------+-------------+
|          ALABAMA|     AL|        56299|
|           ALASKA|     AK|        58125|
|          ARIZONA|     AZ|       181216|
|         ARKANSAS|     AR|        25277|
|  All Other Codes|     99|          769|
|       CALIFORNIA|     CA|      4565383|
|         COLORADO|     CO|       220811|
|      CONNECTICUT|     CT|       125888|
|         DELAWARE|     DE|        27365|
|DIST. OF COLUMBIA|     DC|       253725|
|          FLORIDA|     FL|      6522473|
|          GEORGIA|     GA|       304366|
|             GUAM|     GU|       296626|
|           HAWAII|     HI|      1759009|
|            IDAHO|     ID|        20889|
|         ILLINOIS|     IL|       739628|
|          INDIANA|     IN|        87127|
|             IOWA|     IA|        30544|
|           KANSAS|     KS|        26259|
|         KENTUCKY|     KY|        42094|
+-----------------+-------+-------



## Join demographics and visa entries per state

In [17]:
# Attach the per-state visa counts to the demographics table.
# Inner join: only states present on both sides survive. The join key from
# the visa side (`i94addr`) duplicates `state_code`, so it is dropped after
# the join; the visa side's `state` column is left out up front so that it
# does not collide with the demographics `state` column.
pop_agg = (
    dem.join(
        state_visa_agg.select("i94addr", "i94visa_count"),
        on=F.col("state_code") == F.col("i94addr"),
        how="inner",
    )
    .drop("i94addr")
)

# Show the first 20 rows without truncating long cell values.
pop_agg.show(n=20, truncate=False)



+--------------------+----------+----------+----------+---------+---------+---------+---------+---------+-------------+
|state               |state_code|num_cities|total_pop |amind_pop|asian_pop|afram_pop|hispl_pop|white_pop|i94visa_count|
+--------------------+----------+----------+----------+---------+---------+---------+---------+---------+-------------+
|Arizona             |AZ        |16        |4499542.0 |129708   |229183   |296222   |1508157  |3591611  |181216       |
|South Carolina      |SC        |5         |533657.0  |3705     |13355    |175064   |29863    |343764   |62319        |
|Louisiana           |LA        |8         |1300595.0 |8263     |38739    |602377   |87133    |654578   |160390       |
|Minnesota           |MN        |11        |1422403.0 |25242    |151544   |216731   |103229   |1050239  |112664       |
|New Jersey          |NJ        |12        |1428908.0 |11350    |116844   |452202   |600437   |615083   |710944       |
|District of Columbia|DC        |1      

