In [13]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
# pyspark packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

In [44]:
spark = SparkSession.builder \
    .appName("MIS548 Project") \
    .config("spark.sql.debug.maxToStringFields", "1000") \
    .getOrCreate()

spark

In [46]:
ip_data = spark.read.option("delimiter", ",") \
                .option("header", True) \
                .csv("./data/gun-violence-data_01-2013_03-2018.csv")

In [48]:
ip_data.printSchema()

root
 |-- incident_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city_or_county: string (nullable = true)
 |-- address: string (nullable = true)
 |-- n_killed: string (nullable = true)
 |-- n_injured: string (nullable = true)
 |-- incident_url: string (nullable = true)
 |-- source_url: string (nullable = true)
 |-- incident_url_fields_missing: string (nullable = true)
 |-- congressional_district: string (nullable = true)
 |-- gun_stolen: string (nullable = true)
 |-- gun_type: string (nullable = true)
 |-- incident_characteristics: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- location_description: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- n_guns_involved: string (nullable = true)
 |-- notes: string (nullable = true)
 |-- participant_age: string (nullable = true)
 |-- participant_age_group: string (nullable = true)
 |-- participant_gender: string (nullable = true)
 |-- part

In [52]:
null_counts = ip_data.select([sum(col(c).isNull().cast('int')).alias(c) for c in ip_data.columns])

In [62]:
null_counts.show(truncate=False)

+-----------+----+-----+--------------+-------+--------+---------+------------+----------+---------------------------+----------------------+----------+--------+------------------------+--------+--------------------+---------+---------------+-----+---------------+---------------------+------------------+----------------+------------------------+------------------+----------------+-------+--------------------+---------------------+
|incident_id|date|state|city_or_county|address|n_killed|n_injured|incident_url|source_url|incident_url_fields_missing|congressional_district|gun_stolen|gun_type|incident_characteristics|latitude|location_description|longitude|n_guns_involved|notes|participant_age|participant_age_group|participant_gender|participant_name|participant_relationship|participant_status|participant_type|sources|state_house_district|state_senate_district|
+-----------+----+-----+--------------+-------+--------+---------+------------+----------+---------------------------+------------

In [102]:
print(f"Number of records in the data : {ip_data.count()}")
print(f"Number of columns: {len(ip_data.columns)}")

Number of records in the data : 246939
Number of columns: 29


In [90]:
narrow_null_counts = null_counts.selectExpr(f"'{null_counts.columns[0]}' as column_name",
                                            f"{null_counts.columns[0]} as null_count")

In [92]:
for c in null_counts.columns[1:]:
    next_col = null_counts.selectExpr(f"'{c}' as column_name", f"{c} as null_count")
    narrow_null_counts = narrow_null_counts.union(next_col)

In [104]:
narrow_null_counts.show(n=29, truncate=False)

[Stage 417:>                                                        (0 + 8) / 8]

+---------------------------+----------+
|column_name                |null_count|
+---------------------------+----------+
|incident_id                |0         |
|date                       |236       |
|state                      |2552      |
|city_or_county             |1274      |
|address                    |17717     |
|n_killed                   |3714      |
|n_injured                  |6142      |
|incident_url               |935       |
|source_url                 |1292      |
|incident_url_fields_missing|369       |
|congressional_district     |13125     |
|gun_stolen                 |100965    |
|gun_type                   |106690    |
|incident_characteristics   |7584      |
|latitude                   |15184     |
|location_description       |204849    |
|longitude                  |15185     |
|n_guns_involved            |106713    |
|notes                      |88279     |
|participant_age            |104130    |
|participant_age_group      |55382     |
|participant_gen

                                                                                