#### Importing Libraries

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# pyspark packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, desc

#### Setting Spark Session and Loading Data

In [6]:
spark = SparkSession.builder \
    .appName("MIS548 Project") \
    .config("spark.sql.debug.maxToStringFields", "1000") \
    .getOrCreate()

spark

24/10/12 18:03:51 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
ip_data = spark.read.option("delimiter", ",") \
                .option("header", True) \
                .csv("../data/gun-violence-data_01-2013_03-2018.csv")

print(f"Number of records in the data : {ip_data.count()}")
print(f"Number of columns: {len(ip_data.columns)}")

Number of records in the data : 246939
Number of columns: 29


In [8]:
ip_data.printSchema()

root
 |-- incident_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city_or_county: string (nullable = true)
 |-- address: string (nullable = true)
 |-- n_killed: string (nullable = true)
 |-- n_injured: string (nullable = true)
 |-- incident_url: string (nullable = true)
 |-- source_url: string (nullable = true)
 |-- incident_url_fields_missing: string (nullable = true)
 |-- congressional_district: string (nullable = true)
 |-- gun_stolen: string (nullable = true)
 |-- gun_type: string (nullable = true)
 |-- incident_characteristics: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- location_description: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- n_guns_involved: string (nullable = true)
 |-- notes: string (nullable = true)
 |-- participant_age: string (nullable = true)
 |-- participant_age_group: string (nullable = true)
 |-- participant_gender: string (nullable = true)
 |-- part

**TO DO:**

All of the data types of the columns seems to be loaded as `string`, so we may need to convert the data types of our columns while reading the file.

#### Missing Values and Duplicate Data Check

In [11]:
def get_null_counts(df):
    total_rows = df.count()
    
    null_counts = df.select([sum(col(c).isNull().cast('int')).alias(c) for c in df.columns])

    narrow_null_counts = null_counts.selectExpr(
                                    f"'{null_counts.columns[0]}' as column_name",
                                    f"{null_counts.columns[0]} as null_count",
                                    f"({null_counts.columns[0]} / {total_rows} * 100) as null_percentage")

    for c in null_counts.columns[1:]:
        next_col = null_counts.selectExpr(f"'{c}' as column_name", 
                                          f"{c} as null_count",
                                          f"({c} / {total_rows} * 100) as null_percentage")
        narrow_null_counts = narrow_null_counts.union(next_col)
    
    narrow_null_counts = narrow_null_counts.orderBy(desc("null_count"))
    
    return narrow_null_counts

In [12]:
narrow_null_counts = get_null_counts(ip_data)
narrow_null_counts.show(n=29, truncate=False)

                                                                                

+---------------------------+----------+-------------------+
|column_name                |null_count|null_percentage    |
+---------------------------+----------+-------------------+
|participant_relationship   |231691    |93.82519569610308  |
|location_description       |204849    |82.95530475137585  |
|participant_name           |133328    |53.99228149462013  |
|n_guns_involved            |106713    |43.21431608615893  |
|gun_type                   |106690    |43.205002045039464 |
|participant_age            |104130    |42.168308772611866 |
|gun_stolen                 |100965    |40.88661572291132  |
|notes                      |88279     |35.7493146080611   |
|participant_age_group      |55382     |22.42740109905685  |
|state_house_district       |52070     |21.08617917785364  |
|participant_gender         |49778     |20.1580147323833   |
|state_senate_district      |45852     |18.56814840912128  |
|participant_status         |41347     |16.743811224634424 |
|participant_type       

**Inference:**
Almost all the columns are having missing values. Some with signficant amount of missing values such as `participant_relationship`, `location_description`, `participant_name`, `n_guns_involved`, `gun_type`, `participant_age`, `gun_stolen`. From the above mentioned columns we might not use any of them for analysis except for `participant_age`, `n_guns_involved`, `gun_type`. So we might need to drop these columns instead of handling this missing data.

For the rest of the missing data we might need to handle those missing data.

**TO DO:**
1) We might need to drop the rows with missing information for the key columns such as `date`, `city_or_county`, `state` to allow for authenticity of the data. Instead of imputing the data.
2) We might impute for some columns with missing data and mention it clearly.
3) Or we might actually replace NA values with some key word such as `MISSING` or `UNKNOWN` for categorical data and `0` or other numeric values for the numerical data.


In [14]:
def check_duplicates_except(df, column_to_exclude="incident_id"):
    columns_to_check = [col for col in df.columns if col != column_to_exclude]
    
    df_duplicates = df.groupBy(columns_to_check).count().filter("count > 1")
    
    return df_duplicates

In [15]:
ip_data_dup_chk = check_duplicates_except(ip_data)

print(f"Number of Duplicate Rows: {ip_data_dup_chk.count()}")
print(f"Frequency of the Duplicated Row: {ip_data_dup_chk.select('count').first()[0]}")

                                                                                

Number of Duplicate Rows: 1


                                                                                

Frequency of the Duplicated Row: 108


**TO DO:**

The duplicate row is nothing but containing all `NULL` values in all columns.

#### Categorical Columns Analysis

By having an initial look at the data we can identify what are the categorical data columns and columns with string values we are having.

And we won't be analyzing few columns as they do not add much insights and we might drop those columns too.
The columns which we won't be using are:
`incident_id`, `incident_url`, `source_url`, `incident_url_fields_missing`, `location_description`, `participant_name`, `participant_relationship`, `sources`.

In [18]:
str_cols = ["state", "city_or_county", "address", "congressional_district", "gun_stolen",
            "gun_type", "incident_characteristics", "participant_age", "participant_age_group", 
            "participant_gender", "participant_status", "participant_type", "state_house_district", 
            "state_senate_district"]

In [19]:
def get_unq_val_cnts(df):
    for col in df.columns:
        unq_val = df.select(col).distinct().count()
        print(f"Total unique values in the column {col} : {unq_val}")

def get_most_freq(df):
    for col in df.columns:
        print(f"Top 5 frequent values for {col}:")
        df.groupBy(col).count().orderBy('count', ascending=False).show(5)

In [20]:
cat_ip_data = ip_data.select(*str_cols)
get_unq_val_cnts(cat_ip_data)

                                                                                

Total unique values in the column state : 1534


                                                                                

Total unique values in the column city_or_county : 13090
Total unique values in the column address : 198561
Total unique values in the column congressional_district : 222
Total unique values in the column gun_stolen : 428


                                                                                

Total unique values in the column gun_type : 2521


                                                                                

Total unique values in the column incident_characteristics : 18132
Total unique values in the column participant_age : 18602


                                                                                

Total unique values in the column participant_age_group : 1018
Total unique values in the column participant_gender : 932
Total unique values in the column participant_status : 2164
Total unique values in the column participant_type : 328
Total unique values in the column state_house_district : 431
Total unique values in the column state_senate_district : 159


In [21]:
get_most_freq(cat_ip_data)

Top 5 frequent values for state:
+----------+-----+
|     state|count|
+----------+-----+
|  Illinois|17556|
|California|16306|
|   Florida|15029|
|     Texas|13577|
|      Ohio|10244|
+----------+-----+
only showing top 5 rows

Top 5 frequent values for city_or_county:


                                                                                

+--------------+-----+
|city_or_county|count|
+--------------+-----+
|       Chicago|10814|
|     Baltimore| 3943|
|    Washington| 3279|
|   New Orleans| 3071|
|  Philadelphia| 2963|
+--------------+-----+
only showing top 5 rows

Top 5 frequent values for address:


                                                                                

+--------------------+-----+
|             address|count|
+--------------------+-----+
|                NULL|17717|
|             0::Male| 2594|
|    0::Male||1::Male| 1253|
|0::Male||1::Male|...|  326|
|  0::Female||1::Male|  295|
+--------------------+-----+
only showing top 5 rows

Top 5 frequent values for congressional_district:
+----------------------+-----+
|congressional_district|count|
+----------------------+-----+
|                     1|36975|
|                     2|27055|
|                     3|20711|
|                     7|19763|
|                     4|18563|
+----------------------+-----+
only showing top 5 rows

Top 5 frequent values for gun_stolen:
+--------------------+------+
|          gun_stolen| count|
+--------------------+------+
|          0::Unknown|121310|
|                NULL|100965|
|0::Unknown||1::Un...|  6116|
|           0::Stolen|  4503|
|0::Unknown||1::Un...|  1484|
+--------------------+------+
only showing top 5 rows

Top 5 frequent values for g

                                                                                

+---------------+------+
|participant_age| count|
+---------------+------+
|           NULL|104130|
|          0::24|  3712|
|          0::23|  3626|
|          0::19|  3625|
|          0::22|  3623|
+---------------+------+
only showing top 5 rows

Top 5 frequent values for participant_age_group:


                                                                                

+---------------------+-----+
|participant_age_group|count|
+---------------------+-----+
|         0::Adult 18+|91790|
|                 NULL|55382|
| 0::Adult 18+||1::...|47625|
| 0::Adult 18+||1::...|13451|
|        0::Teen 12-17| 7182|
+---------------------+-----+
only showing top 5 rows

Top 5 frequent values for participant_gender:


                                                                                

+--------------------+-----+
|  participant_gender|count|
+--------------------+-----+
|             0::Male|90650|
|                NULL|49778|
|    0::Male||1::Male|42116|
|0::Male||1::Male|...|11998|
|  0::Female||1::Male|10265|
+--------------------+-----+
only showing top 5 rows

Top 5 frequent values for participant_status:
+--------------------+-----+
|  participant_status|count|
+--------------------+-----+
|                NULL|41347|
|          0::Injured|41248|
|0::Unharmed, Arre...|24874|
|           0::Killed|20577|
|0::Injured||1::Un...|12297|
+--------------------+-----+
only showing top 5 rows

Top 5 frequent values for participant_type:
+--------------------+-----+
|    participant_type|count|
+--------------------+-----+
|           0::Victim|57114|
|0::Victim||1::Sub...|48986|
|  0::Subject-Suspect|43326|
|                NULL|38597|
|0::Victim||1::Sub...|10622|
+--------------------+-----+
only showing top 5 rows

Top 5 frequent values for state_house_district:
+---

#### Numerical Columns Analysis

In [23]:
num_cols = ["n_killed", "n_injured", "n_guns_involved"]

In [24]:
num_ip_data = ip_data.select(*num_cols)
get_unq_val_cnts(num_ip_data)

                                                                                

Total unique values in the column n_killed : 3488
Total unique values in the column n_injured : 236
Total unique values in the column n_guns_involved : 107


In [25]:
get_most_freq(num_ip_data)

Top 5 frequent values for n_killed:
+--------+------+
|n_killed| count|
+--------+------+
|       0|185835|
|       1| 48436|
|       2|  4604|
|    NULL|  3714|
|       3|   595|
+--------+------+
only showing top 5 rows

Top 5 frequent values for n_injured:
+---------+------+
|n_injured| count|
+---------+------+
|        0|142487|
|        1| 81986|
|        2| 11484|
|     NULL|  6142|
|        3|  2513|
+---------+------+
only showing top 5 rows

Top 5 frequent values for n_guns_involved:
+---------------+------+
|n_guns_involved| count|
+---------------+------+
|              1|127548|
|           NULL|106713|
|              2|  7477|
|              3|  2021|
|              4|   871|
+---------------+------+
only showing top 5 rows



#### Date Columns Analysis

In [27]:
date_cols = ["date"]

In [28]:
date_ip_data = ip_data.select(*date_cols)
get_unq_val_cnts(date_ip_data)

Total unique values in the column date : 8612


In [29]:
get_most_freq(date_ip_data)

Top 5 frequent values for date:
+----------+-----+
|      date|count|
+----------+-----+
|2017-01-01|  342|
|2017-07-04|  248|
|2017-05-28|  242|
|2018-01-01|  242|
|      NULL|  236|
+----------+-----+
only showing top 5 rows



#### Text Columns Analysis

In [31]:
txt_cols = ["notes"]

In [32]:
data = pd.read_csv("../data/gun-violence-data_01-2013_03-2018.csv")

In [33]:
data['address'].value_counts()

address
2375 International Pkwy                160
6000 N Terminal Pkwy                   141
Main Street                            131
3400 E Sky Harbor Blvd                 127
8500 Pe√±a Blvd                          99
                                      ... 
1300 block of Dean Street                1
Bardell and Vine avenues                 1
4235 North Armenia Blvd                  1
8700 block of South Sangamon Street      1
434 Skowhegan Rd                         1
Name: count, Length: 198037, dtype: int64