#### Importing Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [66]:
# pyspark packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, desc

#### Setting Spark Session and Loading Data

In [4]:
spark = SparkSession.builder \
    .appName("MIS548 Project") \
    .config("spark.sql.debug.maxToStringFields", "1000") \
    .getOrCreate()

spark

24/10/09 15:50:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [97]:
ip_data = spark.read.option("delimiter", ",") \
                .option("header", True) \
                .csv("../data/gun-violence-data_01-2013_03-2018.csv")

print(f"Number of records in the data : {ip_data.count()}")
print(f"Number of columns: {len(ip_data.columns)}")

Number of records in the data : 246939
Number of columns: 29


In [8]:
ip_data.printSchema()

root
 |-- incident_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city_or_county: string (nullable = true)
 |-- address: string (nullable = true)
 |-- n_killed: string (nullable = true)
 |-- n_injured: string (nullable = true)
 |-- incident_url: string (nullable = true)
 |-- source_url: string (nullable = true)
 |-- incident_url_fields_missing: string (nullable = true)
 |-- congressional_district: string (nullable = true)
 |-- gun_stolen: string (nullable = true)
 |-- gun_type: string (nullable = true)
 |-- incident_characteristics: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- location_description: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- n_guns_involved: string (nullable = true)
 |-- notes: string (nullable = true)
 |-- participant_age: string (nullable = true)
 |-- participant_age_group: string (nullable = true)
 |-- participant_gender: string (nullable = true)
 |-- part

**TO DO:**

All of the data types of the columns seems to be loaded as `string`, so we may need to convert the data types of our columns while reading the file.

#### Missing Values and Duplicate Data Check

In [68]:
def get_null_counts(df):
    total_rows = df.count()
    
    null_counts = df.select([sum(col(c).isNull().cast('int')).alias(c) for c in df.columns])

    narrow_null_counts = null_counts.selectExpr(
                                    f"'{null_counts.columns[0]}' as column_name",
                                    f"{null_counts.columns[0]} as null_count",
                                    f"({null_counts.columns[0]} / {total_rows} * 100) as null_percentage")

    for c in null_counts.columns[1:]:
        next_col = null_counts.selectExpr(f"'{c}' as column_name", 
                                          f"{c} as null_count",
                                          f"({c} / {total_rows} * 100) as null_percentage")
        narrow_null_counts = narrow_null_counts.union(next_col)
    
    narrow_null_counts = narrow_null_counts.orderBy(desc("null_count"))
    
    return narrow_null_counts

In [70]:
narrow_null_counts = get_null_counts(ip_data)
narrow_null_counts.show(n=29, truncate=False)

                                                                                

+---------------------------+----------+-------------------+
|column_name                |null_count|null_percentage    |
+---------------------------+----------+-------------------+
|participant_relationship   |231691    |93.82519569610308  |
|location_description       |204849    |82.95530475137585  |
|participant_name           |133328    |53.99228149462013  |
|n_guns_involved            |106713    |43.21431608615893  |
|gun_type                   |106690    |43.205002045039464 |
|participant_age            |104130    |42.168308772611866 |
|gun_stolen                 |100965    |40.88661572291132  |
|notes                      |88279     |35.7493146080611   |
|participant_age_group      |55382     |22.42740109905685  |
|state_house_district       |52070     |21.08617917785364  |
|participant_gender         |49778     |20.1580147323833   |
|state_senate_district      |45852     |18.56814840912128  |
|participant_status         |41347     |16.743811224634424 |
|participant_type       

**Inference:**
Almost all the columns are having missing values. Some with signficant amount of missing values such as `participant_relationship`, `location_description`, `participant_name`, `n_guns_involved`, `gun_type`, `participant_age`, `gun_stolen`. From the above mentioned columns we might not use any of them for analysis except for `participant_age`, `n_guns_involved`, `gun_type`. So we might need to drop these columns instead of handling this missing data.

For the rest of the missing data we might need to handle those missing data.

**TO DO:**
1) We might need to drop the rows with missing information for the key columns such as `date`, `city_or_county`, `state` to allow for authenticity of the data. Instead of imputing the data.
2) We might impute for some columns with missing data and mention it clearly.
3) Or we might actually replace NA values with some key word such as `MISSING` or `UNKNOWN` for categorical data and `0` or other numeric values for the numerical data.


In [75]:
def check_duplicates_except(df, column_to_exclude="incident_id"):
    columns_to_check = [col for col in df.columns if col != column_to_exclude]
    
    df_duplicates = df.groupBy(columns_to_check).count().filter("count > 1")
    
    return df_duplicates

In [89]:
ip_data_dup_chk = check_duplicates_except(ip_data)

print(f"Number of Duplicate Rows: {ip_data_dup_chk.count()}")
print(f"Frequency of the Duplicated Row: {ip_data_dup_chk.select('count').first()[0]}")

                                                                                

Number of Duplicate Rows: 1


                                                                                

Frequency of the Duplicated Row: 108


**TO DO:**

The duplicate row is nothing but containing all `NULL` values in all columns.

#### Categorical Columns Analysis

#### Numerical Columns Analysis

#### Date Columns Analysis

In [26]:
pandas_df = pd.read_csv("../data/gun-violence-data_01-2013_03-2018.csv")

In [28]:
pandas_df.head()

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,0::Julian Sims,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,0::Bernard Gillis,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,0::Damien Bell||1::Desmen Noble||2::Herman Sea...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,0::Stacie Philbrook||1::Christopher Ratliffe||...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,0::Danielle Imani Jameison||1::Maurice Eugene ...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0
