# Analysis of gun data (.xlsx)

In [7]:

import pandas as pd
import numpy as np

<a id='wrangling'></a>
## Data Wrangling 

In [8]:
# Read the gun-data Excel workbook into a DataFrame.
gun_data = pd.read_excel(io='../dataset/gun_data.xlsx')

# Show (number of rows, number of columns) as the cell output.
gun_data.shape

(12485, 27)

### Data Cleaning (gun_data)

In [9]:
# Print each column's name, non-null count and dtype to spot incomplete columns.
gun_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12485 entries, 0 to 12484
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   month                      12485 non-null  object 
 1   state                      12485 non-null  object 
 2   permit                     12461 non-null  float64
 3   permit_recheck             1100 non-null   float64
 4   handgun                    12465 non-null  float64
 5   long_gun                   12466 non-null  float64
 6   other                      5500 non-null   float64
 7   multiple                   12485 non-null  int64  
 8   admin                      12462 non-null  float64
 9   prepawn_handgun            10542 non-null  float64
 10  prepawn_long_gun           10540 non-null  float64
 11  prepawn_other              5115 non-null   float64
 12  redemption_handgun         10545 non-null  float64
 13  redemption_long_gun        10544 non-null  flo

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
gun_data.describe()

Unnamed: 0,permit,permit_recheck,handgun,long_gun,other,multiple,admin,prepawn_handgun,prepawn_long_gun,prepawn_other,...,returned_other,rentals_handgun,rentals_long_gun,private_sale_handgun,private_sale_long_gun,private_sale_other,return_to_seller_handgun,return_to_seller_long_gun,return_to_seller_other,totals
count,12461.0,1100.0,12465.0,12466.0,5500.0,12485.0,12462.0,10542.0,10540.0,5115.0,...,1815.0,990.0,825.0,2750.0,2750.0,2750.0,2475.0,2750.0,2255.0,12485.0
mean,6413.629404,1165.956364,5940.881107,7810.847585,360.471636,268.603364,58.89809,4.828021,7.834156,0.165591,...,1.027548,0.076768,0.087273,14.936,11.602909,1.030182,0.40202,0.441818,0.105987,21595.725911
std,23752.338269,9224.200609,8618.58406,9309.84614,1349.478273,783.185073,604.814818,10.907756,16.468028,1.057105,...,4.386296,0.634503,0.671649,71.216021,54.25309,4.467843,1.446568,1.528223,0.427363,32591.418387
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,865.0,2078.25,17.0,15.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4638.0
50%,518.0,0.0,3059.0,5122.0,121.0,125.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12399.0
75%,4272.0,0.0,7280.0,10380.75,354.0,301.0,0.0,5.0,8.0,0.0,...,0.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,25453.0
max,522188.0,116681.0,107224.0,108058.0,77929.0,38907.0,28083.0,164.0,269.0,49.0,...,64.0,12.0,12.0,1017.0,777.0,71.0,28.0,17.0,4.0,541978.0


In [11]:
# Report, for every column, the percentage of rows whose value is missing.

for column_name in gun_data.columns:
    # The mean of the boolean null-mask is the fraction of missing entries.
    missing_fraction = gun_data[column_name].isna().mean()
    print(f'{column_name} - {round(missing_fraction * 100)}%')

month - 0%
state - 0%
permit - 0%
permit_recheck - 91%
handgun - 0%
long_gun - 0%
other - 56%
multiple - 0%
admin - 0%
prepawn_handgun - 16%
prepawn_long_gun - 16%
prepawn_other - 59%
redemption_handgun - 16%
redemption_long_gun - 16%
redemption_other - 59%
returned_handgun - 82%
returned_long_gun - 83%
returned_other - 85%
rentals_handgun - 92%
rentals_long_gun - 93%
private_sale_handgun - 78%
private_sale_long_gun - 78%
private_sale_other - 78%
return_to_seller_handgun - 80%
return_to_seller_long_gun - 78%
return_to_seller_other - 82%
totals - 0%


In [16]:
# Keep only month, state and totals by dropping every other column FIRST.
# Calling dropna() on the full 27-column frame beforehand would delete every
# row that has a null in any of the sparsely populated columns (several are
# more than 90% missing), discarding most of the data even though the three
# retained columns contain no missing values.
unused_columns = [
    'permit', 'permit_recheck', 'handgun',
    'long_gun', 'other', 'multiple', 'admin', 'prepawn_handgun',
    'prepawn_long_gun', 'prepawn_other', 'redemption_handgun',
    'redemption_long_gun', 'redemption_other', 'returned_handgun',
    'returned_long_gun', 'returned_other', 'rentals_handgun',
    'rentals_long_gun', 'private_sale_handgun',
    'private_sale_long_gun', 'private_sale_other',
    'return_to_seller_handgun', 'return_to_seller_long_gun',
    'return_to_seller_other',
]
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
gun_data = gun_data.drop(columns=unused_columns, errors='ignore')

# Now remove any rows that still contain nulls in the retained columns.
# Reassigning (instead of inplace=True) keeps the data lineage explicit.
gun_data = gun_data.dropna()

In [17]:
# The data should now be clean. Confirm that no column still contains
# missing values by re-printing the per-column missing percentage.
for column_name in gun_data.columns:
    missing_fraction = gun_data[column_name].isna().mean()
    print(f'{column_name} - {round(missing_fraction * 100)}%')

month - 0%
state - 0%
totals - 0%
