# Exploratory Analysis of ML models

In [1]:
# import libraries
import os
import pandas as pd
import numpy as np

import seaborn as sns # install seaborn with either pip or conda, used for heat maps
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import xgboost as xgb

# import DataSet
from util.data_util import DataSet

## load data 

Make sure that the file `cleaned_data.zip` is in the `data` directory before starting.

- `ds.load_data()` loads the dataset sequentially. If you specify the `num_years` argument, the function loads only that number of years, starting from 2001. By default, the function downloads all years (this may take a while).

- `ds.load_subset()` loads a selected subset of the years, specified by a list of year strings, e.g. `['2009', '2010', '2011']`.

**NOTE**: Using `load_data` on the entire dataset takes a very long time (30+ minutes). Reading smaller chunks with `load_subset` and then combining them with `pd.concat` may be a better workaround.

In [10]:
# Name of the archive handed to DataSet (the notes above expect it in `data/`).
cleaned_data = "cleaned_data.zip"
ds = DataSet(cleaned_data)

# Load just the 2001 and 2002 files instead of every year.
data = ds.load_subset(["2001", "2002"])

# Preview the first rows of the loaded frame.
data.head()

C:\Users\ajshe\Documents\year4\fall-2021\stat451\CrimeAnalysisML\data\cleaned_data.zip
downloading: data/2001/data_2001.csv ...
downloading: data/2002/data_2002.csv ...
done.
build DataFrame ...


Unnamed: 0,Block,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,X Coordinate,...,Cluster,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle,Type
0,051XX W WASHINGTON BLVD,FIRST DEGREE MURDER,STREET,1,0,1533,15.0,28.0,25.0,1141945.0,...,20,80.14770933112915,-55.05657313177791,91.66643204543084,-32.43681229871984,96.9382389017779,-7.606536311870862,97.23621524947367,2.6963012843505108,11
1,075XX S HALSTED ST,FIRST DEGREE MURDER,VACANT LOT,0,0,621,6.0,17.0,71.0,1172275.0,...,21,79.98471696289465,-55.02361440474977,91.50046316120792,-32.44716214218721,96.78060400674975,-7.659489401105354,97.08322764410416,2.696972071582787,11
2,059XX W WASHINGTON BLVD,FIRST DEGREE MURDER,HOUSE,0,1,1512,15.0,29.0,25.0,1137038.0,...,20,80.1571295059187,-55.07193661712959,91.6795075981564,-32.44921416535911,96.95407875512956,-7.615131393081306,97.25267920931412,2.6963766512346106,11
3,019XX S SAWYER AVE,FIRST DEGREE MURDER,STREET,1,0,1022,10.0,24.0,29.0,1155001.0,...,33,80.1007216974968,-55.02864195899923,91.61381635712198,-32.42199415204273,96.88358083099922,-7.60584102350321,97.18167035152032,2.696338149338245,11
4,006XX N TROY ST,FIRST DEGREE MURDER,APARTMENT,1,0,1221,12.0,27.0,23.0,1155261.0,...,14,80.1327790039156,-55.00860465047042,91.63959530025332,-32.39434255680956,96.90132461847044,-7.572459552084389,97.19675332274292,2.6959814729985023,11


In [11]:
# Second subset: the 2004 and 2005 files, loaded through the same DataSet.
# NOTE(review): the `_68` suffix does not match the years requested here;
# the name is kept because the concat cell below refers to it.
data_68 = ds.load_subset(["2004", "2005"])

# Preview the first rows of this subset.
data_68.head()

C:\Users\ajshe\Documents\year4\fall-2021\stat451\CrimeAnalysisML\data\cleaned_data.zip
downloading: data/2004/data_2004.csv ...
downloading: data/2005/data_2005.csv ...
done.
build DataFrame ...
(1265910, 39)


Unnamed: 0,Block,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,X Coordinate,...,Cluster,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle,Type
0,082XX S JEFFERY BLVD,NON-AGGRAVATED,RESIDENCE - PORCH / HALLWAY,0,0,414,4.0,8.0,46.0,1190954.0,...,11,79.9407389010494,-54.97010865321537,91.44413530796184,-32.40686191489738,96.71576501221536,-7.635141091950615,97.0166716672555,2.69677461204572,7
1,022XX N AVERS AVE,FIRST DEGREE MURDER,AUTO,1,0,2525,25.0,30.0,22.0,1150344.0,...,38,80.16671990490491,-55.00954383943855,91.67262277307832,-32.386465192105675,96.93118789543854,-7.556302463095101,97.22526880258336,2.695791866067707,11
2,079XX S HALSTED ST,FIRST DEGREE MURDER,STREET,1,0,621,6.0,17.0,71.0,1172350.0,...,21,79.97826028096921,-55.027115485606615,91.4951326317882,-32.452215038857226,96.7767628966066,-7.665749767030789,97.079892646401,2.6970394789508227,11
3,055XX S KOLIN AVE,FIRST DEGREE MURDER,HOUSE,1,0,813,8.0,13.0,62.0,1148308.0,...,27,80.05913310008754,-55.08169788727088,91.58737674149756,-32.48400616446668,96.87409201327088,-7.672583103912459,97.17745744195058,2.6970304909673897,11
4,087XX S MANISTEE AVE,FIRST DEGREE MURDER,STREET,0,0,423,4.0,7.0,46.0,1196147.0,...,11,79.92360814209718,-54.95814917433793,91.42449292456295,-32.39974371205459,96.69494960033792,-7.633349258902832,96.99577980056804,2.696773084991499,11


In [12]:
# Stack the two yearly subsets row-wise into a single frame.
dfs = [data, data_68]

# pd.concat keeps each input's own row labels by default, so if the subsets
# are labelled independently (e.g. each starting at 0 -- TODO confirm against
# DataSet.load_subset) the combined frame would contain repeated labels and
# label-based lookups would return several rows. ignore_index=True assigns a
# fresh 0..n-1 index to the result instead.
df = pd.concat(dfs, ignore_index=True)

# (rows, columns) of the combined frame.
df.shape

(1265910, 39)

array([['051XX W WASHINGTON BLVD', 'FIRST DEGREE MURDER', 'STREET', ...,
        '97.23621524947367', '2.6963012843505108', '11'],
       ['075XX S HALSTED ST', 'FIRST DEGREE MURDER', 'VACANT LOT', ...,
        '97.08322764410417', '2.696972071582787', '11'],
       ['059XX W WASHINGTON BLVD', 'FIRST DEGREE MURDER', 'HOUSE', ...,
        '97.25267920931412', '2.6963766512346106', '11'],
       ...,
       ['004XX E 78TH ST', 'FIRST DEGREE MURDER', 'STREET', ...,
        '97.05436558030802', '2.696876431666843', '11'],
       ['079XX S ASHLAND AVE', 'AGGRAVATED - KNIFE / CUTTING INSTRUMENT',
        'VEHICLE NON-COMMERCIAL', ..., '97.09753050672109',
        '2.6971231321239464', '7'],
       ['006XX W DIVISION ST', 'NON-AGGRAVATED', 'CHA APARTMENT', ...,
        '97.14730592654858', '2.6956132344085795', '7']], dtype=object)