In [None]:
# Common imports
import numpy as np
import os

# Seed NumPy's global random number generator so that values drawn through
# np.random are the same on every run of this notebook.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Set the default label sizes used for axis labels and for x/y tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the output directory now; exist_ok=True makes re-running this cell harmless.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (no extension); also echoed in the
        progress message printed before saving.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before the figure is saved.
    fig_extension : str, default "png"
        File extension; the same value is passed to ``savefig`` as ``format``.
    resolution : int, default 300
        Passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [None]:
import pandas as pd
import os

def load_police_data(data_dir="police_data", filename="crimes_2012_to_2017.csv"):
    """Read the police crime records CSV into a DataFrame.

    Parameters
    ----------
    data_dir : str, default "police_data"
        Directory that contains the CSV file.
    filename : str, default "crimes_2012_to_2017.csv"
        Name of the CSV file inside ``data_dir``.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first CSV column used as the row index.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when the file does not exist.
    """
    csv_path = os.path.join(data_dir, filename)
    # index_col=0: the first column of the file becomes the DataFrame index.
    return pd.read_csv(csv_path, index_col=0)

In [None]:
# Load the raw records, then strip the columns that are not wanted for the analysis.
police_data = load_police_data()
# Discard every column whose name starts with "Unnamed" (regex '^Unnamed').
police_data = police_data.loc[:, ~police_data.columns.str.contains('^Unnamed')]
# Drop the unwanted columns in a single call and by keyword: the positional
# axis form `drop(label, 1)` was deprecated in pandas 1.3 and raises a
# TypeError on pandas 2.x. A missing column still raises KeyError, as before.
police_data = police_data.drop(columns=[
    'Location',
    'ID',
    'Case Number',
    'Block',
    'Domestic',
    'Beat',
    'FBI Code',
    'X Coordinate',
    'Y Coordinate',
    'Updated On',
    'IUCR',
])

In [None]:
# Remove the rows whose Longitude is greater than 90. Rows with a missing
# Longitude compare False against the threshold and are therefore kept.
# NOTE(review): longitudes conventionally lie in [-180, 180]; confirm that
# `> 90` (rather than, e.g., `< -90`) is the intended cutoff for this dataset.
longitude_too_large = police_data['Longitude'] > 90

# Index labels of the rows being removed (name kept for any later cell that reads it).
indexNames = police_data.loc[longitude_too_large].index

# Filter with the boolean mask instead of dropping by label in place: rows are
# selected by position, so a duplicated index label cannot remove rows that do
# not match the condition, and re-running the cell gives the same result.
police_data = police_data.loc[~longitude_too_large]

In [None]:
# Remove the rows whose Latitude is below 38. Rows with a missing Latitude
# compare False against the threshold and are therefore kept.
latitude_too_small = police_data['Latitude'] < 38

# Index labels of the rows being removed (name kept for any later cell that reads it).
indexNames2 = police_data.loc[latitude_too_small].index

# Filter with the boolean mask instead of dropping by label in place: rows are
# selected by position, so a duplicated index label cannot remove rows that do
# not match the condition, and re-running the cell gives the same result.
police_data = police_data.loc[~latitude_too_small]

In [None]:
# Preview the first rows of the cleaned DataFrame.
police_data.head()

In [None]:
# Summarise the columns: dtypes and non-null counts.
police_data.info()

In [None]:
# Raise pandas' display limit to 500 rows. This is a session-wide option,
# presumably set so the long value_counts() listings are not truncated.
pd.set_option('display.max_rows', 500)
# Count how often each distinct "Location Description" value occurs.
police_data["Location Description"].value_counts()

In [None]:
# Count how often each distinct "Date" value occurs.
police_data["Date"].value_counts()

In [None]:
# Count how often each distinct "Arrest" value occurs.
police_data["Arrest"].value_counts()

In [None]:
# Count how often each distinct "Primary Type" value occurs.
police_data["Primary Type"].value_counts()

In [None]:
# Count how often each distinct "Description" value occurs.
police_data["Description"].value_counts()

In [None]:
# Count how often each distinct "District" value occurs.
police_data["District"].value_counts()

In [None]:
# Count how often each distinct "Community Area" value occurs.
police_data["Community Area"].value_counts()

In [None]:
# Summary statistics of the DataFrame's columns.
police_data.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Draw 50-bin histograms of the DataFrame's columns on a 20x15 figure,
# save the figure through save_fig, then render it.
police_data.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

In [None]:
# Re-seed NumPy's global random number generator so that the permutation drawn
# by split_train_test (via np.random.permutation) is identical at every run.
np.random.seed(42)

In [None]:
import numpy as np

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a training part and a test part.

    One permutation of the row positions is drawn from NumPy's global random
    state. Its first ``int(len(data) * test_ratio)`` entries select the test
    rows and the remaining entries select the training rows.

    Parameters
    ----------
    data : object supporting ``len()`` and ``.iloc``
        The rows to split (a DataFrame in this notebook).
    test_ratio : float
        Fraction of the rows assigned to the test part.

    Returns
    -------
    tuple
        ``(train, test)``, both selected from ``data`` with ``iloc``.
    """
    n_rows = len(data)
    permuted_positions = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    # Cut the permutation once: everything before n_test is test, the rest is train.
    test_positions, train_positions = np.split(permuted_positions, [n_test])
    return data.iloc[train_positions], data.iloc[test_positions]

In [None]:
# Hold out 20% of the rows with the illustrative random splitter and report both sizes.
train_set, test_set = split_train_test(police_data, 0.2)
print(len(train_set), "train +", len(test_set), "test")

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test parts based on each row's identifier.

    Every value of ``data[id_column]`` is passed to ``test_set_check`` together
    with ``test_ratio``; rows for which it returns true form the test part.
    ``test_set_check`` is looked up when this function is called, so whichever
    definition of that name is current in the notebook is the one used.

    Returns
    -------
    tuple
        ``(train, test)``: the rows not flagged, then the rows flagged.
    """
    def goes_to_test(id_):
        return test_set_check(id_, test_ratio)

    test_mask = data[id_column].apply(goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
import hashlib

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

In [None]:
# Use the row index as the identifier for the hash-based split (20% test).
# NOTE(review): reset_index() names the new column `index` only when the
# DataFrame's index is unnamed; confirm that holds for police_data.
crime_with_id = police_data.reset_index()   # adds an `index` column
train_set, test_set = split_train_test_by_id(crime_with_id, 0.2, "index")

In [None]:
# police_data = pd.DataFrame(police_data)
# police_data = police_data.apply(pd.to_numeric, errors='coerce')
# police_data = police_data.dropna()
# police_data = police_data.dropna(axis=0, subset=['Longitude'])
# police_data = police_data.dropna(axis=0, subset=['Latitude'])

In [None]:
# crime_with_id["id"] = police_data["Longitude"] * 1000 + police_data["Latitude"]
# pd.set_option('display.max_rows', 50)
# crime_with_id["id"]
# police_data["Longitude"]
# police_data["Latitude"]
# train_set, test_set = split_train_test_by_id(crime_with_id, 0.2, "id")

In [None]:
# Preview the first rows of police_data again.
police_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Split with scikit-learn: 20% of the rows go to the test set, and the fixed
# random_state makes the split repeatable. This rebinds train_set and test_set.
train_set, test_set = train_test_split(police_data, test_size=0.2, random_state=42)

In [None]:
# Preview the first rows of the test set.
test_set.head()


In [None]:
# Histogram of the "District" column.
# police_data['District'] = police_data['District'].astype(int)
police_data["District"].hist()
# police_data['Arrest']

In [None]:
# Histogram of the "Ward" column.
police_data["Ward"].hist()

In [None]:
# Histogram of the "Community Area" column.
police_data["Community Area"].hist()

In [None]:
# Scatter-plot every record by its Longitude (x) and Latitude (y), then save the figure.
police_data.plot(kind="scatter", x="Longitude", y="Latitude")
save_fig("bad_visualization_plot")