# Relationship Between Police Calls And Price of M²

### 1. Import Packages

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
from geopandas.tools import sjoin
import re
import shapely
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

from shapely.geometry import Polygon
from shapely.geometry import Point
import sys

OSError: Could not find libspatialindex_c library file

### 2. Graphics

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch subsequent plots to seaborn's default theme.
sns.set()

### 3. Read the Data

In [None]:
# Load the two CSV inputs. If a column named "number" is present it is kept
# as text rather than being parsed as a numeric type.
regions = pd.read_csv(
    "./data/regions.csv",
    sep=",",
    dtype={"number": "str"},
)
calls = pd.read_csv(
    "./data/policecalls.csv",
    sep=",",
    dtype={"number": "str"},
)

### 4. Pre-processing

In [None]:
# Convert the coordinate columns to numbers with one vectorized call per
# column (instead of calling pd.to_numeric once per element through apply).
# errors="coerce" turns any value that cannot be parsed into NaN.
calls["lat"] = pd.to_numeric(calls["lat"], errors="coerce")
calls["lng"] = pd.to_numeric(calls["lng"], errors="coerce")

In [None]:
def _parse_polygon_wkt(wkt_text):
    """Build a shapely Polygon from a ``POLYGON ((x y, x y, ...))`` string.

    Only the single-ring form is handled: the ``POLYGON ((`` prefix and the
    ``))`` suffix are stripped and the remainder is split on ``', '`` into
    whitespace-separated coordinate tuples.
    """
    ring_text = wkt_text.replace('POLYGON ((', '').replace('))', '').strip()
    vertices = [
        tuple(float(value) for value in pair.split())
        for pair in ring_text.split(', ')
    ]
    return Polygon(vertices)


# Replace the whole column in a single assignment. The previous per-row
# `regions["THE_GEOM"][num] = ...` was chained indexing, which pandas may
# apply to a temporary copy, and it relied on the index being 0..n-1.
regions["THE_GEOM"] = regions["THE_GEOM"].map(_parse_polygon_wkt)


In [None]:
# One point per call, built as (x=lng, y=lat); the regions reuse the polygons
# stored in THE_GEOM.
calls["geometry"] = calls.apply(lambda row: Point(row.lng, row.lat), axis=1)
regions["geometry"] = gpd.GeoSeries(regions["THE_GEOM"])

# Tag both layers with the same CRS (EPSG:4326) at construction time. The
# authority-string form replaces the {"init": "epsg:4326"} dict, which pyproj
# has deprecated.
calls_gdf = gpd.GeoDataFrame(calls, crs="EPSG:4326")
regions_gdf = gpd.GeoDataFrame(regions, crs="EPSG:4326")

# Keep only calls whose geometry is valid.
# NOTE(review): presumably this is what discards points built from NaN
# coordinates left by the numeric coercion — confirm.
calls_gdf = calls_gdf[calls_gdf.is_valid]

### 5. Columns Of Interest

In [None]:
# Regions: keep four columns and expose NOME under the name BAIRRO.
columns_of_interest = ["NOME", "VAL_M2_RES", "AREA", "geometry"]
regions_gdf = regions_gdf[columns_of_interest].rename(columns={"NOME": "BAIRRO"})

# Calls: keep the date and the point geometry, with date renamed to DATA.
columns_of_interest = ["date", "geometry"]
calls_gdf = calls_gdf[columns_of_interest].rename(columns={"date": "DATA"})

### 6. Visualize Data

In [None]:
# Show the first row of the calls layer (columns: DATA, geometry).
calls_gdf.head(1)

In [None]:
# Show the first row of the regions layer (BAIRRO, VAL_M2_RES, AREA, geometry).
regions_gdf.head(1)

### 7. Merge Data Frames

In [None]:
# Attach to every call the attributes of the region polygon it intersects;
# how="inner" drops calls that fall in no region.
# "intersects" is sjoin's default test, so it is left implicit: the keyword
# that selects it was renamed from `op` to `predicate` between geopandas
# releases, and omitting it works with either.
merged_gdf = gpd.sjoin(calls_gdf, regions_gdf, how="inner")

### 8. Visualize Merged Data Frame

In [None]:
# Show the first rows of the call/region join result.
merged_gdf.head()

### 9. Group Data Frame

In [None]:
# One row per BAIRRO: number of joined calls (count of geometries) plus the
# mean of VAL_M2_RES and AREA over that neighbourhood's rows.
grouped_gdf = merged_gdf.groupby(by=["BAIRRO"]).agg(
    {
        "geometry": "count",
        "VAL_M2_RES": "mean",
        "AREA": "mean",
    }
)

In [None]:
# The geometry count is the number of calls; the other two columns keep
# their names.
grouped_gdf = grouped_gdf.rename(columns={"geometry": "NUM_CALLS"})

In [None]:
# Show the first rows of the per-BAIRRO aggregate.
grouped_gdf.head()

In [None]:
# Call density: calls per unit of AREA for each neighbourhood.
call_counts = grouped_gdf["NUM_CALLS"]
grouped_gdf["CALLS_BY_M2"] = call_counts / grouped_gdf["AREA"]

In [None]:
# Show the first rows again, now including CALLS_BY_M2.
grouped_gdf.head()

### 10. Visualize the new dataset

In [None]:
# Scatter of NUM_CALLS (x) against VAL_M2_RES (y), one point per BAIRRO.
# The title names the column that is actually plotted on the x axis; it
# previously read "Calls By M2" although NUM_CALLS is what is drawn.
grouped_gdf.plot(
    kind="scatter",
    x="NUM_CALLS",
    y="VAL_M2_RES",
    title="Num Calls x VAL M2",
)
plt.show()

In [None]:
# Feature and target as (n, 1) column arrays, the 2-D layout used by the
# scikit-learn calls that follow.
X = grouped_gdf["NUM_CALLS"].values.reshape(-1, 1)
y = grouped_gdf["VAL_M2_RES"].values.reshape(-1, 1)

In [None]:
# Hold out 20% of the neighbourhoods; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Unfitted linear regression estimator with default settings.
lr_model = linear_model.LinearRegression()

In [None]:
# Fit the linear model on the training split only (X_train -> y_train).
lr_model.fit(X_train, y_train)

In [None]:
# Predictions on the same rows the model was fitted on (in-sample).
y_train_pred = lr_model.predict(X_train)

In [None]:
# In-sample mean squared error of the linear model; the bare name on the
# last line makes the notebook display the value.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_train

In [None]:
# In-sample R² of the linear model; the bare name displays the value.
r2_train = r2_score(y_train, y_train_pred)
r2_train

In [None]:
# Report the training MSE rounded to two decimals.
print("Mean Squared Error = %.2f" % mse_train)

In [None]:
# Report the training R² (the value from r2_score) rounded to two decimals,
# under the label "Variance score".
print("Variance score = %.2f" % r2_train)

In [None]:
# Training points in green with the fitted model's predictions drawn over
# them as a blue line.
plt.scatter(X_train, y_train, color = "green")
plt.plot(X_train, y_train_pred, color = "blue", linewidth = 1)
plt.title("Fitting a linear model to the training set")
plt.xlabel("Num Calls")
plt.ylabel("Val M²")
plt.show()

In [None]:
# Expand the single feature into polynomial terms up to degree 10.
# NOTE(review): this line used to be annotated "of second order, quadratic",
# which does not match the degree passed here — confirm which degree is
# intended.
polyf = PolynomialFeatures(10)
X_train_f = polyf.fit_transform(X_train)

In [None]:
# A second linear regression, fitted on the polynomial-expanded training
# features.
lrf_model = linear_model.LinearRegression()
lrf_model.fit(X_train_f, y_train)

In [None]:
# In-sample predictions of the polynomial-feature model.
y_train_f_pred = lrf_model.predict(X_train_f)

In [None]:
# In-sample mean squared error of the polynomial-feature model; the bare
# name displays the value.
mse_f_train = mean_squared_error(y_train, y_train_f_pred)
mse_f_train

In [None]:
# In-sample R² of the polynomial-feature model; the bare name displays the
# value.
r2_f_train = r2_score(y_train, y_train_f_pred)
r2_f_train

In [None]:
# Training points (green) with the polynomial-feature model's training
# predictions (blue) overlaid as points.
# Column 1 of X_train_f supplies the x coordinate — presumably the original
# NUM_CALLS feature, since PolynomialFeatures puts the bias term in column 0
# by default; confirm.
plt.scatter(X_train, y_train, color = "green")
plt.scatter(X_train_f[:,1], y_train_f_pred, color = "blue")
plt.title("Fitting a Linear Model with Polynomial Features to the Training Set")
plt.xlabel("Num Calls")
plt.ylabel("Value M²")
plt.show()