In [None]:
 import sklearn
import numpy as np
import os
np.random.seed(42)
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

CRIME_PATH = os.path.join("drive", "MyDrive")

In [None]:
import pandas as pd
def load_crime_data(crime_path=CRIME_PATH):
    """Load the crime incident data from ``Crime_Data.csv`` into a DataFrame.

    Parameters
    ----------
    crime_path : str, optional
        Directory that contains ``Crime_Data.csv``. Defaults to ``CRIME_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.

    Raises
    ------
    FileNotFoundError
        If ``Crime_Data.csv`` does not exist inside ``crime_path``.
    """
    # Kept from the original behaviour: make sure the data directory exists.
    os.makedirs(crime_path, exist_ok=True)
    # Build the path from the argument (not the module-level constant) so a
    # caller-supplied directory is actually honoured.
    csv_path = os.path.join(crime_path, "Crime_Data.csv")
    return pd.read_csv(csv_path)

# Read the CSV from the default location into the working frame.
crime = load_crime_data()
# Preview the first 10 rows (head() shows 5 when no count is given).
crime.head(n=10)

Unnamed: 0,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported,ReportingOfficer
0,1,Vandalism,202180000147,300.0,15TH ST NW,CPD,2021/10/05 00:05:05+00,2005,"Pendleton, Eric"
1,2,Suspicious Activity,202100028143,400.0,ALTAMONT ST,CPD,2021/10/04 21:38:10+00,1738,"Green, Bryan"
2,3,Larceny - All Other,202100028138,400.0,"4TH ST NW, 224",CPD,2021/10/04 19:57:38+00,1557,"Arreguin, Arron"
3,4,Larceny - Of Veh Parts/Access,202100028134,,GARRETT ST / 6TH ST SE,CPD,2021/10/04 19:44:49+00,1544,"Via, Ryan"
4,5,Larceny - Theft from Building,202100028128,200.0,"14TH ST NW, 1",CPD,2021/10/04 19:25:05+00,1525,"Chan, William"
5,6,Hit and Run,202100028123,800.0,W MAIN ST,CPD,2021/10/04 18:53:25+00,1453,"Tennyson, James"
6,7,Vandalism,202180000146,2100.0,TARLETON DR,CPD,2021/10/04 18:22:42+00,1422,"Moje, Ashley"
7,8,Accident - w/out Injuries,202100028116,,WATSON AVE @ 250 BYPASS,CPD,2021/10/04 18:10:48+00,1410,"Burchardt, Rudy"
8,9,Suspicious Activity,202100028113,1600.0,CEDAR HILL RD,CPD,2021/10/04 17:44:28+00,1344,"Hamill, Annmarie"
9,10,Fraud-impersonation,202100028118,300.0,MEADE AVE,CPD,2021/10/04 17:44:17+00,1344,"Turner, Devon"


Categorical attributes are:
Offense
StreetName
Agency

Columns to drop:
RecordID
IncidentID
DateReported (after extracting Year, Month and day of week from it)
Day
Agency
ReportingOfficer

In [None]:
import datetime

# DateReported is text such as "2021/10/05 00:05:05+00"; pull the calendar
# fields out by character position.
crime["Year"] = crime["DateReported"].str.slice(stop=4)
crime["Month"] = crime["DateReported"].str.slice(5, 7)
crime["Day"] = crime["DateReported"].str.slice(8, 10)
# Re-join the pieces as "YYYY/MM/DD" and parse them; at this point "DOW"
# holds the parsed date, not yet a weekday number.
crime["DOW"] = pd.to_datetime(
    crime["Year"].str.cat([crime["Month"], crime["Day"]], sep="/")
)

In [None]:
from pandas import Timestamp
# Replace each parsed date with its ISO weekday number (Monday=1 ... Sunday=7).
# Not re-runnable as-is: afterwards "DOW" holds integers, not Timestamps.
crime["DOW"] = crime["DOW"].map(Timestamp.isoweekday)
crime["DOW"]

0        2
1        1
2        1
3        1
4        1
        ..
25657    5
25658    5
25659    5
25660    5
25661    5
Name: DOW, Length: 25662, dtype: int64

In [None]:
# Remove the columns that are not used as features, in a single call.
crime = crime.drop(
    columns=[
        "RecordID",
        "IncidentID",
        "DateReported",
        "Day",
        "Agency",
        "ReportingOfficer",
    ]
)
crime

Unnamed: 0,Offense,BlockNumber,StreetName,HourReported,Year,Month,DOW
0,Vandalism,300.0,15TH ST NW,2005,2021,10,2
1,Suspicious Activity,400.0,ALTAMONT ST,1738,2021,10,1
2,Larceny - All Other,400.0,"4TH ST NW, 224",1557,2021,10,1
3,Larceny - Of Veh Parts/Access,,GARRETT ST / 6TH ST SE,1544,2021,10,1
4,Larceny - Theft from Building,200.0,"14TH ST NW, 1",1525,2021,10,1
...,...,...,...,...,...,...,...
25657,Assault Simple,100.0,"LANKFORD AVE, 3",720,2016,10,5
25658,Assist Citizen - Mental/TDO/ECO,2500.0,WILLARD DR,659,2016,10,5
25659,Assault Simple,100.0,"14TH ST NW, B",608,2016,10,5
25660,Assault Simple,1500.0,UNIVERSITY AVE,531,2016,10,5


In [None]:
# HourReported arrives as an HHMM-style integer (e.g. 2005, 1738 in the preview above).
# Scale it to 0.HHMM, round to two decimals, then scale back up:
# 1738 -> 0.1738 -> 0.17 -> 17.
# NOTE(review): the rounding is decimal, so a value only rounds up to the next hour
# from roughly HH50 onwards, and late-evening values become 24 (see the describe()
# max below) -- confirm this is the intended target definition.
# NOTE: the cell overwrites its own input, so running it twice rescales the column again.
crime['HourReported']=crime['HourReported'].div(10000).round(2).div(1/100)

In [None]:
# Summary statistics for the numeric columns. Year and Month were sliced out of a
# string column, so they are text and do not appear here.
crime.describe()

Unnamed: 0,BlockNumber,HourReported,DOW
count,24610.0,25662.0,25662.0
mean,854.896384,13.190203,3.940418
std,613.29875,6.138556,1.979907
min,0.0,0.0,1.0
25%,300.0,9.0,2.0
50%,700.0,14.0,4.0
75%,1200.0,18.0,6.0
max,9100.0,24.0,7.0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_set, test_set = train_test_split(
    crime,
    random_state=42,
    test_size=0.2,
)

In [None]:
# (rows, columns) of the train split, then of the test split, one per line.
print(train_set.shape, test_set.shape, sep="\n")

(20529, 7)
(5133, 7)


In [None]:
# Target: HourReported. Copy it out of each split first ...
crime_labels = train_set["HourReported"].copy()
test_labels = test_set["HourReported"].copy()
# ... then keep everything else as features. From here on `crime` refers to the
# training features only, no longer to the full data set.
crime = train_set.drop(columns="HourReported")
crime_test = test_set.drop(columns="HourReported")

In [None]:
# Training rows that have a missing value in at least one column.
sample_incomplete_rows = crime.loc[crime.isna().any(axis="columns")]
sample_incomplete_rows

Unnamed: 0,Offense,BlockNumber,StreetName,Year,Month,DOW
4949,Harassment,,ELLIEWOOD AVE,2020,08,7
3106,Assault Intimidation,,BARRACKS RD / EMMET ST N,2021,02,3
6717,Drug/Narcotics Violation,,GARRETT ST / GLEASON ST,2020,03,2
3208,Lost/FoundProperty,,14TH ST NW / W MAIN ST,2021,02,7
4879,Vandalism,,CHANCELLOR ST,2020,09,6
...,...,...,...,...,...,...
161,Lost/FoundProperty,,8TH ST NW @ ANDERSON ST,2021,09,5
21870,Larceny - Theft from Building,,MORTON DR,2017,05,5
8666,Hit and Run,,5TH ST SW / 5TH STREET STATION PKWY,2019,10,4
769,Assault Simple,,PROSPECT AVE,2021,08,4


In [None]:
from sklearn.impute import SimpleImputer
# Fill missing entries with each column's most frequent value; unlike mean/median,
# this strategy also works for the text columns.
imputer = SimpleImputer(strategy="most_frequent")

In [None]:
# Learn the per-column fill values from the training features only.
imputer.fit(crime)

SimpleImputer(strategy='most_frequent')

In [None]:
# The fill value learned for each column, in column order.
imputer.statistics_

array(['Assault Simple', 100.0, 'E MARKET ST', '2017', '10', 5],
      dtype=object)

In [None]:
# Apply the learned fill values; the result is a plain array without column labels.
X = imputer.transform(crime)
print(X)

[['Larceny - Of Veh Parts/Access' 600.0 'LEXINGTON AVE' '2021' '07' 4]
 ['Hit and Run' 600.0 'RAINIER RD' '2016' '10' 4]
 ['Vandalism' 500.0 '7 1/2 ST SW' '2017' '06' 4]
 ...
 ['Assault Simple' 700.0 'PROSPECT AVE, E' '2021' '08' 5]
 ['Assist Citizen - Mental/TDO/ECO' 700.0 'RIDGE ST' '2018' '05' 4]
 ['Vandalism' 700.0 'PROSPECT AVE, C' '2017' '01' 3]]


In [None]:
# Put the imputed array back into a DataFrame carrying the original row index
# and column names, then continue with a copy of it as the working frame.
crime_tr = pd.DataFrame(data=X, index=crime.index, columns=crime.columns)
crime = crime_tr.copy()

In [None]:
# Look up the previously incomplete rows again to confirm the gaps were filled.
crime_tr.loc[sample_incomplete_rows.index.values]

Unnamed: 0,Offense,BlockNumber,StreetName,Year,Month,DOW
4949,Harassment,100,ELLIEWOOD AVE,2020,08,7
3106,Assault Intimidation,100,BARRACKS RD / EMMET ST N,2021,02,3
6717,Drug/Narcotics Violation,100,GARRETT ST / GLEASON ST,2020,03,2
3208,Lost/FoundProperty,100,14TH ST NW / W MAIN ST,2021,02,7
4879,Vandalism,100,CHANCELLOR ST,2020,09,6
...,...,...,...,...,...,...
161,Lost/FoundProperty,100,8TH ST NW @ ANDERSON ST,2021,09,5
21870,Larceny - Theft from Building,100,MORTON DR,2017,05,5
8666,Hit and Run,100,5TH ST SW / 5TH STREET STATION PKWY,2019,10,4
769,Assault Simple,100,PROSPECT AVE,2021,08,4


In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Keep Offense as a one-column frame (the encoder expects 2-D input).
crime_catO = crime.loc[:, ["Offense"]]
ordinal_encoderO = OrdinalEncoder()
crime_catO_encoded = ordinal_encoderO.fit_transform(crime_catO)
# Number of distinct codes, i.e. distinct offense labels in the training data.
np.unique(crime_catO_encoded).size

119

In [None]:
# The offense labels the encoder learned, one array per encoded column.
ordinal_encoderO.categories_

In [None]:
# Same ordinal encoding for StreetName; show the first ten codes.
crime_catS = crime.loc[:, ["StreetName"]]
ordinal_encoderS = OrdinalEncoder()
crime_catS_encoded = ordinal_encoderS.fit_transform(crime_catS)
crime_catS_encoded[:10]

array([[1616.],
       [2167.],
       [ 409.],
       [1292.],
       [ 227.],
       [2247.],
       [2008.],
       [1751.],
       [  47.],
       [1178.]])

In [None]:
# Distinct StreetName values seen in training.
# NOTE(review): the first entries shown are coordinate-like strings ('-78.46...'),
# which suggests some rows carry a longitude in this column -- worth cleaning upstream.
ordinal_encoderS.categories_[0]

array(['-78.4659036420000000', '-78.4693344960000000',
       '-78.4748563960000000', ..., 'YELLOWSTONE DR, 202-140',
       'YORKTOWN DR', 'ZAN RD'], dtype=object)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One independent one-hot encoder per categorical column.
catO_encoder = OneHotEncoder()
catS_encoder = OneHotEncoder()
crime_catO_1hot = catO_encoder.fit_transform(crime_catO)
crime_catS_1hot = catS_encoder.fit_transform(crime_catS)


In [None]:
# Expand the encoded Offense matrix into a dense array for inspection only.
crime_catO_1hot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [None]:
# Dense view of the StreetName encoding for inspection only; with thousands of
# street categories this allocates a large, mostly-zero array.
crime_catS_1hot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Dense one-hot encoding of Offense. scikit-learn 1.2 renamed the `sparse`
# argument to `sparse_output` and 1.4 removed the old name, so try the new
# spelling first and fall back for older releases.
try:
    cat_encoderO = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only understands `sparse`
    cat_encoderO = OneHotEncoder(sparse=False)
housing_catO_1hot = cat_encoderO.fit_transform(crime_catO)
housing_catO_1hot

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [None]:
# The fitted category labels live in `categories_`; `categories` (no trailing
# underscore) is only the constructor argument and evaluates to 'auto'.
cat_encoderO.categories_

'auto'

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill gaps with the most frequent value, then standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: same gap filling, then one-hot encode. Categories that
# were not seen during fit are ignored at transform time rather than raising.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Which columns go through which branch.
num_attribs = ["BlockNumber", "Year", "Month", "DOW"]
cat_attribs = ["Offense", "StreetName"]

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Fit the preprocessing on the training features only, then apply the already
# fitted transformer to the test features.
crime_prep = full_pipeline.fit_transform(crime)
crime_test_prep = full_pipeline.transform(crime_test)
crime_prep

<20529x2896 sparse matrix of type '<class 'numpy.float64'>'
	with 123174 stored elements in Compressed Sparse Row format>

In [None]:
# Shape before and after preprocessing, one per line.
print(crime.shape, crime_prep.shape, sep="\n")

(20529, 6)
(20529, 2896)


In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the prepared training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X=crime_prep, y=crime_labels)

LinearRegression()

In [None]:
# Import only the metric that is used instead of `import *`, so it stays clear
# where the name comes from and the notebook namespace is not flooded.
from sklearn.metrics import mean_squared_error

# Score the linear model on the data it was trained on (an optimistic figure).
crime_predictions = lin_reg.predict(crime_prep)
print(crime_predictions)
lin_mse = mean_squared_error(crime_labels, crime_predictions)
lin_rmse = np.sqrt(lin_mse)
# Dividing by 24 expresses the RMSE (in hours) as a fraction of a full day.
lin_rmse/24 #rmse when predicting with x train

[11.76432747 12.10173994 13.14154007 ... 13.32188835 14.01639029
 12.99791235]


0.23246921427294068

In [None]:
# Sanity check: one prediction per training row.
print(len(crime_predictions))

20529


In [None]:
# Print the first 20000 training predictions (NumPy abbreviates the middle).
print(crime_predictions[:20000])

[ 1.17643275e+01  1.21017399e+01  1.31415401e+01 ... -1.53614117e-04
  1.57006768e+01  1.41261088e+01]


In [None]:
# Refit the linear model on the training matrix and score it on the held-out test set.
lin_reg = LinearRegression()
lin_reg.fit(X=crime_prep, y=crime_labels)
crime_predictions2 = lin_reg.predict(X=crime_test_prep)
print(crime_predictions2)
lin_mse = mean_squared_error(y_true=test_labels, y_pred=crime_predictions2)
lin_rmse = np.sqrt(lin_mse)
# RMSE in hours, shown as a fraction of a 24-hour day.
lin_rmse / 24  # rmse when predicting on x test

[13.25434531 13.93515612 13.70893313 ... 11.96150367 12.97401568
 13.30602056]


0.2618601572452004

In [None]:
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    """Print a collection of scores followed by its mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` methods (for example a
    NumPy array). Nothing is returned; the three lines go to standard output.
    """
    print("Scores:", scores)
    # Each summary statistic is computed and printed in turn, mean first.
    for label, method_name in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, method_name)())

# 5-fold cross-validation of the linear model on the prepared training matrix.
# The scorer returns negated MSE, so flip the sign before taking the square root.
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=crime_prep,
    y=crime_labels,
    cv=5,
    scoring="neg_mean_squared_error",
)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)


Scores: [6.23540537 6.36577749 6.3411587  6.20048745 6.29815023]
Mean: 6.288195845068263
Standard deviation: 0.062293919690075035


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the fixed random_state makes the fit repeatable.
forest_reg = RandomForestRegressor(random_state=42, n_estimators=100)
forest_reg.fit(X=crime_prep, y=crime_labels)

RandomForestRegressor(random_state=42)

In [None]:
# Score the forest on its own training data (expected to look optimistic).
crime_predict = forest_reg.predict(X=crime_prep)
forest_mse = mean_squared_error(y_true=crime_labels, y_pred=crime_predict)
forest_rmse = np.sqrt(forest_mse)
# RMSE in hours, shown as a fraction of a 24-hour day.
forest_rmse / 24  # rmse using train data

0.09947928061209453

In [None]:
# 3-fold cross-validation of the forest; scores come back as negated MSE.
forest_scores = cross_val_score(
    estimator=forest_reg,
    X=crime_prep,
    y=crime_labels,
    cv=3,
    scoring="neg_mean_squared_error",
)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
# Score the forest on the held-out test set.
crime_predict2 = forest_reg.predict(X=crime_test_prep)
forest_mse2 = mean_squared_error(y_true=test_labels, y_pred=crime_predict2)
forest_rmse2 = np.sqrt(forest_mse2)
# RMSE in hours, shown as a fraction of a 24-hour day.
forest_rmse2 / 24  # rmse with test data

0.2599497559967732

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear regression (despite the name `clf`, this is a regressor),
# solved with the 'sparse_cg' solver.
clf = Ridge(solver='sparse_cg', alpha=1)
clf.fit(X=crime_prep, y=crime_labels)

Ridge(alpha=1, solver='sparse_cg')

In [None]:
# Ridge error on the training data, as a fraction of a 24-hour day.
crime_predict3 = clf.predict(X=crime_prep)
ridge_mse = mean_squared_error(y_true=crime_labels, y_pred=crime_predict3)
ridge_rmse = np.sqrt(ridge_mse)
ridge_rmse / 24

0.23563705041413377

In [None]:
# Ridge error on the held-out test set, as a fraction of a 24-hour day.
crime_predict4 = clf.predict(X=crime_test_prep)
ridge_mse2 = mean_squared_error(y_true=test_labels, y_pred=crime_predict4)
ridge_rmse2 = np.sqrt(ridge_mse2)
ridge_rmse2 / 24

0.2561522942245249

In [None]:
from sklearn.neighbors import RadiusNeighborsRegressor

# Predict from all training points within distance 10 in the prepared feature
# space, then score on the training data.
neigh = RadiusNeighborsRegressor(radius=10)
neigh.fit(X=crime_prep, y=crime_labels)
crime_predict5 = neigh.predict(X=crime_prep)
neigh_mse = mean_squared_error(y_true=crime_labels, y_pred=crime_predict5)
neigh_rmse = np.sqrt(neigh_mse)
# RMSE in hours, shown as a fraction of a 24-hour day.
neigh_rmse / 24

0.2559313248085458

In [None]:
# Radius-neighbours error on the held-out test set, as a fraction of a 24-hour day.
crime_predict6 = neigh.predict(X=crime_test_prep)
neigh_mse2 = mean_squared_error(y_true=test_labels, y_pred=crime_predict6)
neigh_rmse2 = np.sqrt(neigh_mse2)
neigh_rmse2 / 24

0.25512075983607896

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: 2 forest sizes x 4 per-split feature budgets = 8 combinations,
# each evaluated with 3-fold cross-validation on negated MSE.
param_grid = {
    'n_estimators': [100, 300],
    'max_features': [16, 128, 512, 1028],
}

forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(crime_prep, crime_labels)

GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_features': [16, 128, 512, 1028],
                         'n_estimators': [100, 300]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'max_features': 16, 'n_estimators': 300}

In [None]:
# The estimator configured with the winning parameters.
grid_search.best_estimator_

RandomForestRegressor(max_features=16, n_estimators=300, random_state=42)

In [None]:
cvres = grid_search.cv_results_
# Walk the candidates in order, pairing each mean CV score with its parameter dict.
# Scores are negated MSE, so negate before the square root; dividing by 24 shows
# the RMSE as a fraction of a 24-hour day.
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score) / 24, params)

0.2558862481375839 {'max_features': 16, 'n_estimators': 100}
0.25501099205698724 {'max_features': 16, 'n_estimators': 300}
0.25651989133535685 {'max_features': 128, 'n_estimators': 100}
0.2561139704758456 {'max_features': 128, 'n_estimators': 300}
0.2591116452859979 {'max_features': 512, 'n_estimators': 100}
0.2583169543442433 {'max_features': 512, 'n_estimators': 300}
0.2603430228842101 {'max_features': 1028, 'n_estimators': 100}
0.25973260054123265 {'max_features': 1028, 'n_estimators': 300}


In [None]:
# Evaluate the grid-search winner on the held-out test set.
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X=crime_test_prep)
final_mse = mean_squared_error(y_true=test_labels, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)
# RMSE in hours, shown as a fraction of a 24-hour day.
final_rmse / 24

0.25383597463816887

In [None]:
from sklearn.svm import SVR

# Support vector regression with fixed C and epsilon, scored on the test set.
regr = SVR(C=1.0, epsilon=.2)
regr.fit(X=crime_prep, y=crime_labels)
pred6 = regr.predict(X=crime_test_prep)
mse6 = mean_squared_error(y_true=test_labels, y_pred=pred6)
rmse6 = np.sqrt(mse6)
# RMSE in hours, shown as a fraction of a 24-hour day.
rmse6 / 24

0.2516814679888257

In [None]:
from sklearn.model_selection import GridSearchCV

# SVR candidates: 3 values of C x 5 values of epsilon = 15 combinations, each
# evaluated with 3-fold cross-validation on negated MSE.
# Note: this rebinds `param_grid`, replacing the random-forest grid defined earlier.
param_grid = {
    'C': [1, 10, 100],
    'epsilon': [.0001, .001, .01, .1, .3],
}

regr2 = SVR()
grid_search2 = GridSearchCV(
    estimator=regr2,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search2.fit(crime_prep, crime_labels)

GridSearchCV(cv=3, estimator=SVR(),
             param_grid={'C': [1, 10, 100],
                         'epsilon': [0.0001, 0.001, 0.01, 0.1, 0.3]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [None]:
# Error of the best SVR on the training data, as a fraction of a 24-hour day.
final_model2 = grid_search2.best_estimator_
final_predictions2 = final_model2.predict(X=crime_prep)
final_mse2 = mean_squared_error(y_true=crime_labels, y_pred=final_predictions2)
final_rmse2 = np.sqrt(final_mse2)
final_rmse2 / 24

0.24248267204788373

In [None]:
# Error of the best SVR on the held-out test set, as a fraction of a 24-hour day.
# The variable names are shared with the training-set evaluation cell, so they
# now hold the test-set values.
final_model2 = grid_search2.best_estimator_
final_predictions2 = final_model2.predict(X=crime_test_prep)
final_mse2 = mean_squared_error(y_true=test_labels, y_pred=final_predictions2)
final_rmse2 = np.sqrt(final_mse2)
final_rmse2 / 24

0.2516108087793398