In [1]:
import pandas as pd

In [2]:
#load file with investment return
df_1year = pd.read_csv('data_1year.csv', index_col=0)
df_1year = df_1year.rename(columns={'cluster':'Cluster'})

#load respective features
df_data_1year = pd.read_excel('data_1year_features.xlsx', index_col=0)
df_data_1year = df_data_1year.drop(['Lat', 'Lon'], axis=1)

#merge datasets
df = pd.merge(df_1year, df_data_1year, on=['Cluster', 'Type', 'Year'])

#drop columns not needed for predictions
df = df.drop(['Cluster', 'Lat', 'Lon'], axis=1)

In [3]:
df.columns

Index(['1y_ret', 'Year', 'Type', 'Parks', 'Number of Primary Schools',
       'Primary School AVG Rating', 'Number of Secondary Schools',
       'Secondary School AVG Rating', 'Number of Post 16 Schools',
       'Post 16 School AVG Rating', 'Services',
       'Number of crimes: Vehicle crime',
       'Number of crimes: Anti-social behaviour',
       'Number of crimes: Violent crime', 'Number of crimes: Other crime',
       'Number of crimes: Burglary', 'Number of crimes: Robbery',
       'Number of crimes: Other theft',
       'Number of crimes: Criminal damage and arson',
       'Number of crimes: Public disorder and weapons',
       'Number of crimes: Drugs', 'Number of crimes: Shoplifting',
       'Number of crimes: Crime Type N/A', 'Number of crimes: Bicycle theft',
       'Number of crimes: Violence and sexual offences',
       'Number of crimes: Public order',
       'Number of crimes: Theft from the person',
       'Number of crimes: Possession of weapons'],
      dtype='object'

In [4]:
#show first rows
df.head()

Unnamed: 0,1y_ret,Year,Type,Parks,Number of Primary Schools,Primary School AVG Rating,Number of Secondary Schools,Secondary School AVG Rating,Number of Post 16 Schools,Post 16 School AVG Rating,...,Number of crimes: Criminal damage and arson,Number of crimes: Public disorder and weapons,Number of crimes: Drugs,Number of crimes: Shoplifting,Number of crimes: Crime Type N/A,Number of crimes: Bicycle theft,Number of crimes: Violence and sexual offences,Number of crimes: Public order,Number of crimes: Theft from the person,Number of crimes: Possession of weapons
0,0.215738,2011,D,64,0,0.0,7,2.0,4,2.0,...,0,0,0,0,0,0,0,0,0,0
1,0.308454,2012,D,64,0,0.0,7,2.0,4,2.0,...,0,0,1,0,0,0,0,0,0,0
2,-0.167827,2013,D,64,0,0.0,7,2.0,4,2.0,...,9,0,4,2,41,0,0,0,0,0
3,-0.19687,2014,D,64,0,0.0,7,2.0,4,2.0,...,9,0,2,0,0,3,3,0,0,0
4,0.033064,2015,D,64,0,0.0,7,2.0,4,2.0,...,2,0,0,0,0,7,8,4,0,0


In [5]:
#one hot encode home type
one_hot = pd.get_dummies(df['Type'])
df = df.drop('Type',axis = 1)
df = df.join(one_hot) 

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
#set X and y
X = df.drop(['1y_ret'], axis=1)
y = df['1y_ret']

In [8]:
#split the data (default test size is 0.25)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
#confirm sizes
print(X_train.shape, X_test.shape)

(981, 30) (328, 30)


In [10]:
#pip install xgboost

In [11]:
from xgboost import XGBRegressor

In [12]:
model = XGBRegressor()
model.fit(X_train, y_train);

In [13]:
y_pred = model.predict(X_test)

In [14]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [15]:
mse_val = mean_squared_error(y_test, y_pred)
mse_val

0.06486490506555308

In [16]:
mae_val = mean_absolute_error(y_test, y_pred)
mae_val

0.18162577519248715

In [17]:
model.feature_importances_

array([0.01471973, 0.02773369, 0.03325384, 0.02824517, 0.02682373,
       0.03909057, 0.01960839, 0.04988816, 0.04040037, 0.03439098,
       0.0341868 , 0.03018505, 0.03736728, 0.02985262, 0.06459614,
       0.03956095, 0.0357185 , 0.01343672, 0.03994908, 0.04764042,
       0.02212172, 0.02961393, 0.03708958, 0.0432229 , 0.06130215,
       0.03010286, 0.02127795, 0.0309482 , 0.0185388 , 0.01913371],
      dtype=float32)

In [18]:
for col,score in zip(X_train.columns,model.feature_importances_):
    print(col,score)

Year 0.01471973
Parks 0.027733695
Number of Primary Schools 0.033253837
Primary School AVG Rating 0.02824517
Number of Secondary Schools 0.026823731
Secondary School AVG Rating 0.039090566
Number of Post 16 Schools 0.019608388
Post 16 School AVG Rating 0.04988816
Services 0.04040037
Number of crimes: Vehicle crime 0.034390982
Number of crimes: Anti-social behaviour 0.0341868
Number of crimes: Violent crime 0.03018505
Number of crimes: Other crime 0.03736728
Number of crimes: Burglary 0.029852618
Number of crimes: Robbery 0.06459614
Number of crimes: Other theft 0.03956095
Number of crimes: Criminal damage and arson 0.035718504
Number of crimes: Public disorder and weapons 0.013436721
Number of crimes: Drugs 0.03994908
Number of crimes: Shoplifting 0.04764042
Number of crimes: Crime Type N/A 0.022121724
Number of crimes: Bicycle theft 0.029613934
Number of crimes: Violence and sexual offences 0.03708958
Number of crimes: Public order 0.0432229
Number of crimes: Theft from the person 0.0

In [19]:
from sklearn.dummy import DummyRegressor

In [20]:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
y_pred = dummy_regr.predict(X_test)

In [21]:
mae_val = mean_absolute_error(y_test, y_pred)
mae_val

0.13903655834582032

In [22]:
mse_val = mean_squared_error(y_test, y_pred)
mse_val

0.043469249038032266

In [23]:
#pip install pycaret

In [24]:
#pip install Jinja2

In [25]:
from pycaret.regression import *

In [26]:
df.columns

Index(['1y_ret', 'Year', 'Parks', 'Number of Primary Schools',
       'Primary School AVG Rating', 'Number of Secondary Schools',
       'Secondary School AVG Rating', 'Number of Post 16 Schools',
       'Post 16 School AVG Rating', 'Services',
       'Number of crimes: Vehicle crime',
       'Number of crimes: Anti-social behaviour',
       'Number of crimes: Violent crime', 'Number of crimes: Other crime',
       'Number of crimes: Burglary', 'Number of crimes: Robbery',
       'Number of crimes: Other theft',
       'Number of crimes: Criminal damage and arson',
       'Number of crimes: Public disorder and weapons',
       'Number of crimes: Drugs', 'Number of crimes: Shoplifting',
       'Number of crimes: Crime Type N/A', 'Number of crimes: Bicycle theft',
       'Number of crimes: Violence and sexual offences',
       'Number of crimes: Public order',
       'Number of crimes: Theft from the person',
       'Number of crimes: Possession of weapons', 'D', 'F', 'S', 'T'],
      dt

In [27]:
num_feats = ['Year', 'Parks', 'Number of Primary Schools',
       'Primary School AVG Rating', 'Number of Secondary Schools',
       'Secondary School AVG Rating', 'Number of Post 16 Schools',
       'Post 16 School AVG Rating', 'Services',
       'Number of crimes: Vehicle crime',
       'Number of crimes: Anti-social behaviour',
       'Number of crimes: Violent crime', 'Number of crimes: Other crime',
       'Number of crimes: Burglary', 'Number of crimes: Robbery',
       'Number of crimes: Other theft',
       'Number of crimes: Criminal damage and arson',
       'Number of crimes: Public disorder and weapons',
       'Number of crimes: Drugs', 'Number of crimes: Shoplifting',
       'Number of crimes: Crime Type N/A', 'Number of crimes: Bicycle theft',
       'Number of crimes: Violence and sexual offences',
       'Number of crimes: Public order',
       'Number of crimes: Theft from the person',
       'Number of crimes: Possession of weapons', 'D', 'F', 'S', 'T']
reg = setup(data = df, target = '1y_ret', session_id=123, numeric_features=num_feats) 

Unnamed: 0,Description,Value
0,session_id,123
1,Target,1y_ret
2,Original Data,"(1309, 31)"
3,Missing Values,False
4,Numeric Features,30
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(916, 30)"


In [28]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.1359,0.037,0.1902,-0.0142,0.1445,1.8094,0.018
llar,Lasso Least Angle Regression,0.1359,0.037,0.1902,-0.0142,0.1445,1.8094,0.023
dummy,Dummy Regressor,0.1359,0.037,0.1902,-0.0142,0.1445,1.8094,0.014
en,Elastic Net,0.1359,0.037,0.1902,-0.0148,0.1444,1.8032,0.015
br,Bayesian Ridge,0.1367,0.0373,0.191,-0.0234,0.1441,1.8366,0.025
omp,Orthogonal Matching Pursuit,0.1369,0.0374,0.1912,-0.025,0.1437,1.7851,0.019
ridge,Ridge Regression,0.1389,0.0381,0.1928,-0.042,0.1389,2.2705,0.016
lr,Linear Regression,0.139,0.0381,0.1929,-0.0433,0.1387,2.2863,0.681
ada,AdaBoost Regressor,0.1461,0.0416,0.2015,-0.1394,0.1346,3.0667,0.073
par,Passive Aggressive Regressor,0.1512,0.0427,0.2027,-0.1513,0.1404,5.8174,0.025


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=123,
      selection='cyclic', tol=0.0001, warm_start=False)

In [29]:
from sklearn.neural_network import MLPRegressor

In [30]:
mlp_reg = MLPRegressor(random_state=42, max_iter=500).fit(X_train, y_train)
y_pred  = mlp_reg.predict(X_test)

In [31]:
mae_val = mean_absolute_error(y_test, y_pred)
mae_val

0.3983589424285932

In [32]:
mse_val = mean_squared_error(y_test, y_pred)
mse_val

1.8542439377977589