In [37]:
import boto3
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

In [3]:
# S3 location of the CSV; the object key is reused as the local file name.
bucketname = 'ahmad-bucket123'
filename = 'clean_data.csv'

s3 = boto3.client('s3')

# Copy the object from the bucket to a local file of the same name.
s3.download_file(Bucket=bucketname, Key=filename, Filename=filename)

In [5]:
# Load the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='clean_data.csv')
df.head(n=5)

Unnamed: 0,Price,Address,Bedrooms,Description,Postcode
0,11251000,"71 Bondway, London, SW8 1SF",5,** THE HEIGHT OF LUXURY IN AN EXCEPTIONAL SETT...,SW8
1,6500000,"16 Wotton Road, Cricklewood, NW2 6PX",5,A unique opportunity to secure this superbly r...,NW2
2,3500000,"Kensington Court Place, London, W8 5BJ",3,We are pleased to offer this superb three bedr...,W8
3,2571000,"71 Bondway, London, SW8 1SF",3,** CITY VIEWS OF THE OVAL CRICKET GROUND AND C...,SW8
4,2495000,"Princes Gate, London, SW7 2QG",3,We are pleased to offer this exceptional 3 bed...,SW7


In [7]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

Price           int64
Address        object
Bedrooms        int64
Description    object
Postcode       object
dtype: object

In [8]:
# Store Postcode as a pandas categorical rather than generic object strings.
df['Postcode'] = df['Postcode'].astype(pd.CategoricalDtype())

# Print the column dtypes to confirm the conversion.
print(df.dtypes)

Price             int64
Address          object
Bedrooms          int64
Description      object
Postcode       category
dtype: object


In [9]:
# Count the rows for each Postcode value, most frequent first.
category_count = df['Postcode'].value_counts(sort=True, ascending=False)

print(category_count)

SW8     12
SW19     5
SW1      5
SW6      5
SE23     5
        ..
SE17     1
SE2      1
SE27     1
SE4      1
WC1      1
Name: Postcode, Length: 95, dtype: int64


In [11]:
# Derive the postcode area by stripping every run of digits from Postcode
# (e.g. 'SW8' -> 'SW').
# The pattern is a raw string and regex=True is passed explicitly: with the
# default regex=False used by pandas >= 2.0 the pattern would be matched
# literally and nothing would be removed; older versions emit a FutureWarning
# when the argument is omitted.
df['Area'] = df['Postcode'].str.replace(r'\d+', '', regex=True)

df.head()

  df['Area'] = df['Postcode'].str.replace('\d+', '')


Unnamed: 0,Price,Address,Bedrooms,Description,Postcode,Area
0,11251000,"71 Bondway, London, SW8 1SF",5,** THE HEIGHT OF LUXURY IN AN EXCEPTIONAL SETT...,SW8,SW
1,6500000,"16 Wotton Road, Cricklewood, NW2 6PX",5,A unique opportunity to secure this superbly r...,NW2,NW
2,3500000,"Kensington Court Place, London, W8 5BJ",3,We are pleased to offer this superb three bedr...,W8,W
3,2571000,"71 Bondway, London, SW8 1SF",3,** CITY VIEWS OF THE OVAL CRICKET GROUND AND C...,SW8,SW
4,2495000,"Princes Gate, London, SW7 2QG",3,We are pleased to offer this exceptional 3 bed...,SW7,SW


In [13]:
# Store Area as a pandas categorical as well.
df['Area'] = df['Area'].astype(pd.CategoricalDtype())

# Print the column dtypes to confirm the conversion.
print(df.dtypes)

Price             int64
Address          object
Bedrooms          int64
Description      object
Postcode       category
Area           category
dtype: object


In [14]:
# Count the rows for each Area value, most frequent first.
area_count = df['Area'].value_counts(sort=True, ascending=False)

print(area_count)

SW    53
N     26
SE    24
NW    21
W     21
E     16
HA    14
TW     8
UB     8
EN     5
PE     1
TN     1
WC     1
Name: Area, dtype: int64


In [15]:
# Keep only the price, bedroom count and area columns.
subset = df.loc[:, ['Price', 'Bedrooms', 'Area']]

In [16]:
# Preview the first rows of the reduced frame.
subset.head()

Unnamed: 0,Price,Bedrooms,Area
0,11251000,5,SW
1,6500000,5,NW
2,3500000,3,W
3,2571000,3,SW
4,2495000,3,SW


In [40]:
# Features: bedroom count and area. Target: price.
X = subset.loc[:, ['Bedrooms', 'Area']]
y = subset.loc[:, 'Price']

In [41]:
# One-hot encode the 'Area' column; the remaining column ('Bedrooms') is
# passed through unchanged.
ct = ColumnTransformer([('encoder', OneHotEncoder(), ['Area'])], remainder='passthrough')
# Encode the raw feature columns taken from `subset` instead of reassigning X
# from itself, so re-running this cell does not fail on an already-encoded X
# that no longer has an 'Area' column.
X = ct.fit_transform(subset[['Bedrooms', 'Area']])

In [42]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [43]:
# Linear-regression alternative, currently disabled:
# lr_model = LinearRegression()

# Random forest with 100 trees and a fixed seed.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [44]:
# Disabled linear-regression fit:
# lr_model.fit(X_train, y_train)

# Fit the random forest on the training split.
rf_regressor.fit(X=X_train, y=y_train)

In [45]:
# Disabled linear-regression scoring:
# print('Training Accuracy:', lr_model.score(X_train, y_train))
# print('Testing Accuracy:', lr_model.score(X_test, y_test))

# Predict prices for the held-out rows.
y_pred = rf_regressor.predict(X=X_test)

In [46]:
# R-squared of the random-forest predictions against the held-out prices.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)

R-squared: -0.966191451612672


In [None]:
# The data was used to train both a linear regression model and a random forest model, and both performed extremely poorly
# (the random forest's R-squared on the test set is negative, i.e. worse than always predicting the mean price).
# This is likely due to the small size of the dataset and the lack of features. To train a more accurate model,
# more information would be needed, such as the number of bathrooms, the square footage of the property, or whether it has a garage.