## Data preprocessing and analysis PART I

### Libraries:

In [93]:
import matplotlib.pyplot as plt
import pandas as pd
import calendar
from scipy import stats
import numpy as np
import math
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import calplot
import json
import geopandas as gpd
import statistics
from statsmodels.api import OLS, add_constant
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, NumeralTickFormatter, Legend, LegendItem
from bokeh.palettes import Category10
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColorBar, ColumnDataSource, LinearColorMapper, BasicTicker, PrintfTickFormatter
from bokeh.transform import transform
from bokeh.layouts import row, column
from bokeh.palettes import Greens 

### Loading housing data:
https://data.cambridgeshireinsight.org.uk/dataset/house-sales-and-prices

In [49]:
df1 = pd.read_csv('../data/ons-existing-homes-count-corrected_0.csv').sort_values('Area')
df2 = pd.read_csv('../data/ons-existing-homes-median-price-corrected_0_0.csv').sort_values('Area')
df3 = pd.read_csv('../data/ons-new-build-count-corrected_0.csv').sort_values('Area')
df4 = pd.read_csv('../data/ons-new-build-median-price-corrected_0.csv').sort_values('Area')

In [50]:
df2

Unnamed: 0,ONS code,Area,Q4-1995,Q1-1996,Q2-1996,Q3-1996,Q4-1996,Q1-1997,Q2-1997,Q3-1997,...,Q3-2013,Q4-2013,Q1-2014,Q2-2014,Q3-2014,Q4-2014,Q1-2015,Q2-2015,Q3-2015,Q4-2015
1,E07000008,Cambridge,69500,69950,70000,72000,72500,73000,75000,78500,...,290000,295000,295000,304075,315000,327350,337000,350000,370000,380000
2,E07000009,East Cambridgeshire,56500,56500,57000,57000,57500,58000,58500,59000,...,196011,195750,197000,200000,204050,215000,220000,226000,229950,240000
3,E07000010,Fenland,40000,40000,40000,40000,40000,40000,41500,43000,...,127000,130000,135000,137000,139995,140000,141000,145000,147500,150000
6,E07000201,Forest Heath,49000,49000,48000,47975,48000,49000,50000,52500,...,149999,150000,151000,155000,160000,164950,168500,170000,175000,179500
4,E07000011,Huntingdonshire,54000,54000,54000,54625,54875,55000,56000,57000,...,177000,178000,179995,184500,190000,195000,198000,200000,210000,215000
0,E06000031,Peterborough,40500,40000,40000,40000,41000,41500,42000,43000,...,125000,125000,129250,132750,135100,138000,140000,143500,145000,148000
5,E07000012,South Cambridgeshire,69950,70000,70000,73125,74000,74000,75500,77500,...,250000,250000,250000,250000,259875,266500,275000,284000,300000,315000
7,E07000204,St Edmundsbury,53500,53500,53625,54000,54995,55000,56625,57000,...,186000,190000,192500,196000,208000,215000,215000,225000,227748,231000


# Data preprocessing

Since there are only data from december 1995 we remove all data from 2015 and thery only have data from 1996-2015

In [51]:
df1.drop(columns="Q4-1995", inplace=True)
df2.drop(columns="Q4-1995", inplace=True)
df3.drop(columns="Q4-1995", inplace=True)
df4.drop(columns="Q4-1995", inplace=True)

According to wikipedia: "St Edmundsbury and Forest Heath were merged on 1 April 2019 to form the new West Suffolk district"
https://en.wikipedia.org/wiki/Borough_of_St_Edmundsbury \
Therfore we merge the data for these two areas in order to compare over the years. 
We add the amount of houses sold in data 1 and 3 and find the median price in data 2 and 4.

In [52]:
areas_to_merge = ['Forest Heath', 'St Edmundsbury']
new_area_name = 'West Suffolk'
new_ons_code = 'E07000245'

In [53]:
merge_df = df1[df1['Area'].isin(areas_to_merge)]
merged_row = merge_df.select_dtypes(include=[np.number]).sum()
merged_row_df = pd.DataFrame([merged_row], columns=merge_df.columns.drop(['ONS code', 'Area']))
merged_row_df['ONS code'] = new_ons_code
merged_row_df['Area'] = new_area_name

df1 = df1[~df1['Area'].isin(areas_to_merge)]
df1 = pd.concat([df1, merged_row_df], ignore_index=True)

df1

Unnamed: 0,ONS code,Area,Q1-1996,Q2-1996,Q3-1996,Q4-1996,Q1-1997,Q2-1997,Q3-1997,Q4-1997,...,Q3-2013,Q4-2013,Q1-2014,Q2-2014,Q3-2014,Q4-2014,Q1-2015,Q2-2015,Q3-2015,Q4-2015
0,E07000008,Cambridge,1438,1458,1546,1718,1829,1979,2095,2102,...,1293,1320,1383,1412,1366,1378,1352,1327,1345,1310
1,E07000009,East Cambridgeshire,940,964,1014,1139,1172,1268,1362,1396,...,1166,1264,1337,1423,1462,1448,1428,1405,1308,1312
2,E07000010,Fenland,1214,1262,1296,1389,1491,1610,1763,1875,...,1379,1439,1562,1649,1684,1708,1617,1563,1597,1660
3,E07000011,Huntingdonshire,2575,2713,2910,3157,3347,3443,3485,3364,...,2419,2699,2926,3076,3328,3296,3182,3118,2975,2938
4,E06000031,Peterborough,2212,2194,2356,2530,2628,2808,2921,3022,...,1927,2021,2162,2218,2407,2460,2428,2423,2453,2488
5,E07000012,South Cambridgeshire,1773,1789,1928,2172,2280,2442,2563,2519,...,1986,2054,2195,2311,2380,2416,2351,2231,2133,2151
6,E07000245,West Suffolk,2104,2204,2393,2638,2751,2972,3133,3182,...,2431,2579,2731,2890,3022,3114,2982,2881,2842,2895


In [54]:
merge_df = df3[df3['Area'].isin(areas_to_merge)]
merged_row = merge_df.select_dtypes(include=[np.number]).sum()
merged_row_df = pd.DataFrame([merged_row], columns=merge_df.columns.drop(['ONS code', 'Area']))
merged_row_df['ONS code'] = new_ons_code
merged_row_df['Area'] = new_area_name

df3 = df3[~df3['Area'].isin(areas_to_merge)]
df3 = pd.concat([df3, merged_row_df], ignore_index=True)
df3

Unnamed: 0,ONS code,Area,Q1-1996,Q2-1996,Q3-1996,Q4-1996,Q1-1997,Q2-1997,Q3-1997,Q4-1997,...,Q3-2013,Q4-2013,Q1-2014,Q2-2014,Q3-2014,Q4-2014,Q1-2015,Q2-2015,Q3-2015,Q4-2015
0,E07000008,Cambridge,169,140,169,198,240,239,230,176,...,595,669,724,703,606,580,530,457,363,255
1,E07000009,East Cambridgeshire,324,389,445,483,488,486,468,453,...,151,136,143,125,122,124,109,104,91,80
2,E07000010,Fenland,252,215,209,199,211,220,254,262,...,114,116,121,126,120,126,117,92,86,78
3,E07000011,Huntingdonshire,589,578,591,616,643,668,673,639,...,345,411,428,457,444,388,385,353,319,281
4,E06000031,Peterborough,266,278,328,365,370,413,411,428,...,485,561,626,651,669,644,614,562,572,501
5,E07000012,South Cambridgeshire,339,354,397,396,430,485,550,558,...,372,388,417,445,451,453,420,403,372,312
6,E07000245,West Suffolk,477,492,527,545,568,599,577,541,...,254,239,283,291,339,379,360,387,360,330


In [55]:
#merge 'Forest Heath' and 'St Edmundsbury' in data frames with prices - df2 and df4

def process_df(df, areas_to_merge, new_area_name, new_ons_code):
    merge_df = df[df['Area'].isin(areas_to_merge)]
    
    merged_row = merge_df.select_dtypes(include=[np.number]).mean()
    merged_row_df = pd.DataFrame([merged_row], columns=merge_df.columns.drop(['ONS code', 'Area']))
    merged_row_df['ONS code'] = new_ons_code
    merged_row_df['Area'] = new_area_name
    df = df[~df['Area'].isin(areas_to_merge)]
    return pd.concat([df, merged_row_df], ignore_index=True)

df2 = process_df(df2, areas_to_merge, new_area_name, new_ons_code)
df4 = process_df(df4, areas_to_merge, new_area_name, new_ons_code)

Reshaping data:

In [56]:
df1_flip = df1.melt(id_vars=['ONS code', 'Area'], var_name='Quarter', value_name='Houses Sold')
df2_flip = df2.melt(id_vars=['ONS code', 'Area'], var_name='Quarter', value_name='Price')
df3_flip = df3.melt(id_vars=['ONS code', 'Area'], var_name='Quarter', value_name='Houses Sold')
df4_flip = df4.melt(id_vars=['ONS code', 'Area'], var_name='Quarter', value_name='Price')

Extracting features:

In [57]:
# Extract year and quarter
df1_flip[['Quarter', 'Year']] = df1_flip['Quarter'].str.split('-', expand=True)
df2_flip[['Quarter', 'Year']] = df2_flip['Quarter'].str.split('-', expand=True)
df3_flip[['Quarter', 'Year']] = df3_flip['Quarter'].str.split('-', expand=True)
df4_flip[['Quarter', 'Year']] = df4_flip['Quarter'].str.split('-', expand=True)

# Convert Year to integer
df1_flip['Year'] = df1_flip['Year'].astype(int)
df2_flip['Year'] = df2_flip['Year'].astype(int)
df3_flip['Year'] = df3_flip['Year'].astype(int)
df4_flip['Year'] = df4_flip['Year'].astype(int)

In [58]:
# Combine data frames df1 and df2 for resale market
df_resale_market = pd.merge(df1_flip, df2_flip[['ONS code', 'Area', 'Quarter', 'Year', 'Price']],
                          on=['ONS code', 'Area', 'Quarter', 'Year'], how='left')
df_resale_market

Unnamed: 0,ONS code,Area,Quarter,Houses Sold,Year,Price
0,E07000008,Cambridge,Q1,1438,1996,69950.0
1,E07000009,East Cambridgeshire,Q1,940,1996,56500.0
2,E07000010,Fenland,Q1,1214,1996,40000.0
3,E07000011,Huntingdonshire,Q1,2575,1996,54000.0
4,E06000031,Peterborough,Q1,2212,1996,40000.0
...,...,...,...,...,...,...
555,E07000010,Fenland,Q4,1660,2015,150000.0
556,E07000011,Huntingdonshire,Q4,2938,2015,215000.0
557,E06000031,Peterborough,Q4,2488,2015,148000.0
558,E07000012,South Cambridgeshire,Q4,2151,2015,315000.0


In [59]:
# Combine data frames df3 and df4 for primary market
df_primary_market = pd.merge(df3_flip, df4_flip[['ONS code', 'Area', 'Quarter', 'Year', 'Price']],
                          on=['ONS code', 'Area', 'Quarter', 'Year'], how='left')
df_primary_market

Unnamed: 0,ONS code,Area,Quarter,Houses Sold,Year,Price
0,E07000008,Cambridge,Q1,169,1996,90000.0
1,E07000009,East Cambridgeshire,Q1,324,1996,69950.0
2,E07000010,Fenland,Q1,252,1996,47950.0
3,E07000011,Huntingdonshire,Q1,589,1996,83000.0
4,E06000031,Peterborough,Q1,266,1996,50000.0
...,...,...,...,...,...,...
555,E07000010,Fenland,Q4,78,2015,152500.0
556,E07000011,Huntingdonshire,Q4,281,2015,197000.0
557,E06000031,Peterborough,Q4,501,2015,179995.0
558,E07000012,South Cambridgeshire,Q4,312,2015,345000.0


# Price trends for all districts over the years

Below is an analysis of apartment prices on the primary and secondary markets for subsequent districts. In order to best compare prices and their changes between districts, charts were created for each district and then superimposed on one chart. Additionally, we used linear regression to check the average annual change for each district, which allowed us to predict the dynamics of changes in subsequent years.

### PRIMARY MARKET

### - for each quarter

In [60]:
df_primary_market['Year_Quarter'] = df_primary_market['Year'].astype(str) + ' ' + df_primary_market['Quarter']
df_primary_market.sort_values(by='Year_Quarter', inplace=True)

areas = df_primary_market['Area'].unique()


for area in areas:
    area_data = df_primary_market[df_primary_market['Area'] == area]
    
    # Extract only the year part from 'Year_Quarter' for ticks
    labels = area_data['Year_Quarter'].tolist()
    year_ticks = [label for index, label in enumerate(labels) if index % 4 == 0] 
    fig = px.line(area_data, x='Year_Quarter', y='Price', 
                  title=f'Price Trend for {area}', 
                  labels={'Year_Quarter': 'Year and Quarter', 'Price': 'Price'},
                  markers=True,
                  color_discrete_sequence=['green']) 

    fig.update_xaxes(tickvals=[labels.index(tick) for tick in year_ticks], ticktext=[tick.split()[0] for tick in year_ticks])
    
    fig.update_layout(
        xaxis_title='Year and Quarter',
        yaxis_title='Price',
        xaxis=dict(tickangle=45),
        template='plotly_white',
        autosize=True
    )
    
    fig.show()

In [61]:
# For each area, fit a linear regression model and forecast prices
for area in areas:
    area_data = df_primary_market[df_primary_market['Area'] == area]

    # Prepare data for regression
    area_data['Year_Index'] = area_data['Year'].astype(int) - area_data['Year'].astype(int).min()
    X = add_constant(area_data['Year_Index'])  # Adds a constant term to the predictor
    y = area_data['Price']

    # Fit the regression model
    model = OLS(y, X).fit()
    
    # Print summary of regression
    print(f"Regression summary for {area}:\n", model.summary())

    # forecast for the next 5 years
    forecast_years = pd.DataFrame({'Year_Index': range(area_data['Year_Index'].max() + 1, area_data['Year_Index'].max() + 6)})
    forecast_years = add_constant(forecast_years)  # Add the constant term
    predictions = model.predict(forecast_years)

    # create a DataFrame for plotting forecasted values
    forecasted_data = pd.DataFrame({
        'Year_Quarter': [str(year) + ' Q1' for year in range(area_data['Year'].max() + 1, area_data['Year'].max() + 6)],
        'Price': predictions
    })

    # combine historical and forecasted data
    combined_data = pd.concat([area_data, forecasted_data])

    # Plotting with Plotly
    fig = px.line(combined_data, x='Year_Quarter', y='Price', 
                  title=f'Price Trend and Forecast for {area}', 
                  labels={'Year_Quarter': 'Year and Quarter', 'Price': 'Price'},
                  markers=True,
                  color_discrete_sequence=['green'])

    fig.update_layout(
        xaxis_title='Year and Quarter',
        yaxis_title='Price',
        xaxis=dict(tickangle=45),
        template='plotly_white',
        autosize=True
    )

    fig.show()


Regression summary for Cambridge:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.772
Model:                            OLS   Adj. R-squared:                  0.769
Method:                 Least Squares   F-statistic:                     263.7
Date:                Tue, 07 May 2024   Prob (F-statistic):           9.77e-27
Time:                        01:18:39   Log-Likelihood:                -969.51
No. Observations:                  80   AIC:                             1943.
Df Residuals:                      78   BIC:                             1948.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       9.313



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for East Cambridgeshire:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.787
Model:                            OLS   Adj. R-squared:                  0.784
Method:                 Least Squares   F-statistic:                     288.3
Date:                Tue, 07 May 2024   Prob (F-statistic):           6.42e-28
Time:                        01:18:39   Log-Likelihood:                -915.26
No. Observations:                  80   AIC:                             1835.
Df Residuals:                      78   BIC:                             1839.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const  



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for Fenland:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.779
Model:                            OLS   Adj. R-squared:                  0.777
Method:                 Least Squares   F-statistic:                     275.6
Date:                Tue, 07 May 2024   Prob (F-statistic):           2.54e-27
Time:                        01:18:39   Log-Likelihood:                -894.68
No. Observations:                  80   AIC:                             1793.
Df Residuals:                      78   BIC:                             1798.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        5.94e+



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for Huntingdonshire:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.787
Model:                            OLS   Adj. R-squared:                  0.785
Method:                 Least Squares   F-statistic:                     288.8
Date:                Tue, 07 May 2024   Prob (F-statistic):           6.05e-28
Time:                        01:18:39   Log-Likelihood:                -904.65
No. Observations:                  80   AIC:                             1813.
Df Residuals:                      78   BIC:                             1818.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for Peterborough:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.772
Model:                            OLS   Adj. R-squared:                  0.769
Method:                 Least Squares   F-statistic:                     263.8
Date:                Tue, 07 May 2024   Prob (F-statistic):           9.63e-27
Time:                        01:18:39   Log-Likelihood:                -907.10
No. Observations:                  80   AIC:                             1818.
Df Residuals:                      78   BIC:                             1823.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       8.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for South Cambridgeshire:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.844
Model:                            OLS   Adj. R-squared:                  0.842
Method:                 Least Squares   F-statistic:                     421.8
Date:                Tue, 07 May 2024   Prob (F-statistic):           3.38e-33
Time:                        01:18:39   Log-Likelihood:                -912.75
No. Observations:                  80   AIC:                             1830.
Df Residuals:                      78   BIC:                             1834.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for West Suffolk:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.810
Model:                            OLS   Adj. R-squared:                  0.808
Method:                 Least Squares   F-statistic:                     332.5
Date:                Tue, 07 May 2024   Prob (F-statistic):           7.42e-30
Time:                        01:18:39   Log-Likelihood:                -905.86
No. Observations:                  80   AIC:                             1816.
Df Residuals:                      78   BIC:                             1820.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       8.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### - average for each year

In [62]:
df_primary_market['Year'] = df_primary_market['Year_Quarter'].str.split().str[0]

# Group by 'Year' and 'Area' to calculate average price per year per area
yearly_avg = df_primary_market.groupby(['Year', 'Area']).agg({'Price': 'mean'}).reset_index()
areas = yearly_avg['Area'].unique()

n_rows = 2
n_cols = 4
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=areas)


plot_row, plot_col = 1, 1

# uniform Y-axis range based on max and min of average prices

max_price = yearly_avg['Price'].max()
min_price = yearly_avg['Price'].min()

for i, area in enumerate(areas, start=1):
    area_data = yearly_avg[yearly_avg['Area'] == area]
    trace = go.Scatter(x=area_data['Year'], y=area_data['Price'], 
                       mode='lines+markers', name=area, 
                       line=dict(color='green'))
    
    fig.add_trace(trace, row=plot_row, col=plot_col)
    
    plot_col += 1
    if plot_col > n_cols: 
        plot_row += 1
        plot_col = 1
    if plot_row > n_rows and plot_col > n_cols:
        break

fig.update_yaxes(range=[min_price, max_price], nticks=10)

fig.update_layout(height=800, width=1200, title_text="Yearly Average Price Trends by District",
                  showlegend=False, template='plotly_white', title_x=0.5)

fig.show()

#Saving to HTML
fig.write_html('full_figure.html')


To have a general picture of prices and their changes over the years, price trends for all districts are presented. In the case of the 'Cambridge' district, price changes are the most rapid, and the smallest differences can be observed in the case of the 'Fenland' district.

In [63]:
df_primary_market['Year'] = df_primary_market['Year_Quarter'].str.split().str[0]
yearly_avg = df_primary_market.groupby(['Year', 'Area']).agg({'Price': 'mean'}).reset_index()
areas = yearly_avg['Area'].unique()

output_notebook() 
output_file("district_price_trends_primary_market.html") 

p = figure(title="Yearly Average Price Trends by District on the Primary Market", x_axis_label='Year', y_axis_label='Price',
           tools="pan,wheel_zoom,box_zoom,reset,save", width=1000, height=400)

p.title.align = 'center' 
colors = Category10[10] 

legend_items = []

for i, area in enumerate(areas):
    color = colors[i % len(colors)] 
    area_data = yearly_avg[yearly_avg['Area'] == area]
    source = ColumnDataSource(area_data)
    line = p.line(x='Year', y='Price', line_width=2, color=color, source=source)
    
    circle = p.circle(x='Year', y='Price', fill_color=color, line_color=color, size=8, source=source)

    legend_items.append(LegendItem(label=area, renderers=[line, circle]))

p.yaxis.formatter = NumeralTickFormatter(format="0a") 

custom_legend = Legend(items=legend_items, location="top_left")
p.add_layout(custom_legend, 'right')
p.legend.click_policy="hide" 

show(p) 


### RESALE MARKET 

In [64]:
df_resale_market['Year_Quarter'] = df_resale_market['Year'].astype(str) + ' ' + df_resale_market['Quarter']
df_resale_market.sort_values(by='Year_Quarter', inplace=True)

areas = df_resale_market['Area'].unique()


for area in areas:
    area_data = df_primary_market[df_primary_market['Area'] == area]
    
    # Extract only the year part from 'Year_Quarter' for ticks
    labels = area_data['Year_Quarter'].tolist()
    year_ticks = [label for index, label in enumerate(labels) if index % 4 == 0] 
    fig = px.line(area_data, x='Year_Quarter', y='Price', 
                  title=f'Price Trend for {area}', 
                  labels={'Year_Quarter': 'Year and Quarter', 'Price': 'Price'},
                  markers=True,
                  color_discrete_sequence=['green']) 

    fig.update_xaxes(tickvals=[labels.index(tick) for tick in year_ticks], ticktext=[tick.split()[0] for tick in year_ticks])
    
    fig.update_layout(
        xaxis_title='Year and Quarter',
        yaxis_title='Price',
        xaxis=dict(tickangle=45),
        template='plotly_white',
        autosize=True
    )
    
    fig.show()

In [65]:
# For each area, fit a linear regression model and forecast prices
for area in areas:
    area_data = df_resale_market[df_resale_market['Area'] == area]

    # prepare data for regression
    area_data['Year_Index'] = area_data['Year'].astype(int) - area_data['Year'].astype(int).min()
    X = add_constant(area_data['Year_Index']) 
    y = area_data['Price']

    # fit the regression model
    model = OLS(y, X).fit()
    print(f"Regression summary for {area}:\n", model.summary())

    # forecast for the next 5 years
    forecast_years = pd.DataFrame({'Year_Index': range(area_data['Year_Index'].max() + 1, area_data['Year_Index'].max() + 6)})
    forecast_years = add_constant(forecast_years) 
    predictions = model.predict(forecast_years)

    # create a DataFrame for plotting forecasted values
    forecasted_data = pd.DataFrame({
        'Year_Quarter': [str(year) + ' Q1' for year in range(area_data['Year'].max() + 1, area_data['Year'].max() + 6)],
        'Price': predictions
    })

    # combine historical and forecasted data
    combined_data = pd.concat([area_data, forecasted_data])

    # Plotting with Plotly
    fig = px.line(combined_data, x='Year_Quarter', y='Price', 
                  title=f'Price Trend and Forecast for {area}', 
                  labels={'Year_Quarter': 'Year and Quarter', 'Price': 'Price'},
                  markers=True,
                  color_discrete_sequence=['green'])

    fig.update_layout(
        xaxis_title='Year and Quarter',
        yaxis_title='Price',
        xaxis=dict(tickangle=45),
        template='plotly_white',
        autosize=True
    )

    fig.show()


Regression summary for Cambridge:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.964
Model:                            OLS   Adj. R-squared:                  0.963
Method:                 Least Squares   F-statistic:                     2083.
Date:                Tue, 07 May 2024   Prob (F-statistic):           5.00e-58
Time:                        01:18:42   Log-Likelihood:                -884.89
No. Observations:                  80   AIC:                             1774.
Df Residuals:                      78   BIC:                             1779.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        6.87



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for East Cambridgeshire:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.913
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     813.6
Date:                Tue, 07 May 2024   Prob (F-statistic):           5.12e-43
Time:                        01:18:42   Log-Likelihood:                -886.78
No. Observations:                  80   AIC:                             1778.
Df Residuals:                      78   BIC:                             1782.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const  



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for Fenland:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.832
Model:                            OLS   Adj. R-squared:                  0.830
Method:                 Least Squares   F-statistic:                     387.2
Date:                Tue, 07 May 2024   Prob (F-statistic):           5.56e-32
Time:                        01:18:42   Log-Likelihood:                -886.39
No. Observations:                  80   AIC:                             1777.
Df Residuals:                      78   BIC:                             1782.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.374e+



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for Huntingdonshire:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.895
Model:                            OLS   Adj. R-squared:                  0.893
Method:                 Least Squares   F-statistic:                     662.4
Date:                Tue, 07 May 2024   Prob (F-statistic):           7.24e-40
Time:                        01:18:42   Log-Likelihood:                -889.85
No. Observations:                  80   AIC:                             1784.
Df Residuals:                      78   BIC:                             1788.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for Peterborough:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.843
Model:                            OLS   Adj. R-squared:                  0.841
Method:                 Least Squares   F-statistic:                     419.2
Date:                Tue, 07 May 2024   Prob (F-statistic):           4.16e-33
Time:                        01:18:42   Log-Likelihood:                -881.94
No. Observations:                  80   AIC:                             1768.
Df Residuals:                      78   BIC:                             1773.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for South Cambridgeshire:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.932
Method:                 Least Squares   F-statistic:                     1090.
Date:                Tue, 07 May 2024   Prob (F-statistic):           1.36e-47
Time:                        01:18:42   Log-Likelihood:                -894.20
No. Observations:                  80   AIC:                             1792.
Df Residuals:                      78   BIC:                             1797.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Regression summary for West Suffolk:
                             OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.886
Model:                            OLS   Adj. R-squared:                  0.885
Method:                 Least Squares   F-statistic:                     608.7
Date:                Tue, 07 May 2024   Prob (F-statistic):           1.38e-38
Time:                        01:18:42   Log-Likelihood:                -888.89
No. Observations:                  80   AIC:                             1782.
Df Residuals:                      78   BIC:                             1787.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [66]:
df_resale_market['Year'] = df_resale_market['Year_Quarter'].str.split().str[0]
yearly_avg = df_resale_market.groupby(['Year', 'Area']).agg({'Price': 'mean'}).reset_index()
areas = yearly_avg['Area'].unique()

output_notebook() 
output_file("district_price_trends_resale_market.html") 

p = figure(title="Yearly Average Price Trends by District on the Resale Market", x_axis_label='Year', y_axis_label='Price',
           tools="pan,wheel_zoom,box_zoom,reset,save", width=1000, height=400)

p.title.align = 'center' 
colors = Category10[10] 

legend_items = []

for i, area in enumerate(areas):
    color = colors[i % len(colors)] 
    area_data = yearly_avg[yearly_avg['Area'] == area]
    source = ColumnDataSource(area_data)
    line = p.line(x='Year', y='Price', line_width=2, color=color, source=source)
    
    circle = p.circle(x='Year', y='Price', fill_color=color, line_color=color, size=8, source=source)

    legend_items.append(LegendItem(label=area, renderers=[line, circle]))

p.yaxis.formatter = NumeralTickFormatter(format="0a") 

custom_legend = Legend(items=legend_items, location="top_left")
p.add_layout(custom_legend, 'right')
p.legend.click_policy="hide" 

show(p) 

The lines show a clear upward trend in prices over these two decades, indicating that houses in all districts have generally become more expensive over time.

### DEEP DIVE - FENLAND DISTRICT

Our special attention was drawn to the Fenland district, where the median price of apartments on the primary market was the lowest in all recorded years and the growth rate was the lowest. However, on the secondary market, the median apartment price is no longer the lowest, and the growth dynamics examined using linear regression turned out to be higher than in the case of the Peterborough district. Buying a flat in Fenland may be a good investment if your budget is not the highest and you want to be the first owner of the property.

In [28]:
# filter data for Fenland area
fenland_primary = df_primary_market[df_primary_market['Area'] == 'Fenland']
fenland_resale = df_resale_market[df_resale_market['Area'] == 'Fenland']

In [67]:
fenland_primary

Unnamed: 0,ONS code,Area,Quarter,Houses Sold,Year,Price,Year_Quarter
2,E07000010,Fenland,Q1,252,1996,47950.0,1996 Q1
9,E07000010,Fenland,Q2,215,1996,47950.0,1996 Q2
16,E07000010,Fenland,Q3,209,1996,47950.0,1996 Q3
23,E07000010,Fenland,Q4,199,1996,47950.0,1996 Q4
30,E07000010,Fenland,Q1,211,1997,47950.0,1997 Q1
...,...,...,...,...,...,...,...
527,E07000010,Fenland,Q4,126,2014,149995.0,2014 Q4
534,E07000010,Fenland,Q1,117,2015,149995.0,2015 Q1
541,E07000010,Fenland,Q2,92,2015,153998.0,2015 Q2
548,E07000010,Fenland,Q3,86,2015,157250.0,2015 Q3


In [68]:
fenland_resale

Unnamed: 0,ONS code,Area,Quarter,Houses Sold,Year,Price,Year_Quarter
2,E07000010,Fenland,Q1,1214,1996,40000.0,1996 Q1
9,E07000010,Fenland,Q2,1262,1996,40000.0,1996 Q2
16,E07000010,Fenland,Q3,1296,1996,40000.0,1996 Q3
23,E07000010,Fenland,Q4,1389,1996,40000.0,1996 Q4
30,E07000010,Fenland,Q1,1491,1997,40000.0,1997 Q1
...,...,...,...,...,...,...,...
527,E07000010,Fenland,Q4,1708,2014,140000.0,2014 Q4
534,E07000010,Fenland,Q1,1617,2015,141000.0,2015 Q1
541,E07000010,Fenland,Q2,1563,2015,145000.0,2015 Q2
548,E07000010,Fenland,Q3,1597,2015,147500.0,2015 Q3


In [94]:
output_notebook()
output_file("fenland.html")

def prepare_data(df):
    df['Year'] = df['Year_Quarter'].apply(lambda x: x.split()[0])
    df['Quarter'] = df['Year_Quarter'].apply(lambda x: x.split()[1])
    pivot = df.pivot_table(values='Price', index='Year', columns='Quarter', aggfunc='mean')
    
    # Skalowanie cen w ramach roku
    scaled_data = pivot.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=1).stack().reset_index()
    scaled_data.columns = ['Year', 'Quarter', 'Scaled_Price']
    return scaled_data

data_primary = prepare_data(fenland_primary)
data_resale = prepare_data(fenland_resale)

# Tworzenie źródła danych dla Bokeh
source_primary = ColumnDataSource(data_primary)
source_resale = ColumnDataSource(data_resale)

# Definicja mapy kolorów
def get_color_mapper():
    palette = Greens[256]  # Użycie pełnej palety odcieni zieleni
    mapper = LinearColorMapper(palette=palette[::-1], low=0, high=1)  # Odwrócenie palety
    return mapper, palette

color_mapper, palette = get_color_mapper()

def create_heatmap(source, title):
    unique_years = sorted(list(set(source.data['Year'])))
    p = figure(title=title, 
               x_range=unique_years, 
               y_range=list(reversed(['Q1', 'Q2', 'Q3', 'Q4'])),
               x_axis_label='Year', y_axis_label='Quarter', 
               tools="", toolbar_location=None,
               width=800, height=200)
    p.title.align = 'center' 
    p.rect(x='Year', y='Quarter', width=1, height=1, source=source,
           line_color=None, fill_color=transform('Scaled_Price', color_mapper))
    
    color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, location=(0,0),
                         title='Price Level', orientation='vertical',
                         ticker=BasicTicker(desired_num_ticks=2),
                         formatter=PrintfTickFormatter(format=''),
                         major_label_overrides={0: 'Lowest Price', 1: 'Highest Price'})
    p.add_layout(color_bar, 'right')

    p.yaxis.major_label_orientation = "vertical"
    p.grid.grid_line_color = None
    return p

heatmap_primary = create_heatmap(source_primary, "Primary Market Prices for Fenland - the most expensive/cheap quarter")
heatmap_resale = create_heatmap(source_resale, "Resale Market Prices for Fenland - the most expensive/cheap quarter")

show(column(heatmap_primary, heatmap_resale))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/