In [None]:
# importing neccessary libraries

In [None]:
import pandas as pd

In [None]:
# loading the dataset

In [None]:
df = pd.read_csv("UHI_d_NZFL.csv")

In [None]:
# understanding features

In [None]:
df.sample(5)

In [None]:
# removing the unnecessary columns

In [None]:
new_df = df[['GnPR',
    'treeDensity',
    'park_grass_ratio',
    'parcel_grass_ratio',
    'greenroof_ratio',
    'roadDensity',
    'bldDensity',
    'avg_BH',
    'avg_GPR','UHI_d']]

In [None]:
new_df

In [None]:
# identifying missing values

In [None]:
new_df.isnull().sum()

In [None]:
new_df.info()

In [None]:
# identifying and removing duplicate values

In [None]:
new_df.duplicated().sum()

In [None]:
print("Before:", new_df.shape)

new_df = new_df.drop_duplicates()

print("After:", new_df.shape)

In [None]:
new_df.duplicated().sum()

In [None]:
# identifying outliers using boxplot

In [None]:
import matplotlib.pyplot as plt

columns = [
    'GnPR',
    'treeDensity',
    'park_grass_ratio',
    'parcel_grass_ratio',
    'greenroof_ratio',
    'roadDensity',
    'bldDensity',
    'avg_BH',
    'avg_GPR'
]

# Create boxplots
for col in columns:
    plt.figure(figsize=(6,4))
    plt.boxplot(new_df[col])
    plt.title(f'Box Plot for {col}')
    plt.ylabel(col)
    plt.show()

In [None]:
# removing outliers in the independent features

In [None]:
# List of columns
columns = [
    'GnPR',
    'treeDensity',
    'park_grass_ratio',
    'parcel_grass_ratio',
    'greenroof_ratio',
    'roadDensity',
    'bldDensity',
    'avg_BH',
    'avg_GPR'
]

# Create copy to avoid modifying original dataset
df_clean = new_df.copy()

# Function to remove outliers using IQR
for col in columns:
    
    # Calculate Q1 and Q3
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    
    # Calculate IQR
    IQR = Q3 - Q1
    
    # Define lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Remove outliers
    df_clean = df_clean[
        (df_clean[col] >= lower_bound) & 
        (df_clean[col] <= upper_bound)
    ]

# Check shape before and after
print("Original shape:", new_df.shape)
print("Cleaned shape:", df_clean.shape)

In [None]:
# creating new preprocessed dataset

In [None]:
df_clean.to_csv("final_dataset.csv",index= False)

In [None]:
# Features (independent variables)
X = df_clean.drop("UHI_d", axis=1)

# Target (dependent variable)
y = df_clean["UHI_d"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      # 80% training, 20% testing
    random_state=42
)

In [32]:
# Import Linear Regression model from sklearn

In [None]:
from sklearn.linear_model import LinearRegression

In [33]:
# Create (initialize) the Linear Regression model object

In [None]:
lr_model = LinearRegression()

In [42]:
# Train the model using training data
# X_train → input features
# y_train → target values (temperature / UHI etc.)

In [38]:
lr_model.fit(X_train, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [43]:
# Print the intercept of the regression line
# Print slope (coefficient) for each feature

In [39]:
print("Intercept:", lr_model.intercept_)
print("Slopes (Coefficients):", lr_model.coef_)

Intercept: 0.336834936852713
Slopes (Coefficients): [-0.20760484  0.00030221 -0.27845217 -0.19758797 -0.00417347 -0.05925563
  0.00660518  0.00119564  0.04227562]


In [45]:
# Create a DataFrame to clearly show feature names and their slopes
# Extract the slope (coefficient) specifically for greenery feature

In [40]:
import pandas as pd

coeff_df = pd.DataFrame({
    "Feature": X_train.columns,
    "Slope": lr_model.coef_
})

print(coeff_df)

              Feature     Slope
0                GnPR -0.207605
1         treeDensity  0.000302
2    park_grass_ratio -0.278452
3  parcel_grass_ratio -0.197588
4     greenroof_ratio -0.004173
5         roadDensity -0.059256
6          bldDensity  0.006605
7              avg_BH  0.001196
8             avg_GPR  0.042276


In [46]:
# Calculate temperature change for 10% increase in greenery

In [41]:
print(f"For every 10% increase in greenery, the temperature changes by {abs(temp_change_10_percent):.4f} °C")

if temp_change_10_percent < 0:
    print(f"This means temperature DECREASES by {abs(temp_change_10_percent):.4f} °C for every 10% increase in greenery.")
else:
    print(f"This means temperature INCREASES by {temp_change_10_percent:.4f} °C for every 10% increase in greenery.")

For every 10% increase in greenery, the temperature changes by 0.4228 °C
This means temperature INCREASES by 0.4228 °C for every 10% increase in greenery.
