In [25]:
import math
import numpy as np
import pandas as pd

Problem 4

(a)

In [17]:
# Build language matrix A and compute A A^T and A^T A
languages = ["English", "French", "German", "Italian", "Spanish"]
people = ["Anton", "Geraldine", "James", "Lauren"]

# Rows: people, Cols: languages
# Anton: French, German
# Geraldine: English, French, Italian
# James: English, Italian, Spanish
# Lauren: all others except French => English, German, Italian, Spanish
A = np.array([
  [0, 1, 1, 0, 0],  # Anton
  [1, 1, 0, 1, 0],  # Geraldine
  [1, 0, 0, 1, 1],  # James
  [1, 0, 1, 1, 1],  # Lauren
], dtype=int)

AAT = A @ A.T
ATA = A.T @ A

A, AAT, ATA


(array([[0, 1, 1, 0, 0],
        [1, 1, 0, 1, 0],
        [1, 0, 0, 1, 1],
        [1, 0, 1, 1, 1]]),
 array([[2, 1, 0, 1],
        [1, 3, 2, 2],
        [0, 2, 3, 3],
        [1, 2, 3, 4]]),
 array([[3, 1, 1, 3, 2],
        [1, 2, 1, 1, 0],
        [1, 1, 2, 1, 1],
        [3, 1, 1, 3, 2],
        [2, 0, 1, 2, 2]]))

(c)

In [22]:
# Generate random 3x3 integer matrices, invert if possible, and check A @ inv(A)
def random_int_matrix(rng, n=3, low=-10, high=10):
  return rng.integers(low, high + 1, size=(n, n))

rng = np.random.default_rng(0)

matrices = [random_int_matrix(rng) for _ in range(3)]

for i, mat in enumerate(matrices, start=1):
  print(f"Matrix {i}:\n{mat}\n")
  det = np.linalg.det(mat)

  if abs(det) < 1e-10:
    print("Inverse does not exist (determinant is ~0).\n")
    print("-" * 50)
    continue

  inv_mat = np.linalg.inv(mat)
  check_I = mat @ inv_mat

  print(f"Inverse {i}:\n{inv_mat}\n")
  print(f"Matrix {i} * Inverse {i}:\n{check_I}\n")
  print("-" * 50)


Matrix 1:
[[  7   3   0]
 [ -5  -4 -10]
 [ -9 -10  -7]]

Inverse 1:
[[ 0.21238938 -0.0619469   0.08849558]
 [-0.16224189  0.14454277 -0.20648968]
 [-0.04129794 -0.12684366  0.03834808]]

Matrix 1 * Inverse 1:
[[ 1.00000000e+00 -1.11022302e-16  1.11022302e-16]
 [ 1.52655666e-16  1.00000000e+00 -1.38777878e-17]
 [-1.59594560e-16  0.00000000e+00  1.00000000e+00]]

--------------------------------------------------
Matrix 2:
[[ 7  3  9]
 [ 0  2 10]
 [ 5  3  1]]

Inverse 2:
[[ 0.20588235 -0.17647059 -0.08823529]
 [-0.36764706  0.27941176  0.51470588]
 [ 0.07352941  0.04411765 -0.10294118]]

Matrix 2 * Inverse 2:
[[ 1.00000000e+00 -5.55111512e-17 -2.77555756e-17]
 [ 5.55111512e-17  1.00000000e+00 -5.55111512e-17]
 [ 2.49800181e-16 -1.66533454e-16  1.00000000e+00]]

--------------------------------------------------
Matrix 3:
[[  1   9  -5]
 [  7   4 -10]
 [ -2   8   1]]

Inverse 3:
[[-0.70588235  0.41176471  0.58823529]
 [-0.1092437   0.07563025  0.21008403]
 [-0.53781513  0.21848739  0.4957

Problem 5

In [26]:
# Load dataset and drop ignored columns
df = pd.read_csv("kc_house_data.csv")

drop_cols = ["id", "date", "zipcode"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

df.head()


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503


(a)

In [27]:
# Compute mean/min/max/variance for each feature (excluding response)
y = df["price"]
X = df.drop(columns=["price"])

summary = pd.DataFrame({
  "mean": X.mean(numeric_only=True),
  "min": X.min(numeric_only=True),
  "max": X.max(numeric_only=True),
  "variance": X.var(numeric_only=True, ddof=1)
})

summary


Unnamed: 0,mean,min,max,variance
bedrooms,3.370842,0.0,33.0,0.865015
bathrooms,2.114757,0.0,8.0,0.5931513
sqft_living,2079.899736,290.0,13540.0,843533.7
sqft_lot,15106.967566,520.0,1651359.0,1715659000.0
floors,1.494309,1.0,3.5,0.291588
waterfront,0.007542,0.0,1.0,0.007485226
view,0.234303,0.0,4.0,0.5872426
condition,3.40943,1.0,5.0,0.4234665
grade,7.656873,1.0,13.0,1.381703
sqft_above,1788.390691,290.0,9410.0,685734.7


In [28]:
# Find features with lowest/highest mean and variance
lowest_mean_feature = summary["mean"].idxmin()
highest_mean_feature = summary["mean"].idxmax()
lowest_var_feature = summary["variance"].idxmin()
highest_var_feature = summary["variance"].idxmax()

lowest_mean_feature, summary.loc[lowest_mean_feature, "mean"], \
highest_mean_feature, summary.loc[highest_mean_feature, "mean"], \
lowest_var_feature, summary.loc[lowest_var_feature, "variance"], \
highest_var_feature, summary.loc[highest_var_feature, "variance"]


('long',
 -122.21389640494147,
 'sqft_lot',
 15106.967565816869,
 'waterfront',
 0.007485225502689098,
 'sqft_lot',
 1715658774.1754544)

(b)

In [29]:
# Compute correlation of each feature with the response (price)
corr = X.corrwith(y)
corr_table = corr.to_frame(name="corr_with_price").sort_values("corr_with_price", ascending=False)

corr_table


Unnamed: 0,corr_with_price
sqft_living,0.702035
grade,0.667434
sqft_above,0.605567
sqft_living15,0.585379
bathrooms,0.525138
view,0.397293
sqft_basement,0.323816
bedrooms,0.30835
lat,0.307003
waterfront,0.266369


In [30]:
# List positively correlated features and find the highest positive correlation
positive_corr = corr_table[corr_table["corr_with_price"] > 0]

positive_feature_names = list(positive_corr.index)
top_positive_feature = positive_corr.index[0]
top_positive_value = float(positive_corr.iloc[0, 0])

positive_feature_names[:10], top_positive_feature, top_positive_value


(['sqft_living',
  'grade',
  'sqft_above',
  'sqft_living15',
  'bathrooms',
  'view',
  'sqft_basement',
  'bedrooms',
  'lat',
  'waterfront'],
 'sqft_living',
 0.7020350546118004)

(c)

In [31]:
# Find negatively correlated features (if any)
negative_corr = corr_table[corr_table["corr_with_price"] < 0]
negative_feature_names = list(negative_corr.index)

negative_corr, negative_feature_names


(Empty DataFrame
 Columns: [corr_with_price]
 Index: [],
 [])