In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing

# 1. Data Loading and Cleaning
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['PRICE'] = housing.target

# Check for missing values
print("Missing values:")
print(df.isnull().sum())

# 2. String Manipulation
# Clean column names (convert to lowercase and remove spaces)
df.columns = df.columns.str.lower().str.replace(' ', '_')

# 3. Use NumPy
# Convert 'MedInc' (median income) column to NumPy array and calculate statistics
income_array = df['medinc'].values
print("\nMedian Income statistics:")
print(f"Mean: {np.mean(income_array):.2f}")
print(f"Median: {np.median(income_array):.2f}")

# 4. Data Splitting
X = df.drop('price', axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Build a Model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")

# Feature importance
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.coef_})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature importance:")
print(feature_importance)

Missing values:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
PRICE         0
dtype: int64

Median Income statistics:
Mean: 3.87
Median: 3.53

Model Evaluation:
Mean Squared Error: 0.56
R-squared Score: 0.58

Feature importance:
      feature  importance
3   avebedrms    0.783145
0      medinc    0.448675
1    houseage    0.009724
4  population   -0.000002
5    aveoccup   -0.003526
2    averooms   -0.123323
6    latitude   -0.419792
7   longitude   -0.433708
