In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Configure plot styling.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# on older releases only the legacy 'seaborn' name exists and asking for the
# new one raises OSError, so pick whichever name this installation provides.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette("husl")

print("üè† –ê–Ω–∞–ª—ñ–∑ —Ü—ñ–Ω –Ω–∞ –Ω–µ—Ä—É—Ö–æ–º—ñ—Å—Ç—å –∑–∞ –¥–æ–ø–æ–º–æ–≥–æ—é Lasso Regression")
print("=" * 60)

# Fetch the California housing dataset and pull out the feature matrix,
# the target vector and the feature names.
housing = fetch_california_housing()
X = housing.data
y = housing.target
feature_names = housing.feature_names

# Tabular view of the same data: one column per feature, with the target
# appended as the last column under 'Price'.
df = pd.DataFrame(X, columns=feature_names).assign(Price=y)

print(f"üìä –†–æ–∑–º—ñ—Ä –¥–∞—Ç–∞—Å–µ—Ç—É: {df.shape}")
print(f"üìã –û–∑–Ω–∞–∫–∏: {list(feature_names)}")
print(f"üìà –°–µ—Ä–µ–¥–Ω—è —Ü—ñ–Ω–∞: ${np.mean(y):.2f} (–≤ —Å–æ—Ç–Ω—è—Ö —Ç–∏—Å—è—á)")

# Enlarge the feature matrix so there is something for feature selection to
# prune: pairwise products, squared terms and pure-noise columns.
np.random.seed(42)
n_samples, n_features = X.shape

# All index pairs (a, b) with a < b, outer index varying slowest.
_pairs = [(a, b) for a in range(n_features) for b in range(a + 1, n_features)]

# Pairwise products first, then the squares; every entry is an
# (n_samples, 1) column so the pieces can be stacked horizontally.
interactions = [(X[:, a] * X[:, b]).reshape(-1, 1) for a, b in _pairs]
interactions += [(X[:, k] ** 2).reshape(-1, 1) for k in range(n_features)]

# Names are generated in exactly the same order as the columns above.
interaction_names = [f"{feature_names[a]} √ó {feature_names[b]}" for a, b in _pairs]
interaction_names += [f"{feature_names[k]}¬≤" for k in range(n_features)]

# Ten standard-normal columns unrelated to the target. They are drawn one
# column at a time so the seeded random stream is consumed in that order.
noise_features = [np.random.normal(0, 1, (n_samples, 1)) for _ in range(10)]
noise_names = [f"Noise_{k}" for k in range(1, 11)]

# Final column order: original features, interaction/square terms, noise.
X_extended = np.hstack([X, *interactions, *noise_features])
all_feature_names = [*feature_names, *interaction_names, *noise_names]

print(f"üîß –†–æ–∑—à–∏—Ä–µ–Ω–∏–π –Ω–∞–±—ñ—Ä –æ–∑–Ω–∞–∫: {X_extended.shape[1]} –æ–∑–Ω–∞–∫")

# Hold out 20% of the rows as a test set (fixed seed).
_split = train_test_split(X_extended, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split

# Standardize: the scaler is fitted on the training rows only, and the same
# fitted transform is then applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. Cross-validation to select the optimal alpha
print("\nüîç –ü–æ—à—É–∫ –æ–ø—Ç–∏–º–∞–ª—å–Ω–æ–≥–æ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞ —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü—ñ—ó...")
# Candidate regularization strengths: 50 log-spaced values from 1e-4 to 1e1.
alphas = np.logspace(-4, 1, 50)

# 5-fold cross-validated Lasso over the candidate grid, fitted on the
# standardized training data (fit() returns the estimator itself).
lasso_cv = LassoCV(
    alphas=alphas,
    cv=5,
    random_state=42,
    max_iter=2000,
).fit(X_train_scaled, y_train)

# Regularization strength chosen by the cross-validation.
optimal_alpha = lasso_cv.alpha_
print(f"üéØ –û–ø—Ç–∏–º–∞–ª—å–Ω–∏–π alpha: {optimal_alpha:.6f}")

# 2. Model training
# Lasso at the cross-validated alpha, plus an ordinary least-squares model
# fitted on the same standardized data for comparison.
lasso = Lasso(alpha=optimal_alpha, max_iter=2000, random_state=42).fit(X_train_scaled, y_train)
linear_reg = LinearRegression().fit(X_train_scaled, y_train)

# 3. Test-set predictions and scores, grouped per model.
y_pred_lasso = lasso.predict(X_test_scaled)
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_r2 = r2_score(y_test, y_pred_lasso)

y_pred_linear = linear_reg.predict(X_test_scaled)
linear_mse = mean_squared_error(y_test, y_pred_linear)
linear_r2 = r2_score(y_test, y_pred_linear)

print(f"\nüìà –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –º–æ–¥–µ–ª–µ–π:")
print(f"Lasso Regression - MSE: {lasso_mse:.4f}, R¬≤: {lasso_r2:.4f}")
print(f"Linear Regression - MSE: {linear_mse:.4f}, R¬≤: {linear_r2:.4f}")

# 4. Coefficient analysis and feature selection
# One row per column of the extended feature matrix, with the coefficients
# of both fitted models side by side.
coefficients = pd.DataFrame(dict(
    Feature=all_feature_names,
    Lasso_Coef=lasso.coef_,
    Linear_Coef=linear_reg.coef_,
))

# Keep only the rows whose Lasso coefficient is non-zero and order them by
# absolute coefficient, largest first.
_kept = coefficients['Lasso_Coef'].ne(0)
non_zero_features = (
    coefficients.loc[_kept]
    .assign(Abs_Lasso_Coef=lambda frame: frame['Lasso_Coef'].abs())
    .sort_values('Abs_Lasso_Coef', ascending=False)
)

print(f"\nüéØ Lasso –≤–∏–±—Ä–∞–≤ {len(non_zero_features)} –∑ {len(all_feature_names)} –æ–∑–Ω–∞–∫:")
print(non_zero_features[['Feature', 'Lasso_Coef']].head(10))

# Visualization: a single figure holding a 2x3 grid of diagnostic panels.
fig = plt.figure(figsize=(20, 15))

# Panel 1: cross-validation error versus alpha. The solid line is the mean of
# mse_path_ along axis 1 (the per-fold axis according to scikit-learn's
# LassoCV documentation); the dashed lines sit one standard deviation
# above and below it.
plt.subplot(2, 3, 1)
cv_mse_mean = lasso_cv.mse_path_.mean(axis=1)
cv_mse_std = lasso_cv.mse_path_.std(axis=1)
for curve, line_format, opacity in (
    (cv_mse_mean, 'b-', 0.6),
    (cv_mse_mean + cv_mse_std, 'b--', 0.3),
    (cv_mse_mean - cv_mse_std, 'b--', 0.3),
):
    plt.semilogx(lasso_cv.alphas_, curve, line_format, alpha=opacity)
# Vertical marker at the alpha selected by cross-validation.
plt.axvline(optimal_alpha, color='red', linestyle='--', label=f'Optimal Œ± = {optimal_alpha:.6f}')
plt.xlabel('Alpha (Regularization Parameter)')
plt.ylabel('Mean Squared Error')
plt.title('Cross-Validation –¥–ª—è –≤–∏–±–æ—Ä—É Alpha')
plt.legend()
plt.grid(True, alpha=0.3)

# Panel 2: predicted versus true target for both models on the test set.
plt.subplot(2, 3, 2)
for predictions, legend_text in (
    (y_pred_lasso, f'Lasso (R¬≤ = {lasso_r2:.3f})'),
    (y_pred_linear, f'Linear (R¬≤ = {linear_r2:.3f})'),
):
    plt.scatter(y_test, predictions, alpha=0.6, label=legend_text, s=30)
# Dashed diagonal y = x spanning the observed target range (perfect prediction).
target_span = [y_test.min(), y_test.max()]
plt.plot(target_span, target_span, 'k--', lw=2)
plt.xlabel('–°–ø—Ä–∞–≤–∂–Ω—ñ —Ü—ñ–Ω–∏')
plt.ylabel('–ü—Ä–æ–≥–Ω–æ–∑–æ–≤–∞–Ω—ñ —Ü—ñ–Ω–∏')
plt.title('–ü–æ—Ä—ñ–≤–Ω—è–Ω–Ω—è –ø—Ä–æ–≥–Ω–æ–∑—ñ–≤ –º–æ–¥–µ–ª–µ–π')
plt.legend()
plt.grid(True, alpha=0.3)

# Panel 3: horizontal bars for the (up to) 15 largest-magnitude Lasso
# coefficients; green for positive values, red otherwise.
plt.subplot(2, 3, 3)
top_features = non_zero_features.iloc[:15]
colors = top_features['Lasso_Coef'].gt(0).map({True: 'green', False: 'red'}).tolist()
bar_positions = range(len(top_features))
bars = plt.barh(bar_positions, top_features['Lasso_Coef'], color=colors, alpha=0.7)
plt.yticks(bar_positions, top_features['Feature'])
plt.xlabel('–ö–æ–µ—Ñ—ñ—Ü—ñ—î–Ω—Ç Lasso')
plt.title('–¢–æ–ø-15 –Ω–∞–π–≤–∞–∂–ª–∏–≤—ñ—à–∏—Ö –æ–∑–Ω–∞–∫')
plt.grid(True, alpha=0.3)
# Flip the y axis so the first (largest) entry is drawn at the top.
plt.gca().invert_yaxis()

# Panel 4: histogram of all Lasso coefficients (zeros included), with a
# dashed reference line at zero.
ax_hist = plt.subplot(2, 3, 4)
ax_hist.hist(coefficients['Lasso_Coef'], bins=50, alpha=0.7, label='Lasso', edgecolor='black')
ax_hist.axvline(0, color='red', linestyle='--', alpha=0.8)
ax_hist.set_xlabel('–ó–Ω–∞—á–µ–Ω–Ω—è –∫–æ–µ—Ñ—ñ—Ü—ñ—î–Ω—Ç—ñ–≤')
ax_hist.set_ylabel('–ß–∞—Å—Ç–æ—Ç–∞')
ax_hist.set_title('–†–æ–∑–ø–æ–¥—ñ–ª –∫–æ–µ—Ñ—ñ—Ü—ñ—î–Ω—Ç—ñ–≤ Lasso')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# Panel 5: number of non-zero Lasso coefficients as a function of alpha.
plt.subplot(2, 3, 5)


def _count_selected(penalty):
    """Fit a Lasso at `penalty` on the training data; return its non-zero coefficient count."""
    model = Lasso(alpha=penalty, max_iter=2000)
    model.fit(X_train_scaled, y_train)
    return np.sum(model.coef_ != 0)


# 30 log-spaced alphas from 1e-4 to 1e1, one independent fit per value.
alphas_range = np.logspace(-4, 1, 30)
n_nonzero = [_count_selected(penalty) for penalty in alphas_range]

plt.semilogx(alphas_range, n_nonzero, 'bo-', markersize=4)
# Vertical marker at the alpha selected by cross-validation.
plt.axvline(optimal_alpha, color='red', linestyle='--', label=f'Optimal Œ±')
plt.xlabel('Alpha')
plt.ylabel('–ö—ñ–ª—å–∫—ñ—Å—Ç—å –Ω–µ–Ω—É–ª—å–æ–≤–∏—Ö –∫–æ–µ—Ñ—ñ—Ü—ñ—î–Ω—Ç—ñ–≤')
plt.title('–°–µ–ª–µ–∫—Ü—ñ—è –æ–∑–Ω–∞–∫ –∑–∞–ª–µ–∂–Ω–æ –≤—ñ–¥ Alpha')
plt.legend()
plt.grid(True, alpha=0.3)

# Panel 6: Lasso residuals (true minus predicted) against the predicted
# values, with a dashed reference line at zero.
ax_resid = plt.subplot(2, 3, 6)
residuals = y_test - y_pred_lasso
ax_resid.scatter(y_pred_lasso, residuals, alpha=0.6, s=30)
ax_resid.axhline(y=0, color='red', linestyle='--')
ax_resid.set_xlabel('–ü—Ä–æ–≥–Ω–æ–∑–æ–≤–∞–Ω—ñ –∑–Ω–∞—á–µ–Ω–Ω—è')
ax_resid.set_ylabel('–ó–∞–ª–∏—à–∫–∏')
ax_resid.set_title('–ê–Ω–∞–ª—ñ–∑ –∑–∞–ª–∏—à–∫—ñ–≤ Lasso –º–æ–¥–µ–ª—ñ')
ax_resid.grid(True, alpha=0.3)

# Tidy the spacing between panels and display the figure.
plt.tight_layout()
plt.show()

# Additional analysis: comparison across different alpha values
print(f"\nüî¨ –î–æ–¥–∞—Ç–∫–æ–≤–∏–π –∞–Ω–∞–ª—ñ–∑ –≤–ø–ª–∏–≤—É —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü—ñ—ó:")
def _evaluate_alpha(penalty):
    """Fit a Lasso at `penalty` on the training data and score it on the test set.

    Returns one row for the comparison table: the alpha, test MSE, test R²
    and the number of non-zero coefficients.
    """
    model = Lasso(alpha=penalty, max_iter=2000).fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    return {
        'Alpha': penalty,
        'MSE': mean_squared_error(y_test, predictions),
        'R¬≤': r2_score(y_test, predictions),
        'N_Features': np.sum(model.coef_ != 0),
    }


# Three fixed penalties plus the cross-validated one, in the listed order.
alphas_demo = [0.001, optimal_alpha, 0.1, 1.0]
results_table = [_evaluate_alpha(penalty) for penalty in alphas_demo]

results_df = pd.DataFrame(results_table)
print(results_df.round(4))

# Print the closing summary. Only the selected-feature count, the total
# feature count and the chosen alpha are interpolated from computed values;
# everything else below is fixed text.
# NOTE(review): the string literals appear to be mis-encoded (UTF-8 Ukrainian
# rendered through a different code page) — confirm the file encoding.
print(f"\n‚úÖ –í–∏—Å–Ω–æ–≤–∫–∏:")
print(f"‚Ä¢ Lasso –∞–≤—Ç–æ–º–∞—Ç–∏—á–Ω–æ –≤–∏–±—Ä–∞–≤ {len(non_zero_features)} –Ω–∞–π–≤–∞–∂–ª–∏–≤—ñ—à–∏—Ö –æ–∑–Ω–∞–∫ –∑ {len(all_feature_names)}")
# NOTE(review): no placeholders here — the "10" is a literal, not derived from
# lasso.coef_. Presumably this states that all noise features were dropped;
# verify against the Noise_* rows of `coefficients` before trusting it.
print(f"‚Ä¢ –í–∏–¥–∞–ª–∏–≤ —É—Å—ñ 10 —à—É–º–Ω–∏—Ö –æ–∑–Ω–∞–∫ —Ç–∞ –±–∞–≥–∞—Ç–æ –Ω–µ—Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω–∏—Ö –≤–∑–∞—î–º–æ–¥—ñ–π")
# NOTE(review): static text as well — presumably a claim about accuracy being
# preserved; compare with the Lasso vs. linear-regression metrics printed earlier.
print(f"‚Ä¢ –ü–æ–∫—Ä–∞—â–∏–≤ —ñ–Ω—Ç–µ—Ä–ø—Ä–µ—Ç–∞–±–µ–ª—å–Ω—ñ—Å—Ç—å –º–æ–¥–µ–ª—ñ –±–µ–∑ –∑–Ω–∞—á–Ω–æ—ó –≤—Ç—Ä–∞—Ç–∏ —Ç–æ—á–Ω–æ—Å—Ç—ñ")
print(f"‚Ä¢ –û–ø—Ç–∏–º–∞–ª—å–Ω–∏–π alpha ({optimal_alpha:.6f}) –∑–∞–±–µ–∑–ø–µ—á—É—î –±–∞–ª–∞–Ω—Å –º—ñ–∂ —Ç–æ—á–Ω—ñ—Å—Ç—é —Ç–∞ –ø—Ä–æ—Å—Ç–æ—Ç–æ—é")

🏠 Аналіз цін на нерухомість за допомогою Lasso Regression
📊 Розмір датасету: (20640, 9)
📋 Ознаки: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
📈 Середня ціна: $2.07 (в сотнях тисяч)
🔧 Розширений набір ознак: 54 ознак

🔍 Пошук оптимального параметра регуляризації...
🎯 Оптимальний alpha: 0.006866

📈 Результати моделей:
Lasso Regression - MSE: 0.5584, R²: 0.5739
Linear Regression - MSE: 0.4650, R²: 0.6452

🎯 Lasso вибрав 20 з 54 ознак:
                  Feature  Lasso_Coef
6                Latitude   -0.783592
7               Longitude   -0.743563
0                  MedInc    0.442935
14     MedInc √ó Longitude   -0.274965
8       MedInc √ó HouseAge    0.187547
15    HouseAge √ó AveRooms   -0.136438
36                MedInc¬≤   -0