<a href="https://colab.research.google.com/github/bhanuprakashpd7/predictive-analysis/blob/house-price-prediction/Untitled22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Household Income Prediction using Machine Learning
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Reproducible synthetic household data for demonstration
np.random.seed(42)
n_samples = 1000

# Draw the feature columns one at a time. The order of these calls matters:
# every draw consumes from the same seeded global NumPy stream.
data = {}
data['age'] = np.random.randint(25, 65, n_samples)
data['education_years'] = np.random.randint(12, 20, n_samples)
data['family_size'] = np.random.randint(1, 6, n_samples)
data['work_hours_per_week'] = np.random.randint(20, 60, n_samples)
data['years_experience'] = np.random.randint(0, 40, n_samples)
data['occupation'] = np.random.choice(
    ['Manager', 'Engineer', 'Teacher', 'Sales', 'Other'], n_samples)
data['location'] = np.random.choice(['Urban', 'Suburban', 'Rural'], n_samples)
data['home_ownership'] = np.random.choice(['Own', 'Rent'], n_samples)

# Deterministic part of the target: a per-unit weight for each numeric
# column plus a flat bonus for selected category levels.
numeric_weights = {
    'education_years': 3000,
    'work_hours_per_week': 800,
    'years_experience': 1200,
}
category_bonuses = [
    ('occupation', 'Manager', 15000),
    ('occupation', 'Engineer', 12000),
    ('location', 'Urban', 8000),
    ('home_ownership', 'Own', 5000),
]

income_signal = 0
for column, weight in numeric_weights.items():
    income_signal = income_signal + data[column] * weight
for column, level, bonus in category_bonuses:
    income_signal = income_signal + np.where(data[column] == level, bonus, 0)

# Gaussian noise is the last random draw and is added to the integer total
base_income = income_signal + np.random.normal(0, 5000, n_samples)

# Target column, floored at a minimum income of 25000
data['household_income'] = base_income.clip(min=25000)

# Assemble everything into a DataFrame (column order follows insertion order)
df = pd.DataFrame(data)

print("Dataset Overview:")
print(df.head())
print(f"\nDataset shape: {df.shape}")
print("\nIncome statistics:")
print(df['household_income'].describe())

# Data Preprocessing
section_rule = "=" * 50
print("\n" + section_rule)
print("DATA PREPROCESSING")
print(section_rule)

# One label encoder per categorical column; each is kept under its own name
# so it can be reused later to encode new inputs.
# NOTE(review): label encoding gives nominal categories an arbitrary integer
# order, which the linear model will treat as ordinal.
le_occupation, le_location, le_home = LabelEncoder(), LabelEncoder(), LabelEncoder()
for source_column, encoder in (
    ('occupation', le_occupation),
    ('location', le_location),
    ('home_ownership', le_home),
):
    df[f'{source_column}_encoded'] = encoder.fit_transform(df[source_column])

# Model inputs: numeric columns first, then the encoded categoricals
features = [
    'age',
    'education_years',
    'family_size',
    'work_hours_per_week',
    'years_experience',
    'occupation_encoded',
    'location_encoded',
    'home_ownership_encoded',
]

X = df[features]
y = df['household_income']

# Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Model Training and Evaluation
print("\n" + "=" * 50, "MODEL TRAINING & EVALUATION", "=" * 50, sep="\n")

# 1. Linear Regression: trained and scored on the standardized features
lr_model = LinearRegression().fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)

# 2. Random Forest: trained and scored on the unscaled feature frame
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
rf_pred = rf_model.predict(X_test)

# Evaluation metrics
def evaluate_model(y_true, y_pred, model_name):
    """
    Print and return regression error metrics for one model.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label used in the printed report header.

    Returns:
        Dict with keys 'RMSE', 'MAE' and 'R2'.
    """
    metrics = {
        # RMSE is the square root of the mean squared error
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }

    print(f"\n{model_name} Results:")
    print(f"RMSE: ${metrics['RMSE']:,.2f}")
    print(f"MAE: ${metrics['MAE']:,.2f}")
    print(f"R² Score: {metrics['R2']:.4f}")

    return metrics

# Score both models on the held-out test targets
lr_results = evaluate_model(y_test, lr_pred, "Linear Regression")
rf_results = evaluate_model(y_test, rf_pred, "Random Forest")

# Feature Importance (Random Forest)
print("\n" + "=" * 30, "FEATURE IMPORTANCE", "=" * 30, sep="\n")

# Pair each feature name with the forest's importance score, highest first
importance_table = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print(feature_importance)

# Prediction function
def predict_household_income(age, education_years, family_size, work_hours,
                           years_exp, occupation, location, home_ownership):
    """
    Predict household income for one new household with both trained models.

    Relies on the module-level fitted objects ``le_occupation``,
    ``le_location``, ``le_home``, ``scaler``, ``lr_model`` and ``rf_model``,
    and on the module-level ``features`` list for the column names/order.

    Args:
        age: Value for the 'age' feature.
        education_years: Value for the 'education_years' feature.
        family_size: Value for the 'family_size' feature.
        work_hours: Value for the 'work_hours_per_week' feature.
        years_exp: Value for the 'years_experience' feature.
        occupation: Occupation label; passed to ``le_occupation.transform``.
        location: Location label; passed to ``le_location.transform``.
        home_ownership: Ownership label; passed to ``le_home.transform``.

    Returns:
        Dict with keys 'Linear Regression' and 'Random Forest', each mapped
        to that model's predicted income for the given household.

    Raises:
        ValueError: presumably raised by ``LabelEncoder.transform`` when a
            categorical value was not seen during fitting (scikit-learn
            behavior; not handled here).
    """
    # Encode categorical variables with the encoders fitted on the training data
    occ_encoded = le_occupation.transform([occupation])[0]
    loc_encoded = le_location.transform([location])[0]
    home_encoded = le_home.transform([home_ownership])[0]

    # Build a one-row DataFrame that carries the training column names.
    # The scaler and the random forest were fitted on DataFrames, so passing
    # a bare ndarray makes scikit-learn warn about missing feature names and
    # leaves the column order checked by position only.
    features_frame = pd.DataFrame(
        [[age, education_years, family_size, work_hours,
          years_exp, occ_encoded, loc_encoded, home_encoded]],
        columns=features,
    )

    # Linear regression was trained on standardized inputs
    features_scaled = scaler.transform(features_frame)

    # Random forest was trained on the unscaled feature frame
    lr_prediction = lr_model.predict(features_scaled)[0]
    rf_prediction = rf_model.predict(features_frame)[0]

    return {
        'Linear Regression': lr_prediction,
        'Random Forest': rf_prediction
    }

# Example predictions
print("\n" + "=" * 30, "EXAMPLE PREDICTIONS", "=" * 30, sep="\n")

# Two sample households; the keys match predict_household_income's parameters
examples = [
    dict(
        age=35, education_years=16, family_size=3, work_hours=45,
        years_exp=10, occupation='Engineer', location='Urban',
        home_ownership='Own',
    ),
    dict(
        age=28, education_years=14, family_size=2, work_hours=40,
        years_exp=5, occupation='Teacher', location='Suburban',
        home_ownership='Rent',
    ),
]

for example_number, household in enumerate(examples, start=1):
    print(f"\nExample {example_number}: {household}")
    predictions = predict_household_income(**household)
    lr_estimate = predictions['Linear Regression']
    rf_estimate = predictions['Random Forest']
    print(f"Predicted Income - LR: ${lr_estimate:,.2f}")
    print(f"Predicted Income - RF: ${rf_estimate:,.2f}")

# Model Comparison Summary
print("\n" + "=" * 40, "MODEL COMPARISON SUMMARY", "=" * 40, sep="\n")

# One row per model, built from the metric dicts returned by evaluate_model
comparison_df = pd.DataFrame([
    {
        'Model': model_label,
        'RMSE': scores['RMSE'],
        'MAE': scores['MAE'],
        'R²': scores['R2'],
    }
    for model_label, scores in (
        ('Linear Regression', lr_results),
        ('Random Forest', rf_results),
    )
])

print(comparison_df)

# Tips for improvement
print("\n" + "=" * 30, "TIPS FOR IMPROVEMENT", "=" * 30, sep="\n")

improvement_tips = (
    "Collect more diverse and representative data",
    "Engineer new features (e.g., age groups, income brackets)",
    "Try advanced models (XGBoost, Neural Networks)",
    "Perform hyperparameter tuning",
    "Handle outliers and missing values more carefully",
    "Use cross-validation for better model evaluation",
    "Consider ensemble methods combining multiple models",
)
# Number the tips starting from 1 when printing
for tip_number, tip_text in enumerate(improvement_tips, start=1):
    print(f"{tip_number}. {tip_text}")

Dataset Overview:
   age  education_years  family_size  work_hours_per_week  years_experience  \
0   63               17            2                   56                22   
1   53               19            4                   40                 4   
2   39               12            4                   27                30   
3   32               17            3                   28                 5   
4   45               14            3                   34                36   

  occupation  location home_ownership  household_income  
0   Engineer     Urban           Rent     149534.366451  
1      Sales  Suburban            Own     101496.402254  
2      Sales     Rural           Rent      91061.333484  
3    Teacher  Suburban            Own      75102.933370  
4    Manager  Suburban            Own     133545.734172  

Dataset shape: (1000, 9)

Income statistics:
count      1000.000000
mean     111875.499553
std       19628.386101
min       55922.291852
25%       98331.16338

