In [2]:
# Import necessary libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

# 1. File Inspection
# Show where the process is running: the workbook below is opened via a
# relative path, so the working directory decides whether it is found.
working_dir = os.getcwd()
print("Current working directory:", working_dir)
df = pd.read_excel('5000_rental_crm_leads.xlsx')

# Quick look at the raw leads: dimensions, column names and a few sample rows.
n_rows_cols = df.shape
column_names = df.columns.tolist()
print("\nDataset shape:", n_rows_cols)
print("\nColumns:", column_names)
print("\nFirst 5 rows:")
print(df.head())

# 2. Feature Engineering - Creating meaningful features from existing columns

def create_features(df):
    """Derive numeric lead features from the raw CRM columns.

    Reads ``budget_min``, ``budget_max``, ``preferred_area``, ``user_type``,
    ``bhk``, ``move_in_time`` and ``source`` and adds ``budget_range``,
    ``budget_mid``, ``budget_flexibility``, ``budget_mid_norm``,
    ``budget_range_norm``, ``area_popularity``, ``area_tier``,
    ``user_type_score``, ``bhk_score``, ``move_in_urgency``,
    ``source_quality`` and ``lead_quality_score``.

    NOTE: ``area_popularity`` and the ``*_norm`` columns are statistics of the
    frame passed in, so the same lead gets different values when this is
    called on a different subset of rows.

    Args:
        df: DataFrame of raw leads. It is copied, never modified.

    Returns:
        A new DataFrame holding the original columns plus the engineered ones.
    """
    df = df.copy()

    def _min_max(series):
        """Scale ``series`` to [0, 1]; a constant series maps to 0.0, not NaN."""
        low = series.min()
        span = series.max() - low
        if span == 0:
            # All values identical: avoid 0/0, which would turn every
            # downstream score that uses this column into NaN.
            return (series - low).astype(float)
        return (series - low) / span

    # 1. Budget Features
    # Width of the budget window. Some rows have budget_min > budget_max
    # (swapped on entry), so take the absolute gap to keep the width >= 0.
    df['budget_range'] = (df['budget_max'] - df['budget_min']).abs()

    # Budget midpoint (average budget); unaffected by a min/max swap.
    df['budget_mid'] = (df['budget_min'] + df['budget_max']) / 2

    # Budget flexibility (ratio of range to midpoint); the +1 keeps the
    # division defined when the midpoint is 0.
    df['budget_flexibility'] = df['budget_range'] / (df['budget_mid'] + 1)

    # Normalize budget features (0-1 scale)
    df['budget_mid_norm'] = _min_max(df['budget_mid'])
    df['budget_range_norm'] = _min_max(df['budget_range'])

    # 2. Area Features
    # Area popularity score: share of leads in this frame asking for the area.
    area_freq = df['preferred_area'].value_counts(normalize=True)
    df['area_popularity'] = df['preferred_area'].map(area_freq)

    # Area tier: <=2% of leads is Low, <=4% is Medium, anything above is High.
    df['area_tier'] = pd.cut(df['area_popularity'],
                            bins=[0, 0.02, 0.04, 1.0],
                            labels=['Low', 'Medium', 'High'])

    # 3. User Type Features
    # Ordinal score per user type (a hand-assigned ranking, not one-hot).
    user_type_mapping = {
        'Family': 3,
        'Working Professionals': 2,
        'Bachelor': 1,
        'Company Guest': 2
    }
    # Unknown or missing user types fall back to the lowest score.
    df['user_type_score'] = df['user_type'].map(user_type_mapping).fillna(1)

    # 4. BHK Features
    # BHK preference score (higher BHK = higher score); 3 is assumed to be
    # the largest BHK, so larger values would score above 1.
    df['bhk_score'] = df['bhk'] / 3.0

    # 5. Move-in Time Features
    # Urgency score: the sooner the move-in, the higher the score.
    move_in_mapping = {
        'Immediate': 4,
        'Within 15 Days': 3,
        '1 Month': 2,
        '2 Months': 1
    }
    df['move_in_urgency'] = df['move_in_time'].map(move_in_mapping).fillna(1)

    # 6. Source Features
    # Hand-assigned source quality ranking; unknown sources get the lowest.
    source_mapping = {
        'Referral': 4,
        'Website': 3,
        'WhatsApp': 3,
        'Facebook': 2,
        'Instagram': 2,
        'Google Ads': 1
    }
    df['source_quality'] = df['source'].map(source_mapping).fillna(1)

    # 7. Composite Features
    # Lead Quality Score: weighted sum (weights total 1.0) with each ordinal
    # score divided by its maximum so every term is on a 0-1 scale.
    df['lead_quality_score'] = (
        0.3 * df['budget_mid_norm'] +
        0.2 * df['area_popularity'] +
        0.2 * df['user_type_score'] / 3.0 +
        0.1 * df['bhk_score'] +
        0.1 * df['move_in_urgency'] / 4.0 +
        0.1 * df['source_quality'] / 4.0
    )

    return df

# Apply feature engineering
# create_features works on a copy, so the raw `df` stays untouched and the
# engineered columns exist only on `df_featured`.
df_featured = create_features(df)

# 3. Create Target Variable (y)
# Since there's no conversion data, create a meaningful target based on lead quality

def create_target_variable(df):
    """Label each lead as high quality (1) or not (0) using five rule checks.

    A lead is high quality when it meets at least three of: budget midpoint
    above the 60th percentile, area popularity above the 60th percentile,
    move-in urgency of 3 or more, source quality of 3 or more, and a user
    type of 'Family' or 'Working Professionals'.

    Args:
        df: DataFrame with ``budget_mid``, ``area_popularity``,
            ``move_in_urgency``, ``source_quality`` and ``user_type`` columns.
            It is copied, never modified.

    Returns:
        A copy of ``df`` with an integer ``is_high_quality`` column added.
    """
    labelled = df.copy()

    # Percentile cut-offs are taken from the frame being labelled.
    budget_cutoff = labelled['budget_mid'].quantile(0.6)
    popularity_cutoff = labelled['area_popularity'].quantile(0.6)

    criteria = (
        labelled['budget_mid'] > budget_cutoff,
        labelled['area_popularity'] > popularity_cutoff,
        labelled['move_in_urgency'] >= 3,
        labelled['source_quality'] >= 3,
        labelled['user_type'].isin(['Family', 'Working Professionals']),
    )

    # Count how many criteria each lead satisfies, then threshold at three.
    criteria_met = sum(flag.astype(int) for flag in criteria)
    labelled['is_high_quality'] = (criteria_met >= 3).astype(int)

    return labelled

# Attach the rule-based `is_high_quality` label to the engineered frame.
df_featured = create_target_variable(df_featured)

# 4. Prepare X and y

# Categorical inputs: the raw CRM categories plus the binned area tier.
categorical_features = ['source', 'preferred_area', 'user_type', 'area_tier']

# Model inputs: engineered numeric scores first, categorical columns last.
feature_cols = [
    'budget_mid_norm',
    'budget_range_norm',
    'budget_flexibility',
    'area_popularity',
    'user_type_score',
    'bhk_score',
    'move_in_urgency',
    'source_quality',
    'lead_quality_score',
    *categorical_features,
]

# Copy the slices so later edits to X / y cannot write back into df_featured.
X = df_featured.loc[:, feature_cols].copy()
y = df_featured.loc[:, 'is_high_quality'].copy()

print("\nFeature columns:", feature_cols)
print("\nTarget distribution:")
print(y.value_counts())
print(f"High-quality leads: {y.mean():.2%}")

# 5. Preprocessing

# Partition the feature columns by dtype so each group gets its own transformer.
numerical_cols = list(X.select_dtypes(include=[np.number]).columns)
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)

print(f"\nNumerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")

# Numeric branch: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become their own 'missing' level, then
# one-hot encode; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its branch; numeric outputs come first.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# 6. Split data
# Hold out 25% of the leads, stratified on the label so both partitions keep
# the same share of high-quality leads; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.25,
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Training target distribution: {y_train.value_counts(normalize=True)}")
print(f"Test target distribution: {y_test.value_counts(normalize=True)}")

# 7. Train Models
#
# Pipeline.fit fits its steps in place (no caching is configured), so giving
# both pipelines the very same `preprocessor` object would make them share one
# fitted transformer: re-fitting either model would silently change the
# preprocessing used by the other. The first pipeline keeps the original
# object (so `preprocessor` itself still ends up fitted) and the second gets
# an independent, unfitted copy with identical parameters.

# 7.1 RandomForest
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        random_state=42,
        class_weight='balanced'  # reweight classes to offset label imbalance
    ))
])

rf_model.fit(X_train, y_train)

# 7.2 XGBoost
xgb_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss'
    ))
])

xgb_model.fit(X_train, y_train)

# 8. Evaluate Models

def evaluate_model(model, X_test, y_test, model_name):
    """Print hold-out metrics for a fitted classifier and return its predictions.

    Prints accuracy, ROC AUC and the full classification report under a
    header containing ``model_name``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Features to evaluate on.
        y_test: True labels matching ``X_test``.
        model_name: Label used in the printed header.

    Returns:
        Tuple ``(y_pred, y_proba)``: the predicted labels and the second
        column of ``predict_proba`` (the positive class for a 0/1 target).
    """
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    print(f"\n{model_name} Evaluation:")

    accuracy = accuracy_score(y_test, predicted_labels)
    print(f"Accuracy: {accuracy:.4f}")

    # AUC is computed on probabilities, not on the thresholded labels.
    auc = roc_auc_score(y_test, positive_scores)
    print(f"ROC AUC: {auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, predicted_labels))

    return predicted_labels, positive_scores

# Evaluate both models
# Keep the hard predictions and the positive-class probabilities; the
# probabilities are reused further down to pick the better model by ROC AUC.
rf_pred, rf_proba = evaluate_model(rf_model, X_test, y_test, "RandomForest")
xgb_pred, xgb_proba = evaluate_model(xgb_model, X_test, y_test, "XGBoost")

# 9. Feature Importance

def get_feature_importance(model, feature_names):
    """Return the classifier's feature importances labelled with real names.

    The names are rebuilt from the fitted preprocessor rather than taken from
    ``feature_names``, because one-hot encoding expands each categorical
    input into several output columns. Each transformer is asked for its
    output names using the column list it was fitted on, so numeric features
    keep their real names instead of the generic ``x0, x1, ...`` that a
    scaler fitted on a bare array would report.

    Args:
        model: Fitted pipeline whose ``named_steps`` holds a ``'preprocessor'``
            (exposing ``transformers_``) and a ``'classifier'``.
        feature_names: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with ``feature`` and ``importance`` columns sorted by
        descending importance, or ``None`` when the classifier has no
        ``feature_importances_``.

    Raises:
        ValueError: If the number of recovered names differs from the number
            of importances.
    """
    classifier = model.named_steps['classifier']
    importances = getattr(classifier, 'feature_importances_', None)
    if importances is None:
        return None

    preprocessor = model.named_steps['preprocessor']

    # Walk the fitted transformers in order; this is the order in which their
    # outputs are concatenated, so names line up with the importances.
    all_features = []
    for _name, transformer, columns in preprocessor.transformers_:
        if isinstance(transformer, str):
            # 'drop' emits nothing. 'passthrough' forwards its inputs
            # unchanged (assumes `columns` holds names here, not positions).
            if transformer == 'passthrough':
                all_features.extend(columns)
            continue
        all_features.extend(transformer.get_feature_names_out(columns))

    if len(all_features) != len(importances):
        raise ValueError(
            f"Recovered {len(all_features)} feature names but the classifier "
            f"reports {len(importances)} importances"
        )

    # Create importance dataframe
    importance_df = pd.DataFrame({
        'feature': all_features,
        'importance': importances
    }).sort_values('importance', ascending=False)

    return importance_df

# Get feature importances
# One table per fitted pipeline, in the order RandomForest, XGBoost.
rf_importance, xgb_importance = (
    get_feature_importance(fitted, feature_cols) for fitted in (rf_model, xgb_model)
)

# The tables are already sorted by descending importance, so the first ten
# rows are the ten strongest features.
print("\nTop 10 Important Features (RandomForest):")
print(rf_importance.iloc[:10])

print("\nTop 10 Important Features (XGBoost):")
print(xgb_importance.iloc[:10])

# 10. Generate Lead Scores

# Select the better model based on AUC
rf_auc = roc_auc_score(y_test, rf_proba)
xgb_auc = roc_auc_score(y_test, xgb_proba)

# The comparison is strict, so a tie keeps RandomForest.
if xgb_auc > rf_auc:
    best_model = xgb_model
    best_model_name = "XGBoost"
    best_auc = xgb_auc
else:
    best_model = rf_model
    best_model_name = "RandomForest"
    best_auc = rf_auc

print(f"\nBest model: {best_model_name} (AUC: {best_auc:.4f})")

# Generate lead scores for all data
# NOTE: X includes the rows the model was trained on, so scores for those
# rows are more optimistic than scores for unseen leads.
all_scores = best_model.predict_proba(X)[:, 1]
# Positive-class probability rescaled to 0-100 with one decimal.
df_featured['lead_score'] = (all_scores * 100).round(1)

# Create lead categories
# Bins are right-inclusive: (0, 40] Cold, (40, 70] Warm, (70, 100] Hot.
# include_lowest=True closes the first bin on the left so a score of exactly
# 0.0 is labelled Cold instead of being left as NaN.
df_featured['lead_category'] = pd.cut(
    df_featured['lead_score'],
    bins=[0, 40, 70, 100],
    labels=['Cold', 'Warm', 'Hot'],
    include_lowest=True
)

# Display results
print("\nLead Score Distribution:")
print(df_featured['lead_category'].value_counts())

print("\nTop 20 Leads by Score:")
top_leads = df_featured.sort_values('lead_score', ascending=False).head(20)
print(top_leads[['lead_id', 'name', 'preferred_area', 'budget_mid', 'lead_score', 'lead_category']])

# 11. Save Results
output_file = 'lead_scoring_results.xlsx'

# Sheet name -> table, listed in the order the sheets are written:
# every scored lead, the top 20, then one importance table per model.
sheets = {
    'All_Leads_Scored': df_featured,
    'Top_20_Leads': top_leads,
    'RF_Feature_Importance': rf_importance,
    'XGB_Feature_Importance': xgb_importance,
}

# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"\nResults saved to: {output_file}")

Current working directory: C:\Users\HP\Desktop\data science\ML Project Rental New

Dataset shape: (5000, 10)

Columns: ['lead_id', 'name', 'phone', 'source', 'budget_min', 'budget_max', 'preferred_area', 'bhk', 'user_type', 'move_in_time']

First 5 rows:
   lead_id    name       phone     source  budget_min  budget_max  \
0        1  Ritesh  9621539141   Facebook       10000       18000   
1        2    Amit  9557465220  Instagram       20000       20000   
2        3  Anjali  9050891432  Instagram        7000       12000   
3        4   Priya  9124138450   Facebook        8000       25000   
4        5  Ayesha  9259367346  Instagram       20000       15000   

  preferred_area  bhk      user_type move_in_time  
0        Andheri    3         Family      1 Month  
1          Malad    3  Company Guest     2 Months  
2       Madhapur    2         Family      1 Month  
3          Thane    3       Bachelor     2 Months  
4        Kharadi    2  Company Guest     2 Months  

Feature columns: 