<a href="https://colab.research.google.com/github/YvHarshit/Salary-Prediction-Using-Ensemble-Learning/blob/main/Salary_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Individual Base Models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Ensemble Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingRegressor

import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [None]:

from google.colab import files
import io

uploaded_file_name = '/content/sample_data/Latest_Data_Science_Salaries.csv'

try:
    print(f"Please upload your '{uploaded_file_name}' file.")
    uploaded = files.upload()

    # Read the uploaded file into a pandas DataFrame
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_file_name]))
    print(f"'{uploaded_file_name}' uploaded and loaded successfully!")
    print("Dataset head:")
    print(df.head())
    print("\nDataset info:")
    df.info()

except FileNotFoundError:
    print(f"Error: '{uploaded_file_name}' not found. Please ensure the file is uploaded correctly.")
    print("If you're having trouble, ensure the file name matches exactly.")
except Exception as e:
    print(f"An error occurred during file upload or loading: {e}")
    # Create a dummy DataFrame if loading fails to allow code execution for demonstration
    print("Creating a dummy DataFrame for demonstration purposes.")
    data = {
        'work_year': [2023, 2023, 2022, 2023, 2023],
        'experience_level': ['SE', 'MI', 'EN', 'SE', 'EX'],
        'employment_type': ['FT', 'FT', 'FT', 'FT', 'FT'],
        'job_title': ['Data Scientist', 'Machine Learning Engineer', 'Data Analyst', 'Data Scientist', 'Director of Data Science'],
        'salary_currency': ['USD', 'USD', 'USD', 'USD', 'USD'],
        'salary': [150000, 100000, 80000, 160000, 250000],
        'salary_in_usd': [150000, 100000, 80000, 160000, 250000],
        'employee_residence': ['US', 'US', 'GB', 'CA', 'US'],
        'remote_ratio': [0, 0, 100, 0, 0],
        'company_location': ['US', 'US', 'GB', 'CA', 'US'],
        'company_size': ['L', 'M', 'S', 'L', 'L']
    }
    df = pd.DataFrame(data)
    print("Dummy DataFrame created.")
    print(df.head())

Please upload your '/content/sample_data/Latest_Data_Science_Salaries.csv' file.


Saving Latest_Data_Science_Salaries.csv to Latest_Data_Science_Salaries.csv
An error occurred during file upload or loading: '/content/sample_data/Latest_Data_Science_Salaries.csv'
Creating a dummy DataFrame for demonstration purposes.
Dummy DataFrame created.
   work_year experience_level employment_type                  job_title  \
0       2023               SE              FT             Data Scientist   
1       2023               MI              FT  Machine Learning Engineer   
2       2022               EN              FT               Data Analyst   
3       2023               SE              FT             Data Scientist   
4       2023               EX              FT   Director of Data Science   

  salary_currency  salary  salary_in_usd employee_residence  remote_ratio  \
0             USD  150000         150000                 US             0   
1             USD  100000         100000                 US             0   
2             USD   80000          80000           

In [None]:
print("\n--- Data Preprocessing ---")

# 3.1 Handle Missing Values
# Check for missing values
print("\nMissing values before handling:")
print(df.isnull().sum())


--- Data Preprocessing ---

Missing values before handling:
work_year             0
experience_level      0
employment_type       0
job_title             0
salary_currency       0
salary                0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64


In [None]:
# For simplicity, we'll drop rows with any missing values.
# In a real project, you might use imputation strategies (mean, median, mode)
# based on the nature of the missing data.
df.dropna(inplace=True)
print("\nMissing values after dropping rows:")
print(df.isnull().sum())
print(f"DataFrame shape after dropping NaNs: {df.shape}")


Missing values after dropping rows:
work_year             0
experience_level      0
employment_type       0
job_title             0
salary_currency       0
salary                0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
DataFrame shape after dropping NaNs: (5, 11)


In [None]:
# The target variable is 'salary_in_usd'.
target = 'salary_in_usd'
features = [col for col in df.columns if col not in ['salary', 'salary_currency', target]]

# Separate features into numerical and categorical
numerical_features = df[features].select_dtypes(include=np.number).columns.tolist()
categorical_features = df[features].select_dtypes(include='object').columns.tolist()

print(f"\nNumerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")


Numerical features: ['work_year', 'remote_ratio']
Categorical features: ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']


In [None]:
# Numerical pipeline: just scaling
numerical_transformer = StandardScaler()

# Categorical pipeline: one-hot encoding
# handle_unknown='ignore' will set unknown categories to all zeros, preventing errors
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("\nPreprocessing pipeline created.")


Preprocessing pipeline created.


In [None]:
# Numerical pipeline: just scaling
numerical_transformer = StandardScaler()

# Categorical pipeline: one-hot encoding
# handle_unknown='ignore' will set unknown categories to all zeros, preventing errors
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("\nPreprocessing pipeline created.")


Preprocessing pipeline created.


In [None]:
# Define Stacking Regressor
# The meta-model will learn how to combine the predictions of the base estimators.
# We'll use a subset of the base models for stacking to demonstrate.
stacked_regressor = StackingRegressor(
    estimators=[
        ('rf', RandomForestRegressor(n_estimators=50, random_state=42)), # Using fewer estimators for faster training in stacking
        ('xgb', xgb.XGBRegressor(n_estimators=50, random_state=42, use_label_encoder=False, eval_metric='rmse')),
        ('svr', SVR())
    ],
    final_estimator=Ridge(random_state=42), # Ridge Regression as the meta-model
    cv=3, # Number of cross-validation folds for stacking - Reduced for small dataset
    n_jobs=-1 # Use all available cores
)

# Define individual models
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')
lgb_model = lgb.LGBMRegressor(random_state=42)


# Create a dictionary of all models to train and evaluate
models = {
    "Linear Regression": Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearRegression())]),
    "Decision Tree": Pipeline(steps=[('preprocessor', preprocessor), ('regressor', DecisionTreeRegressor(random_state=42))]),
    "Support Vector Regressor": Pipeline(steps=[('preprocessor', preprocessor), ('regressor', SVR())]),
    "K-Nearest Neighbors": Pipeline(steps=[('preprocessor', preprocessor), ('regressor', KNeighborsRegressor(n_neighbors=3))]), # Reduced n_neighbors
    "Random Forest": Pipeline(steps=[('preprocessor', preprocessor), ('regressor', rf_model)]),
    "Gradient Boosting": Pipeline(steps=[('preprocessor', preprocessor), ('regressor', gb_model)]),
    "XGBoost": Pipeline(steps=[('preprocessor', preprocessor), ('regressor', xgb_model)]),
    "LightGBM": Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lgb_model)]),
    "Stacked Regressor": Pipeline(steps=[('preprocessor', preprocessor), ('regressor', stacked_regressor)])
}

print("\nModels defined and pipelines created.")


Models defined and pipelines created.


In [None]:
print("\n--- Model Training and Evaluation ---")

# Separate features (X) and target (y)
X = df[features]
y = df[target]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nData split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples).")

results = {}

# K-Fold Cross-Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\nTraining and evaluating {name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Perform K-Fold Cross-Validation
    # Note: cross_val_score will internally create new pipelines for each fold,
    # so the preprocessor will be fit correctly on each fold's training data.
    cv_mae_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_absolute_error', n_jobs=-1)
    cv_rmse_scores = np.sqrt(-cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1))
    cv_r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2', n_jobs=-1)

    results[name] = {
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'CV_MAE_Mean': cv_mae_scores.mean(),
        'CV_MAE_Std': cv_mae_scores.std(),
        'CV_RMSE_Mean': cv_rmse_scores.mean(),
        'CV_RMSE_Std': cv_rmse_scores.std(),
        'CV_R2_Mean': cv_r2_scores.mean(),
        'CV_R2_Std': cv_r2_scores.std()
    }

    print(f"{name} - Test Set Metrics:")
    print(f"  MAE: {mae:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  R-squared: {r2:.4f}")
    print(f"  Cross-Validation MAE: {cv_mae_scores.mean():.2f} (+/- {cv_mae_scores.std():.2f})")
    print(f"  Cross-Validation RMSE: {cv_rmse_scores.mean():.2f} (+/- {cv_rmse_scores.std():.2f})")
    print(f"  Cross-Validation R-squared: {cv_r2_scores.mean():.4f} (+/- {cv_r2_scores.std():.4f})")


--- Model Training and Evaluation ---

Data split into training (4 samples) and testing (1 samples).

Training and evaluating Linear Regression...
Linear Regression - Test Set Metrics:
  MAE: 86607.14
  RMSE: 86607.14
  R-squared: nan
  Cross-Validation MAE: 68583.77 (+/- 37218.07)
  Cross-Validation RMSE: 68583.77 (+/- 37218.07)
  Cross-Validation R-squared: nan (+/- nan)

Training and evaluating Decision Tree...
Decision Tree - Test Set Metrics:
  MAE: 50000.00
  RMSE: 50000.00
  R-squared: nan
  Cross-Validation MAE: 68000.00 (+/- 46647.62)
  Cross-Validation RMSE: 68000.00 (+/- 46647.62)
  Cross-Validation R-squared: nan (+/- nan)

Training and evaluating Support Vector Regressor...
Support Vector Regressor - Test Set Metrics:
  MAE: 54999.90
  RMSE: 54999.90
  R-squared: nan
  Cross-Validation MAE: 61999.78 (+/- 36551.39)
  Cross-Validation RMSE: 61999.78 (+/- 36551.39)
  Cross-Validation R-squared: nan (+/- nan)

Training and evaluating K-Nearest Neighbors...
K-Nearest Neighbors

In [None]:
# Display all results in a table
results_df = pd.DataFrame(results).T
print("\n--- All Model Evaluation Results ---")
print(results_df.round(4))

# Identify the best performing model based on R-squared (higher is better)
best_model_name = results_df['R2'].idxmax()
print(f"\nBest performing model on test set (based on R-squared): {best_model_name}")



--- All Model Evaluation Results ---
                                  MAE         RMSE  R2   CV_MAE_Mean  \
Linear Regression          86607.1429   86607.1429 NaN  6.858377e+04   
Decision Tree              50000.0000   50000.0000 NaN  6.800000e+04   
Support Vector Regressor   54999.9028   54999.9028 NaN  6.199978e+04   
K-Nearest Neighbors        86666.6667   86666.6667 NaN  5.666667e+04   
Random Forest              66400.0000   66400.0000 NaN  5.686000e+04   
Gradient Boosting          26443.6788   26443.6788 NaN  4.735733e+04   
XGBoost                    50000.0156   50000.0154 NaN  5.800001e+04   
LightGBM                   60000.0000   60000.0000 NaN  5.800000e+04   
Stacked Regressor         966307.5660  966307.5660 NaN  8.034279e+07   

                            CV_MAE_Std  CV_RMSE_Mean   CV_RMSE_Std  \
Linear Regression         3.721807e+04  6.858377e+04  3.721807e+04   
Decision Tree             4.664762e+04  6.800000e+04  4.664762e+04   
Support Vector Regressor  3.655

In [None]:
print("\n--- Predicting Salary with Sample Data ---")

# Get the best trained model
# Check if best_model_name is NaN due to small dataset issues and assign a default if necessary
if pd.isna(best_model_name):
    print("Warning: R-squared values were all NaN. Assigning 'Linear Regression' as the best model for demonstration.")
    best_model_name = 'Linear Regression'

best_model = models[best_model_name]

# Define a function to predict salary for new sample data
def predict_new_salary(model, sample_data):

    if isinstance(sample_data, dict):
        sample_df = pd.DataFrame([sample_data])
    elif isinstance(sample_data, list):
        sample_df = pd.DataFrame(sample_data)
    else:
        raise ValueError("sample_data must be a dictionary or a list of dictionaries.")

    # Ensure the sample data has the same columns as the training data features
    # Fill missing columns with NaN, which the preprocessor might handle or you can impute
    for col in features:
        if col not in sample_df.columns:
            sample_df[col] = np.nan # Or a default value if appropriate

    # Reorder columns to match the training data's feature order
    sample_df = sample_df[features]

    # Predict
    predicted_salary = model.predict(sample_df)
    return predicted_salary


--- Predicting Salary with Sample Data ---


In [None]:
# Here is Example of how to use the predict_new_salary function..
sample_new_data = {
    'work_year': 2024,
    'experience_level': 'SE',
    'employment_type': 'FT',
    'job_title': 'Data Scientist',
    'employee_residence': 'US',
    'remote_ratio': 0,
    'company_location': 'US',
    'company_size': 'M'
}

predicted_salary = predict_new_salary(best_model, sample_new_data)
print(f"\nPredicted salary for the sample data: ${predicted_salary[0]:,.2f}")



Predicted salary for the sample data: $179,017.86


In [None]:
sample_new_3 = {
    'work_year': 2025,
    'experience_level': 'EX', # Executive
    'employment_type': 'FT',
    'job_title': 'Director of Data Science',
    'salary_currency': 'GBP',
    'employee_residence': 'GB', # United Kingdom
    'remote_ratio': 0, # Fully in-person
    'company_location': 'GB',
    'company_size': 'L' # Large company
}
predicted_salary = predict_new_salary(best_model, sample_new_3)
print(f"\nPredicted salary for the sample data: ${predicted_salary[0]:,.2f}")


Predicted salary for the sample data: $306,339.29
