In [None]:
# Cell 1: Install required libraries
!pip install -q streamlit plotly joblib fpdf xgboost pyngrok


In [1]:
# Cell 2: Import necessary libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import os


In [None]:
# Cell 3: Upload dataset (Colab or Jupyter)
# Reads the first uploaded CSV into the notebook-level DataFrame `df`.
import pandas as pd
import io

try:
    # Colab: files.upload() blocks until the dialog is closed and returns
    # a mapping of {filename: raw bytes}.
    from google.colab import files
    uploaded = files.upload()
    if uploaded:
        # Only the first uploaded file is used.
        fn, file_bytes = next(iter(uploaded.items()))
        df = pd.read_csv(io.BytesIO(file_bytes))
        print(f"✅ Loaded: {fn}")
    else:
        # Nothing was selected: say so instead of leaving `df` silently undefined.
        print("⚠️ No file was uploaded - re-run this cell and choose a CSV file.")
except ImportError:
    # Jupyter
    import ipywidgets as widgets
    from IPython.display import display

    upload = widgets.FileUpload(accept='.csv', multiple=False)
    display(upload)

    def handle_upload(change):
        """Widget callback: parse the uploaded CSV into the global `df`.

        Runs whenever the FileUpload widget's `value` trait changes. `change`
        is the traitlets change payload; it is not inspected because the
        current value is read from the widget directly.
        """
        global df
        value = upload.value
        if not value:
            return
        if isinstance(value, dict):
            # ipywidgets 7.x layout: {filename: {..., 'content': bytes}}
            name, file_info = next(iter(value.items()))
        else:
            # ipywidgets 8.x layout: a tuple of dicts that carry 'name' and 'content'
            file_info = value[0]
            name = file_info['name']
        content = file_info['content']
        df = pd.read_csv(io.BytesIO(content))
        print(f"✅ Loaded: {name}")
        display(df.head())

    upload.observe(handle_upload, names='value')


In [None]:
# Cell 4: Preprocess data and save encoders
# Label-encodes the categorical columns, standardises every feature and writes
# the fitted preprocessing objects to disk for the Streamlit app to reload.
categorical_cols = ['Gender', 'Education', 'Department', 'JobTitle']
encoders = {}
encoder_classes = {}

# Encode into a copy so the uploaded `df` keeps its original labels. Encoding
# `df` in place made this cell non-idempotent: on a second run the encoders
# were fitted on the integer codes, so the saved classes were 0, 1, 2, ...
# instead of the real category names.
df_encoded = df.copy()

for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df[col])
    encoders[col] = le
    encoder_classes[col] = le.classes_.tolist()  # Save for decoding later

# Every column except the identifier and the target is used as a feature.
X = df_encoded.drop(['EmployeeID', 'Salary'], axis=1)
y = df_encoded['Salary']

# NOTE(review): the scaler is fitted on all rows, and X_scaled is only split
# into train/test in the training cell, so the held-out rows influence the
# scaling statistics. Confirm whether that is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save all artifacts
# The column order is persisted so inference can rebuild rows in the order the
# scaler was fitted with.
feature_columns = X.columns.tolist()
joblib.dump(feature_columns, "feature_columns.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(encoders, "encoders.pkl")
joblib.dump(encoder_classes, "encoder_classes.pkl")


In [4]:
# Cell 5: Train models and save
# Splits the scaled features, fits three regressors, saves each one to
# "<name>_model.pkl" and reports its R² on the held-out split.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
import joblib

# Split the data (80% train / 20% held-out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ✅ Save training data to display R² scores in Streamlit app
joblib.dump(X_train, "X_scaled_train.pkl")
joblib.dump(y_train, "y_train.pkl")

# Train models
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

# Fit and save each model, then score it on the held-out split. The test split
# was previously created but never used, so no generalisation estimate existed.
for name, model in models.items():
    model.fit(X_train, y_train)
    joblib.dump(model, f"{name}_model.pkl")
    held_out_r2 = r2_score(y_test, model.predict(X_test))
    print(f"{name}: held-out R² = {held_out_r2:.4f}")



In [5]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from datetime import datetime
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
import plotly.graph_objs as go

st.set_page_config(page_title="Salary Predictor", layout="wide")

# Load artifacts
scaler = joblib.load("scaler.pkl")
encoders = joblib.load("encoders.pkl")
feature_columns = joblib.load("feature_columns.pkl")
encoder_classes = joblib.load("encoder_classes.pkl")

models = {
    "Linear Regression": joblib.load("LinearRegression_model.pkl"),
    "Random Forest": joblib.load("RandomForest_model.pkl"),
    "XGBoost": joblib.load("XGBoost_model.pkl")
}

# Sidebar
st.sidebar.title("Salary Predictor")
model_name = st.sidebar.selectbox("Choose a model", list(models.keys()))

# Title
st.title("💼 Employee Salary Prediction App")

uploaded_file = st.file_uploader("📂 Upload Employee Dataset (CSV)", type=["csv"])
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.success("✅ Dataset successfully uploaded!")
    st.write("Preview of Uploaded Data:", df.head())

with st.form("input_form"):
    st.subheader("📝 Enter Employee Information")
    col1, col2 = st.columns(2)

    with col1:
        Gender = st.selectbox("Gender", encoders['Gender'].classes_)
        Education = st.selectbox("Education", encoders['Education'].classes_)
        Department = st.selectbox("Department", encoders['Department'].classes_)
        JobTitle = st.selectbox("Job Title", encoders['JobTitle'].classes_)

    with col2:
        Age = st.number_input("Age", min_value=18, max_value=65, value=30)
        WorkExperience = st.number_input("Work Experience", min_value=0, max_value=40, value=5)
        PerformanceScore = st.number_input("Performance Score", min_value=1.0, max_value=5.0, value=3.0)
        Certifications = st.number_input("Certifications", min_value=0, max_value=10, value=2)
        ProjectsHandled = st.number_input("Projects Handled", min_value=0, max_value=50, value=5)
        PreviousCompanyRating = st.number_input("Previous Company Rating", min_value=1.0, max_value=5.0, value=3.0)

    submit = st.form_submit_button("🚀 Predict Salary")

if submit:
    input_data_encoded = {
        'Gender': encoders['Gender'].transform([Gender])[0],
        'Education': encoders['Education'].transform([Education])[0],
        'Department': encoders['Department'].transform([Department])[0],
        'JobTitle': encoders['JobTitle'].transform([JobTitle])[0],
        'Age': Age,
        'WorkExperience': WorkExperience,
        'PerformanceScore': PerformanceScore,
        'Certifications': Certifications,
        'ProjectsHandled': ProjectsHandled,
        'PreviousCompanyRating': PreviousCompanyRating
    }

    input_df = pd.DataFrame([input_data_encoded])
    input_scaled = scaler.transform(input_df[feature_columns])
    model = models[model_name]
    prediction = model.predict(input_scaled)[0]

    # Prepare user-friendly display result
    display_data = {
        'Gender': Gender,
        'Education': Education,
        'Department': Department,
        'JobTitle': JobTitle,
        'Age': Age,
        'WorkExperience': WorkExperience,
        'PerformanceScore': PerformanceScore,
        'Certifications': Certifications,
        'ProjectsHandled': ProjectsHandled,
        'PreviousCompanyRating': PreviousCompanyRating,
        'Predicted Salary': prediction
    }
    result_df = pd.DataFrame([display_data])

    st.subheader("📊 Prediction Result")
    st.dataframe(result_df.style.format({'Predicted Salary': '${:,.2f}'}), use_container_width=True)

    # Explain prediction with SHAP
    st.subheader("🔍 Explanation of Prediction (SHAP Values)")

    X_train_shap = joblib.load("X_scaled_train.pkl")
    X_train_df = pd.DataFrame(X_train_shap, columns=feature_columns)
    input_df_named = pd.DataFrame(input_scaled, columns=feature_columns)

    explainer = shap.Explainer(model, X_train_df)
    shap_values = explainer(input_df_named)

    st.markdown("#### Top Factors Influencing This Prediction")
    top_features = np.argsort(-np.abs(shap_values.values[0]))[:5]
    for i in top_features:
        feature_name = feature_columns[i]
        impact = shap_values.values[0][i]
        sign = "increased" if impact > 0 else "decreased"
        st.write(f"- **{feature_name}** {sign} predicted salary by **${abs(impact):,.2f}**")

    fig, ax = plt.subplots(figsize=(8, 4))
    shap.plots.bar(shap_values, max_display=10, show=False)
    st.pyplot(fig)

        # ---------------------- Model Configuration & Accuracy ---------------------- #
    st.subheader("⚙️ Model Configuration & Performance")

    # Load training data to calculate R² scores
    y_train = joblib.load("y_train.pkl")
    model_scores = {}

    for name, m in models.items():
        y_pred_train = m.predict(X_train_shap)
        score = r2_score(y_train, y_pred_train)
        model_scores[name] = round(score, 4)

    # Gauge-like plot for the selected model
    selected_score = model_scores[model_name]
    fig_gauge = go.Figure(go.Indicator(
        mode="gauge+number",
        value=selected_score * 100,
        title={'text': f"{model_name} R² Score (%)"},
        gauge={
            'axis': {'range': [0, 100]},
            'bar': {'color': "#00cc96"},
            'steps': [
                {'range': [0, 50], 'color': "#ffe6e6"},
                {'range': [50, 75], 'color': "#ffffcc"},
                {'range': [75, 100], 'color': "#e6ffe6"},
            ],
        }
    ))
    st.plotly_chart(fig_gauge, use_container_width=True)

    # Accuracy comparison table
    st.markdown("### 📈 Model Accuracy Comparison (on training set)")
    accuracy_df = pd.DataFrame.from_dict(model_scores, orient='index', columns=["R² Score"])
    st.dataframe(accuracy_df.style.format({"R² Score": "{:.2%}"}), use_container_width=True)

    # Performance interpretation
    st.markdown("### 📋 Model Performance ")
    for name, score in model_scores.items():
        interpretation = ""
        if score >= 0.9:
            interpretation = " Excellent performance – fits data very well."
        elif score >= 0.75:
            interpretation = " Good performance – suitable for predictions."
        elif score >= 0.6:
            interpretation = " Moderate – might underperform on complex data."
        else:
            interpretation = " Low – not recommended unless improved."

        st.markdown(f"**{name}**: R² = **{score:.2%}** – {interpretation}")


Writing app.py


In [9]:
# Configure ngrok without writing the token into the notebook source.
# The token is taken from the NGROK_AUTH_TOKEN environment variable when it is
# set; otherwise it is requested through a hidden prompt, so it never ends up
# in the saved notebook or its outputs.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: "))



In [None]:
# Cell 7: 
from pyngrok import ngrok
!streamlit run app.py &>/content/logs.txt &  # For Colab
url = ngrok.connect(8501)
print("Public URL:", url)


In [None]:
!pkill -f streamlit
!pkill -f ngrok
