<a href="https://colab.research.google.com/github/aniketverma-14/Employee-Salary-Prediction/blob/main/Employee_Job_Change_%26_Salary_Growth_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()

Saving aug_train.csv to aug_train (6).csv


In [2]:
import pandas as pd
import numpy as np
import pickle, gzip, shutil
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, mean_absolute_error

In [3]:
df = pd.read_csv("aug_train.csv")

In [4]:
# Drop rows with missing target
df.dropna(subset=['target'], inplace=True)

In [5]:
# Add simulated salary based on experience
def salary_simulation(row):
    exp = row['experience']
    if exp == '<1':
        exp = 0.5
    elif exp == '>20':
        exp = 21
    else:
        try:
            exp = float(exp)
        except:
            exp = 2
    base = 3 + exp * np.random.uniform(0.4, 1.0)
    return round(base * 1e5, -3)

In [6]:
df['experience_numeric'] = df['experience'].replace({'<1': 0.5, '>20': 21}).astype(float)
df['current_salary'] = df.apply(salary_simulation, axis=1)
df['expected_salary'] = df['current_salary'] * df['experience_numeric'].apply(lambda x: 1 + np.random.uniform(0.15, 0.35))
df['salary_growth_percent'] = ((df['expected_salary'] - df['current_salary']) / df['current_salary']) * 100

In [7]:
cat_cols = ['gender', 'relevent_experience', 'enrolled_university', 'education_level',
            'major_discipline', 'company_size', 'company_type', 'last_new_job', 'city']

In [8]:
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

df.drop(['enrollee_id', 'experience', 'training_hours'], axis=1, inplace=True)

In [9]:
Xc = df.drop(['target', 'expected_salary', 'salary_growth_percent'], axis=1)
yc = df['target']
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.2, random_state=42)

In [10]:
clf = RandomForestClassifier()
clf.fit(Xc_train, yc_train)
print("Classification Report:\n", classification_report(yc_test, clf.predict(Xc_test)))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.88      0.86      2880
         1.0       0.57      0.49      0.53       952

    accuracy                           0.78      3832
   macro avg       0.71      0.68      0.69      3832
weighted avg       0.77      0.78      0.78      3832



In [11]:
import json

# Save column names
with open("feature_list.json", "w") as f:
    json.dump(Xc_train.columns.tolist(), f)

# Save model as .gz
import gzip
with gzip.open("job_change_model.pkl.gz", "wb") as f:
    pickle.dump(clf, f)

In [12]:
df_reg = df[df['target'] == 1]
Xr = df_reg.drop(['target', 'expected_salary', 'salary_growth_percent'], axis=1)
yr = df_reg['salary_growth_percent']
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.2, random_state=42)

In [13]:
# Save and compress models
with open("job_change_model.pkl", "wb") as f:
    pickle.dump(clf, f)

reg = RandomForestRegressor()
# Drop rows with NaN in yr_train
Xr_train_cleaned, yr_train_cleaned = Xr_train[~yr_train.isna()], yr_train[~yr_train.isna()]
reg.fit(Xr_train_cleaned, yr_train_cleaned)

with open("salary_growth_model.pkl", "wb") as f:
    pickle.dump(reg, f)

In [14]:
# Compress
with open("job_change_model.pkl", 'rb') as f_in, gzip.open("job_change_model.pkl.gz", 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [15]:
with open("salary_growth_model.pkl", 'rb') as f_in, gzip.open("salary_growth_model.pkl.gz", 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [16]:
# Download
from google.colab import files
files.download("job_change_model.pkl.gz")
files.download("salary_growth_model.pkl.gz")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
!pip install streamlit
import streamlit as st
import pandas as pd
import gzip, pickle



In [18]:
# Load models
with gzip.open("job_change_model.pkl.gz", "rb") as f:
    job_model = pickle.load(f)

with gzip.open("salary_growth_model.pkl.gz", "rb") as f:
    salary_model = pickle.load(f)

st.title("Job Change & Salary Growth Predictor")

2025-07-20 10:32:04.357 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [19]:
# Input form
experience = st.slider("Experience (years)", 0.0, 30.0, 2.0, 0.5)
salary = st.number_input("Current Salary (INR)", 100000, 5000000, step=10000)
education_level = st.selectbox("Education Level", ["Graduate", "Masters", "Phd", "High School", "Other"])
company_size = st.selectbox("Company Size", ["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"])
relevent_experience = st.selectbox("Relevant Experience", ["Yes", "No"])

2025-07-20 10:32:04.393 Session state does not function when running a script without `streamlit run`


In [20]:

# Encoding values
edu_map = {"High School": 0, "Graduate": 1, "Masters": 2, "Phd": 3, "Other": 4}
comp_map = {size: i for i, size in enumerate(["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"])}
rel_exp_map = {"No": 0, "Yes": 1}

In [21]:
input_df = pd.DataFrame([{
    'gender': 1,  # dummy
    'relevent_experience': rel_exp_map[relevent_experience],
    'enrolled_university': 0,
    'education_level': edu_map[education_level],
    'major_discipline': 1,
    'company_size': comp_map[company_size],
    'company_type': 1,
    'last_new_job': 1,
    'city': 1,
    'experience_numeric': experience,
    'current_salary': salary
}])

In [22]:
if st.button("Predict"):
    job_change = job_model.predict(input_df)[0]
    if job_change == 1:
        growth = salary_model.predict(input_df)[0]
        new_salary = salary * (1 + growth / 100)
        st.success("This person is likely to change jobs.")
        st.write(f"Estimated Salary Growth: **{growth:.2f}%**")
        st.write(f"New Estimated Salary: ₹{new_salary:,.0f}")
    else:
        st.warning("Unlikely to change jobs.")



In [23]:
with open("requirements.txt", "w") as f:
    f.write("streamlit\nscikit-learn\npandas\nnumpy")

print("requirements.txt created")

requirements.txt created


In [24]:
app_code = '''
import streamlit as st
import pickle
import pandas as pd
import gzip
import json

# Load the compressed models
with gzip.open("job_change_model.pkl.gz", "rb") as f:
    job_model = pickle.load(f)

with gzip.open("salary_growth_model.pkl.gz", "rb") as f:
    salary_model = pickle.load(f)

# Load feature names
with open("feature_list.json", "r") as f:
    feature_list = json.load(f)

st.set_page_config(page_title="Job Change and Salary Predictor")
st.title("Job Change and Salary Prediction Web App")

# Input form
experience = st.slider("Experience (years)", 0.0, 30.0, 2.0, 0.5)
salary = st.number_input("Current Salary (INR)", 100000, 5000000, step=10000)
education_level = st.selectbox("Education Level", ["Graduate", "Masters", "Phd", "High School", "Other"])
company_size = st.selectbox("Company Size", ["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"])
relevent_experience = st.selectbox("Relevant Experience", ["Yes", "No"])

# Encoding values (These should match the encoding used in the notebook)
edu_map = {"High School": 0, "Graduate": 1, "Masters": 2, "Phd": 3, "Other": 4}
comp_map = {size: i for i, size in enumerate(["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"])}
rel_exp_map = {"No": 0, "Yes": 1}

if st.button("Predict"):
    # Create input DataFrame with all features, using dummy values for those not in the form
    input_data = {
        'city': 1, # Dummy value
        'city_development_index': 0.8, # Dummy value
        'gender': 1,  # Dummy value
        'relevent_experience': rel_exp_map[relevent_experience],
        'enrolled_university': 0, # Dummy value
        'education_level': edu_map[education_level],
        'major_discipline': 1, # Dummy value
        'company_size': comp_map[company_size],
        'company_type': 1, # Dummy value
        'last_new_job': 1, # Dummy value
        'experience_numeric': experience,
        'current_salary': salary
    }

    input_df = pd.DataFrame([input_data])

    # Ensure the column order matches the training data
    input_df = input_df[feature_list]

    job_change = job_model.predict(input_df)[0]

    if job_change == 1:
        growth = salary_model.predict(input_df)[0]
        new_salary = salary * (1 + growth / 100)
        st.success("This person is likely to change jobs.")
        st.write(f"Estimated Salary Growth: **{growth:.2f}%**")
        st.write(f"New Estimated Salary: ₹{new_salary:,.0f}")
    else:
        st.warning("Unlikely to change jobs.")
'''
with open("app.py", "w") as f:
    f.write(app_code)

print("app.py created with all features")

app.py created with all features


In [25]:

import streamlit as st
import pandas as pd
import gzip
import pickle

# Load the compressed model
with gzip.open("job_change_model.pkl.gz", "rb") as f:
    model = pickle.load(f)

st.set_page_config(page_title="Salary Predictor")
st.title("Employee Salary Predictor")

experience = st.slider("Years of Experience", 0, 30, 2)

if st.button("Predict Salary"):
    input_df = pd.DataFrame([[experience]], columns=["experience"])
    prediction = model.predict(input_df)[0]
    st.success(f"Estimated Salary: ₹{int(prediction):,}")




In [26]:
features = ['education_level', 'relevent_experience', 'company_size', 'experience_numeric', 'current_salary']

# Save to a file
import json
with open("feature_list.json", "w") as f:
    json.dump(features, f)


In [27]:
reg = RandomForestRegressor()
# Drop rows with NaN in yr_train
Xr_train, yr_train = Xr_train[~yr_train.isna()], yr_train[~yr_train.isna()]
reg.fit(Xr_train, yr_train)
# Drop rows with NaN in yr_test
Xr_test, yr_test = Xr_test[~yr_test.isna()], yr_test[~yr_test.isna()]
yr_pred = reg.predict(Xr_test)
print("\nMAE Salary Growth %:", mean_absolute_error(yr_test, yr_pred))


MAE Salary Growth %: 5.138369405164441


In [28]:
!pip install pyngrok



In [29]:
from pyngrok import ngrok
import os

# Terminate open tunnels if any
ngrok.kill()

# Get ngrok authtoken from environment variables or Colab secrets
# Assumes you have saved your ngrok authtoken in Colab secrets as 'NGROK_AUTH_TOKEN'
# You can get your authtoken from https://dashboard.ngrok.com/auth
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN") or "308P5ZfGvOYidFoMrTTD8S81j8C_4zJE9NqPNBLGkofbTEaVC"

if NGROK_AUTH_TOKEN == "308P5ZfGvOYidFoMrTTD8S81j8C_4zJE9NqPNBLGkofbTEaVC":
  print("Please add your ngrok authtoken to Colab secrets or environment variables!")
else:
  ngrok.set_auth_token(NGROK_AUTH_TOKEN)
  # Open a tunnel to the Streamlit port
  public_url = ngrok.connect(8501)
  print(f" * ngrok tunnel available at: {public_url}")

Please add your ngrok authtoken to Colab secrets or environment variables!


In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd # Assuming pandas is needed for isna()
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder

# Code from cell XYM2pLBaUlwZ to define df
df = pd.read_csv("aug_train.csv")

# Code from cell WXsBQDtRUpgx to drop rows with missing target
df.dropna(subset=['target'], inplace=True)

# Code from cell JkMVobydUr3a and Ia4NLKgJUuyH to add simulated salary
def salary_simulation(row):
    exp = row['experience']
    if exp == '<1':
        exp = 0.5
    elif exp == '>20':
        exp = 21
    else:
        try:
            exp = float(exp)
        except:
            exp = 2
    base = 3 + exp * np.random.uniform(0.4, 1.0)
    return round(base * 1e5, -3)

df['experience_numeric'] = df['experience'].replace({'<1': 0.5, '>20': 21}).astype(float)
df['current_salary'] = df.apply(salary_simulation, axis=1)
df['expected_salary'] = df['current_salary'] * df['experience_numeric'].apply(lambda x: 1 + np.random.uniform(0.15, 0.35))
df['salary_growth_percent'] = ((df['expected_salary'] - df['current_salary']) / df['current_salary']) * 100

# Code from cell FemB3adAU1o3 to encode categorical features and drop columns
cat_cols = ['gender', 'relevent_experience', 'enrolled_university', 'education_level',
            'major_discipline', 'company_size', 'company_type', 'last_new_job', 'city']

le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

# Drop original 'experience' column and other unnecessary columns before splitting
df.drop(['enrollee_id', 'experience', 'training_hours'], axis=1, inplace=True)


# Code from cell 8ecDqyt7VCgb to define Xr_train, yr_train, Xr_test, yr_test
df_reg = df[df['target'] == 1]
Xr = df_reg.drop(['target', 'expected_salary', 'salary_growth_percent'], axis=1)
yr = df_reg['salary_growth_percent']
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.2, random_state=42)


reg = RandomForestRegressor()
# Drop rows with NaN in yr_train
Xr_train_cleaned, yr_train_cleaned = Xr_train[~yr_train.isna()], yr_train[~yr_train.isna()]
reg.fit(Xr_train_cleaned, yr_train_cleaned)
# Drop rows with NaN in yr_test
Xr_test_cleaned, yr_test_cleaned = Xr_test[~Xr_test.isna().any(axis=1)], yr_test[~Xr_test.isna().any(axis=1)]
yr_pred = reg.predict(Xr_test_cleaned)
print("\nMAE Salary Growth %:", mean_absolute_error(yr_test_cleaned, yr_pred))


MAE Salary Growth %: 5.026992016665521


In [32]:
!wget -q -O - ipv4.icanhazip.com

34.132.47.176


In [None]:
! streamlit run app.py & npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.132.47.176:8501[0m
[0m
[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0Kyour url is: https://every-carrots-dance.loca.lt
[31m──[0m[31m────────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m─────────────────────────[0m[31m──[0m
[31m [0m [2;33m/usr/local/lib/python3.11/dist-packages/streamlit/runtime/scriptrunner/[0m[1;33mexec_code.py[0m: [31m [0m
[31m [0m [94m128[0m in [92mexec_func_with_error_handling[0m                                                 [31m [0m
[31m [0m           

In [None]:
app_code = '''
import streamlit as st
import pickle
import pandas as pd
import gzip
import json

# Load the compressed models
with gzip.open("job_change_model.pkl.gz", "rb") as f:
    job_model = pickle.load(f)

with gzip.open("salary_growth_model.pkl.gz", "rb") as f:
    salary_model = pickle.load(f)

# Load feature names
with open("feature_list.json", "r") as f:
    feature_list = json.load(f)

st.set_page_config(page_title="Job Change and Salary Predictor")
st.title("Job Change and Salary Prediction Web App")

# Input form
experience = st.slider("Experience (years)", 0.0, 30.0, 2.0, 0.5)
salary = st.number_input("Current Salary (INR)", 100000, 5000000, step=10000)
education_level = st.selectbox("Education Level", ["Graduate", "Masters", "Phd", "High School", "Other"])
company_size = st.selectbox("Company Size", ["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"])
relevent_experience = st.selectbox("Relevant Experience", ["Yes", "No"])

# Encoding values (These should match the encoding used in the notebook)
edu_map = {"High School": 0, "Graduate": 1, "Masters": 2, "Phd": 3, "Other": 4}
comp_map = {size: i for i, size in enumerate(["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"])}
rel_exp_map = {"No": 0, "Yes": 1}

if st.button("Predict"):
    # Create input DataFrame with all features, using dummy values for those not in the form
    input_data = {
        'city': 1, # Dummy value
        'city_development_index': 0.8, # Dummy value
        'gender': 1,  # Dummy value
        'relevent_experience': rel_exp_map[relevent_experience],
        'enrolled_university': 0, # Dummy value
        'education_level': edu_map[education_level],
        'major_discipline': 1, # Dummy value
        'company_size': comp_map[company_size],
        'company_type': 1, # Dummy value
        'last_new_job': 1, # Dummy value
        'experience_numeric': experience,
        'current_salary': salary
    }

    input_df = pd.DataFrame([input_data])

    # Ensure the column order matches the training data
    input_df = input_df[feature_list]

    job_change = job_model.predict(input_df)[0]

    if job_change == 1:
        growth = salary_model.predict(input_df)[0]
        new_salary = salary * (1 + growth / 100)
        st.success("This person is likely to change jobs.")
        st.write(f"Estimated Salary Growth: **{growth:.2f}%**")
        st.write(f"New Estimated Salary: ₹{new_salary:,.0f}")
    else:
        st.warning("Unlikely to change jobs.")
'''
with open("app.py", "w") as f:
    f.write(app_code)

print("app.py created with all features")