<a href="https://colab.research.google.com/github/aniketverma-14/Colab-Whisper-Transcriber-AI/blob/main/Employee_Job_Change_%26_Salary_Growth_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# Colab-only upload step. The recorded output of this cell shows aug_train.csv
# being saved to the working directory, which is where the next cell reads it from.
from google.colab import files
uploaded = files.upload()

Saving aug_train.csv to aug_train.csv


In [22]:
import pandas as pd
import numpy as np
import pickle, gzip, shutil
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, mean_absolute_error

In [23]:
# Load the uploaded training CSV into the working DataFrame used by every later cell.
df = pd.read_csv("aug_train.csv")

In [24]:
# Keep only labelled rows: a row without `target` cannot be used for training.
df = df.dropna(subset=['target'])

In [25]:
# Add simulated salary based on experience
def salary_simulation(row, rng=None):
    """Simulate a current salary from a row's ``experience`` field.

    The raw value is mapped to a number of years: ``'<1'`` becomes 0.5,
    ``'>20'`` becomes 21 and anything else is parsed with ``float``. Values
    that cannot be parsed, or that are missing (NaN), fall back to 2 years so
    the simulated salary is never NaN.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'experience'`` (a DataFrame row or a dict).
    rng : optional
        Object exposing ``uniform(low, high)``, e.g. a seeded
        ``numpy.random.Generator``. Defaults to NumPy's global random state.

    Returns
    -------
    float
        ``(3 + years * u) * 1e5`` rounded to the nearest thousand, where ``u``
        is drawn uniformly between 0.4 and 1.0.
    """
    fallback_years = 2.0
    raw = row['experience']
    if raw == '<1':
        years = 0.5
    elif raw == '>20':
        years = 21.0
    else:
        try:
            years = float(raw)
        except (TypeError, ValueError):
            # Only parse failures are expected here; anything else should surface.
            years = fallback_years
        # float(nan) succeeds without raising, so missing values need an
        # explicit check to receive the same fallback as unparseable ones.
        if np.isnan(years):
            years = fallback_years
    source = np.random if rng is None else rng
    base = 3 + years * source.uniform(0.4, 1.0)
    return round(base * 1e5, -3)

In [26]:
# Seed NumPy's global RNG so the simulated salary columns (and everything
# trained on them) are identical on every Restart & Run All.
np.random.seed(42)

# Numeric years of experience: '<1' -> 0.5, '>20' -> 21, other labels cast to float.
df['experience_numeric'] = df['experience'].replace({'<1': 0.5, '>20': 21}).astype(float)
df['current_salary'] = df.apply(salary_simulation, axis=1)

# Expected salary is the current salary raised by a random 15-35 %.
# One vectorised draw per row replaces a per-row apply whose lambda ignored its argument.
growth_factor = 1 + np.random.uniform(0.15, 0.35, size=len(df))
df['expected_salary'] = df['current_salary'] * growth_factor

# NOTE(review): by construction this equals (growth_factor - 1) * 100, i.e. a
# uniform draw that does not depend on any feature, so a model cannot learn
# more than its mean from the other columns.
df['salary_growth_percent'] = ((df['expected_salary'] - df['current_salary']) / df['current_salary']) * 100

In [27]:
# Columns that hold categorical labels and get integer-encoded before modelling.
cat_cols = [
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'company_size',
    'company_type',
    'last_new_job',
    'city',
]

In [29]:
# Integer-encode every categorical column with its own LabelEncoder and keep
# the fitted encoders. Refitting one shared encoder per column discards each
# column's category -> code mapping, which is needed to encode new inputs the
# same way at prediction time (e.g. encoders['education_level'].classes_).
# Values are cast to str first, so missing entries become the category 'nan'.
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

# Remove the identifier, the raw experience label (superseded by
# experience_numeric) and training_hours from the modelling frame.
df = df.drop(columns=['enrollee_id', 'experience', 'training_hours'])

In [30]:
# Classifier data: the label is `target`; the two columns derived from the
# simulated expected salary are excluded from the features.
non_feature_cols = ['target', 'expected_salary', 'salary_growth_percent']
yc = df['target']
Xc = df.drop(columns=non_feature_cols)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc,
    yc,
    test_size=0.2,
    random_state=42,
)

In [31]:
# A fixed random_state makes the forest, and therefore the report below,
# reproducible across runs, in line with the seeded train/test split.
clf = RandomForestClassifier(random_state=42)
clf.fit(Xc_train, yc_train)
yc_pred = clf.predict(Xc_test)
print("Classification Report:\n", classification_report(yc_test, yc_pred))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.83      0.89      0.86      2880
         1.0       0.57      0.45      0.50       952

    accuracy                           0.78      3832
   macro avg       0.70      0.67      0.68      3832
weighted avg       0.77      0.78      0.77      3832



In [32]:
# Regression data: only the rows labelled target == 1, with the simulated
# growth percentage as the value to predict.
is_changer = df['target'] == 1
df_reg = df.loc[is_changer]
yr = df_reg['salary_growth_percent']
Xr = df_reg.drop(columns=['target', 'expected_salary', 'salary_growth_percent'])
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr,
    yr,
    test_size=0.2,
    random_state=42,
)

In [35]:
# A fixed random_state keeps the fitted forest and the reported MAE reproducible.
reg = RandomForestRegressor(random_state=42)

# Drop rows with NaN in yr_train (mask computed once and applied to both X and y)
train_has_target = yr_train.notna()
Xr_train, yr_train = Xr_train[train_has_target], yr_train[train_has_target]
reg.fit(Xr_train, yr_train)

# Drop rows with NaN in yr_test
test_has_target = yr_test.notna()
Xr_test, yr_test = Xr_test[test_has_target], yr_test[test_has_target]

yr_pred = reg.predict(Xr_test)
# NOTE(review): the target is simulated as a uniform 15-35 % draw, so an MAE
# close to 5 is what always predicting the mean would score; confirm the model
# adds anything before relying on it.
print("\nMAE Salary Growth %:", mean_absolute_error(yr_test, yr_pred))


MAE Salary Growth %: 4.99247155040975


In [36]:
# Save both fitted models as plain pickle files (compressed in the next cells).
for fitted_model, pickle_path in (
    (clf, "job_change_model.pkl"),
    (reg, "salary_growth_model.pkl"),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(fitted_model, f)

In [37]:
# Gzip the classifier pickle by streaming it into a .gz file next to it.
with open("job_change_model.pkl", 'rb') as f_in:
    with gzip.open("job_change_model.pkl.gz", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [38]:
# Gzip the regressor pickle the same way as the classifier pickle.
with open("salary_growth_model.pkl", 'rb') as f_in:
    with gzip.open("salary_growth_model.pkl.gz", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [39]:
# Download both compressed model archives through the Colab files helper.
from google.colab import files
for archive_name in ("job_change_model.pkl.gz", "salary_growth_model.pkl.gz"):
    files.download(archive_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [42]:
!pip install streamlit
import streamlit as st
import pandas as pd
import gzip, pickle



In [43]:
# Load models
def _read_gzipped_pickle(path):
    """Return the object unpickled from the gzip-compressed file at ``path``."""
    # pickle can execute code while loading; these archives are the ones
    # written by the compression cells of this notebook.
    with gzip.open(path, "rb") as handle:
        return pickle.load(handle)


job_model = _read_gzipped_pickle("job_change_model.pkl.gz")
salary_model = _read_gzipped_pickle("salary_growth_model.pkl.gz")

st.title("Job Change & Salary Growth Predictor")

2025-07-19 06:23:31.840 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [44]:
# Input form: one widget per feature the user is allowed to set.
experience = st.slider(
    "Experience (years)",
    min_value=0.0,
    max_value=30.0,
    value=2.0,
    step=0.5,
)
salary = st.number_input(
    "Current Salary (INR)",
    min_value=100000,
    max_value=5000000,
    step=10000,
)
education_level = st.selectbox(
    "Education Level",
    options=["Graduate", "Masters", "Phd", "High School", "Other"],
)
company_size = st.selectbox(
    "Company Size",
    options=["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"],
)
relevent_experience = st.selectbox(
    "Relevant Experience",
    options=["Yes", "No"],
)

2025-07-19 06:23:43.686 Session state does not function when running a script without `streamlit run`


In [45]:
# Encoding values: hand-written integer codes for the form's choices.
# NOTE(review): training encodes these columns with LabelEncoder, which numbers
# categories in sorted order of the values seen in the data; these codes are
# not derived from that fit, so confirm they match before trusting predictions.
edu_map = dict(zip(["High School", "Graduate", "Masters", "Phd", "Other"], range(5)))
company_sizes = ["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"]
comp_map = {label: position for position, label in enumerate(company_sizes)}
rel_exp_map = dict(No=0, Yes=1)

In [46]:
# One-row feature frame for prediction. Values chosen in the form are encoded
# through the lookup maps; features without a widget get fixed placeholder codes.
feature_row = dict(
    gender=1,  # dummy
    relevent_experience=rel_exp_map[relevent_experience],
    enrolled_university=0,
    education_level=edu_map[education_level],
    major_discipline=1,
    company_size=comp_map[company_size],
    company_type=1,
    last_new_job=1,
    city=1,
    experience_numeric=experience,
    current_salary=salary,
)
input_df = pd.DataFrame([feature_row])

In [47]:
def _align_to_model(frame, model):
    """Return ``frame`` with exactly the columns ``model`` was fitted on, in fit order.

    Models fitted on a DataFrame expose ``feature_names_in_`` and validate the
    incoming column names at predict time. The form builds its frame by hand,
    so it is re-ordered here; any fit-time column the form does not collect is
    filled with a placeholder 0 and reported to the user.
    """
    fitted_names = getattr(model, "feature_names_in_", None)
    if fitted_names is None:
        # Model was fitted without column names: nothing to align against.
        return frame
    fitted_names = list(fitted_names)
    absent = [name for name in fitted_names if name not in frame.columns]
    if absent:
        st.info("Using placeholder 0 for features not collected by the form: " + ", ".join(absent))
    return frame.reindex(columns=fitted_names, fill_value=0)


if st.button("Predict"):
    job_change = job_model.predict(_align_to_model(input_df, job_model))[0]
    if job_change == 1:
        # Growth is only estimated for people predicted to change jobs.
        growth = salary_model.predict(_align_to_model(input_df, salary_model))[0]
        new_salary = salary * (1 + growth / 100)
        st.success("This person is likely to change jobs.")
        st.write(f"Estimated Salary Growth: **{growth:.2f}%**")
        st.write(f"New Estimated Salary: ₹{new_salary:,.0f}")
    else:
        st.warning("Unlikely to change jobs.")



In [48]:
# Write the app's dependency list: one package per line, no trailing newline.
required_packages = ["streamlit", "scikit-learn", "pandas", "numpy"]
with open("requirements.txt", "w") as f:
    f.write("\n".join(required_packages))

print("requirements.txt created")

requirements.txt created


In [49]:
# Source of the standalone Streamlit app. It mirrors the interactive Streamlit
# cells of this notebook: it loads both gzip-compressed models, collects the
# same inputs, and reports the job-change prediction plus the estimated salary
# growth. The app expects job_change_model.pkl.gz and salary_growth_model.pkl.gz
# to sit next to app.py.
app_code = '''
import gzip
import pickle

import pandas as pd
import streamlit as st

# Kept as the first Streamlit call in the script.
st.set_page_config(page_title="Salary Predictor")


def load_model(path):
    """Unpickle a gzip-compressed model file."""
    # pickle can execute code while loading: only ship model files that were
    # produced by the training notebook.
    with gzip.open(path, "rb") as f:
        return pickle.load(f)


def align_to_model(frame, model):
    """Return frame with the columns the model was fitted on, in fit order."""
    fitted_names = getattr(model, "feature_names_in_", None)
    if fitted_names is None:
        return frame
    # Fit-time columns the form does not collect get a placeholder 0.
    return frame.reindex(columns=list(fitted_names), fill_value=0)


job_model = load_model("job_change_model.pkl.gz")
salary_model = load_model("salary_growth_model.pkl.gz")

st.title("Job Change & Salary Growth Predictor")

# Input form
experience = st.slider("Experience (years)", 0.0, 30.0, 2.0, 0.5)
salary = st.number_input("Current Salary (INR)", 100000, 5000000, step=10000)
education_level = st.selectbox("Education Level", ["Graduate", "Masters", "Phd", "High School", "Other"])
company_size = st.selectbox("Company Size", ["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"])
relevent_experience = st.selectbox("Relevant Experience", ["Yes", "No"])

# Encoding values
# NOTE(review): confirm these codes match the encoders used during training.
edu_map = {"High School": 0, "Graduate": 1, "Masters": 2, "Phd": 3, "Other": 4}
comp_map = {size: i for i, size in enumerate(["<10", "10-49", "50-99", "100-500", "500-999", "1000-4999", "5000-9999", "10000+"])}
rel_exp_map = {"No": 0, "Yes": 1}

input_df = pd.DataFrame([{
    "gender": 1,  # dummy
    "relevent_experience": rel_exp_map[relevent_experience],
    "enrolled_university": 0,
    "education_level": edu_map[education_level],
    "major_discipline": 1,
    "company_size": comp_map[company_size],
    "company_type": 1,
    "last_new_job": 1,
    "city": 1,
    "experience_numeric": experience,
    "current_salary": salary,
}])

if st.button("Predict"):
    job_change = job_model.predict(align_to_model(input_df, job_model))[0]
    if job_change == 1:
        growth = salary_model.predict(align_to_model(input_df, salary_model))[0]
        new_salary = salary * (1 + growth / 100)
        st.success("This person is likely to change jobs.")
        st.write(f"Estimated Salary Growth: **{growth:.2f}%**")
        st.write(f"New Estimated Salary: ₹{new_salary:,.0f}")
    else:
        st.warning("Unlikely to change jobs.")
'''
# Explicit UTF-8: the script contains a non-ASCII currency symbol, which a
# platform-default encoding may be unable to write.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

print("app.py created")


app.py created
