<a href="https://colab.research.google.com/github/antonychackotc/project-4/blob/main/1st_4m_final_future_pridiction_%26_streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Final Future Prediction**

In [10]:
import pandas as pd
import numpy as np
import joblib

# Previously saved preprocessing objects and the trained model, read from the
# current working directory. joblib.load unpickles its input, so only load
# files that come from a trusted source.

# Transformers applied to the two tenure-related numeric columns
log_transformer_total, log_transformer_year = (
    joblib.load("log_transformer_TotalWorkingYears.pkl"),
    joblib.load("log_transformer_YearsAtCompany.pkl"),
)

# Categorical encoders and the saved one-hot column list
label_encoders, ohe_columns = (
    joblib.load("label_encoders.pkl"),
    joblib.load("ohe_columns.pkl"),
)

# Lookup tables used for frequency encoding (JobRole, EducationField)
freq_encoding_map_job, freq_encoding_map_edu = (
    joblib.load("jobrole_freq_encoding.pkl"),
    joblib.load("EducationField_freq_encoding.pkl"),
)

# Trained Random Forest model
loaded_model = joblib.load("random_forest_best_1st.pkl")

def tenure_category(years):
    """Map a numeric tenure value to a tenure bucket label.

    Buckets use exclusive upper bounds: below 2 -> 'New', below 5 ->
    'Intermediate', below 10 -> 'Experienced', anything else -> 'Veteran'.
    """
    upper_bounds = ((2, 'New'), (5, 'Intermediate'), (10, 'Experienced'))
    for bound, label in upper_bounds:
        if years < bound:
            return label
    return 'Veteran'

# ✅ Example new input data: a single employee with raw, un-encoded values
new_data = pd.DataFrame({
    'MonthlyIncome': [5993],
    'TotalWorkingYears': [8],
    'OverTime': ['Yes'],  # Categorical
    'YearsAtCompany': [6],
    'Age': [41],
    'JobLevel': [2],
    'Department': ['Sales'],  # Categorical
    'JobRole': ['Sales Executive'],  # Categorical
    'StockOptionLevel': [0],
    'EducationField': ['Life Sciences'],  # Categorical
    'JobInvolvement': [3]
})

### **Step 1: Apply Preprocessing**
# Rename categorical values so they match the spelling the encoders expect
new_data['Department'] = new_data['Department'].replace({
    'Research & Development': 'Research_and_Development'
})

new_data['EducationField'] = new_data['EducationField'].replace({
    'Life Sciences': 'Life_Sciences'
})

# Log transformation (uses the fitted transformers loaded above)
new_data['TotalWorkingYears'] = log_transformer_total.transform(new_data[['TotalWorkingYears']])
new_data['YearsAtCompany'] = log_transformer_year.transform(new_data[['YearsAtCompany']])

# Tenure category transformation
# NOTE(review): YearsAtCompany has already been replaced by its transformed
# value here, so tenure_category() buckets the transformed number, not the raw
# years. If training bucketed raw years, this line must move above the
# transform — TODO confirm against the training notebook.
new_data['Tenure_Category'] = new_data['YearsAtCompany'].apply(tenure_category)

# Encode `Tenure_Category` as the single numeric column the model expects
if 'Tenure_Category' in label_encoders:
    new_data['Tenure_Category'] = label_encoders['Tenure_Category'].transform(new_data['Tenure_Category'])
else:
    # The previous one-hot fallback (get_dummies with drop_first=True on one
    # row) produced no columns at all, and the feature was then back-filled
    # with 0. Keep that result, but make it visible instead of silent.
    import warnings
    warnings.warn(
        "No label encoder found for 'Tenure_Category'; the feature is set to 0."
    )
    new_data['Tenure_Category'] = 0

# Frequency encoding (categories missing from the saved maps become 0)
new_data['JobRole_FreqEncoded'] = new_data['JobRole'].map(freq_encoding_map_job).fillna(0)
new_data['EducationField_FreqEncoded'] = new_data['EducationField'].map(freq_encoding_map_edu).fillna(0)

# Label Encoding (for OverTime)
new_data['OverTime'] = label_encoders['OverTime'].transform(new_data['OverTime'])

# One-Hot Encoding for Department.
# FIX: do NOT pass drop_first=True here. A one-row frame contains exactly one
# department, so drop_first removed the only indicator column and both
# Department_* features were always 0, whatever the input was. Encode the
# category that is present and let the reindex below supply the absent ones.
new_data_ohe = pd.get_dummies(new_data, columns=['Department'], dtype=int)

# Model input columns, in the order used below for prediction
required_features = [
    'MonthlyIncome', 'TotalWorkingYears', 'OverTime', 'YearsAtCompany', 'Age',
    'JobLevel', 'Department_Research_and_Development', 'Department_Sales',
    'Tenure_Category', 'JobRole_FreqEncoded', 'StockOptionLevel',
    'EducationField_FreqEncoded', 'JobInvolvement'
]

# Select and order the model inputs; any required column that is still
# missing is created with value 0, and every other column (raw strings, the
# indicator of a department that is not a model feature) is dropped.
new_data_ohe = new_data_ohe.reindex(columns=required_features, fill_value=0)

### **Step 2: Make Prediction**
prediction = loaded_model.predict(new_data_ohe)

if prediction[0] == 1:
    print("Predicted Output: [1] The employee is likely to leave (Attrition: Yes).")
else:
    print("Predicted Output: [0] The employee is likely to stay (Attrition: No).")

Predicted Output: [1] The employee is likely to leave (Attrition: Yes).


In [11]:
import pandas as pd
import numpy as np
import joblib

# Load the saved transformations and the trained Random Forest model in one
# pass. The files are read in the order listed, from the current working
# directory. joblib.load unpickles its input, so only load trusted files.
(
    log_transformer_total,
    log_transformer_year,
    label_encoders,
    ohe_columns,
    freq_encoding_map_job,
    freq_encoding_map_edu,
    loaded_model,
) = map(joblib.load, (
    "log_transformer_TotalWorkingYears.pkl",
    "log_transformer_YearsAtCompany.pkl",
    "label_encoders.pkl",
    "ohe_columns.pkl",
    "jobrole_freq_encoding.pkl",
    "EducationField_freq_encoding.pkl",
    "random_forest_best_1st.pkl",
))

def tenure_category(years):
    """Return the tenure bucket label for a numeric tenure value.

    'New' below 2, 'Intermediate' below 5, 'Experienced' below 10,
    otherwise 'Veteran'.
    """
    if years < 2:
        return 'New'
    if years < 5:
        return 'Intermediate'
    if years < 10:
        return 'Experienced'
    return 'Veteran'

# ✅ Example new input data: a single employee with raw, un-encoded values
new_data = pd.DataFrame({
    'MonthlyIncome': [2426],
    'TotalWorkingYears': [6],
    'OverTime': ['No'],  # Categorical
    'YearsAtCompany': [5],
    'Age': [35],
    'JobLevel': [1],
    'Department': ['Research & Development'],  # Categorical
    'JobRole': ['Laboratory Technician'],  # Categorical
    'StockOptionLevel': [1],
    'EducationField': ['Medical'],  # Categorical
    'JobInvolvement': [4]
})

### **Step 1: Apply Preprocessing**
# Rename categorical values so they match the spelling the encoders expect
new_data['Department'] = new_data['Department'].replace({
    'Research & Development': 'Research_and_Development'
})

new_data['EducationField'] = new_data['EducationField'].replace({
    'Life Sciences': 'Life_Sciences'
})

# Log transformation (uses the fitted transformers loaded above)
new_data['TotalWorkingYears'] = log_transformer_total.transform(new_data[['TotalWorkingYears']])
new_data['YearsAtCompany'] = log_transformer_year.transform(new_data[['YearsAtCompany']])

# Tenure category transformation
# NOTE(review): YearsAtCompany has already been replaced by its transformed
# value here, so tenure_category() buckets the transformed number, not the raw
# years. If training bucketed raw years, this line must move above the
# transform — TODO confirm against the training notebook.
new_data['Tenure_Category'] = new_data['YearsAtCompany'].apply(tenure_category)

# Encode `Tenure_Category` as the single numeric column the model expects
if 'Tenure_Category' in label_encoders:
    new_data['Tenure_Category'] = label_encoders['Tenure_Category'].transform(new_data['Tenure_Category'])
else:
    # The previous one-hot fallback (get_dummies with drop_first=True on one
    # row) produced no columns at all, and the feature was then back-filled
    # with 0. Keep that result, but make it visible instead of silent.
    import warnings
    warnings.warn(
        "No label encoder found for 'Tenure_Category'; the feature is set to 0."
    )
    new_data['Tenure_Category'] = 0

# Frequency encoding (categories missing from the saved maps become 0)
new_data['JobRole_FreqEncoded'] = new_data['JobRole'].map(freq_encoding_map_job).fillna(0)
new_data['EducationField_FreqEncoded'] = new_data['EducationField'].map(freq_encoding_map_edu).fillna(0)

# Label Encoding (for OverTime)
new_data['OverTime'] = label_encoders['OverTime'].transform(new_data['OverTime'])

# One-Hot Encoding for Department.
# FIX: do NOT pass drop_first=True here. A one-row frame contains exactly one
# department, so drop_first removed the only indicator column and both
# Department_* features were always 0, whatever the input was. Encode the
# category that is present and let the reindex below supply the absent ones.
new_data_ohe = pd.get_dummies(new_data, columns=['Department'], dtype=int)

# Model input columns, in the order used below for prediction
required_features = [
    'MonthlyIncome', 'TotalWorkingYears', 'OverTime', 'YearsAtCompany', 'Age',
    'JobLevel', 'Department_Research_and_Development', 'Department_Sales',
    'Tenure_Category', 'JobRole_FreqEncoded', 'StockOptionLevel',
    'EducationField_FreqEncoded', 'JobInvolvement'
]

# Select and order the model inputs; any required column that is still
# missing is created with value 0, and every other column (raw strings, the
# indicator of a department that is not a model feature) is dropped.
new_data_ohe = new_data_ohe.reindex(columns=required_features, fill_value=0)

### **Step 2: Make Prediction**
prediction = loaded_model.predict(new_data_ohe)

if prediction[0] == 1:
    print("Predicted Output: [1] The employee is likely to leave (Attrition: Yes).")
else:
    print("Predicted Output: [0] The employee is likely to stay (Attrition: No).")

Predicted Output: [0] The employee is likely to stay (Attrition: No).


# **Prediction Works Successfully**

In [12]:
#################################################################################################

# **Streamlit Application**

In [13]:
!pip install -q streamlit
!pip install -q localtunnel
!pip install -q pyngrok

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Could not find a version that satisfies the requirement localtunnel (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for localtunnel[0m[31m
[0m

In [14]:
%%writefile app1.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib

# Load saved transformations & model.
# Streamlit re-executes this script on every widget interaction, so the loads
# are wrapped in st.cache_resource to read the files once and reuse the objects
# on later runs instead of reloading them each time.
@st.cache_resource
def load_artifacts():
    """Load the saved preprocessing objects and the trained model.

    Returns:
        tuple: (log_transformer_total, log_transformer_year, label_encoders,
        ohe_columns, freq_encoding_map_job, freq_encoding_map_edu,
        loaded_model), in that order.

    Note:
        joblib.load unpickles its input; only ship trusted .pkl files with
        the app. The returned objects are shared between reruns, so they must
        be treated as read-only.
    """
    return (
        joblib.load("log_transformer_TotalWorkingYears.pkl"),
        joblib.load("log_transformer_YearsAtCompany.pkl"),
        joblib.load("label_encoders.pkl"),
        joblib.load("ohe_columns.pkl"),
        joblib.load("jobrole_freq_encoding.pkl"),
        joblib.load("EducationField_freq_encoding.pkl"),
        joblib.load("random_forest_best_1st.pkl"),
    )


(
    log_transformer_total,
    log_transformer_year,
    label_encoders,
    ohe_columns,
    freq_encoding_map_job,
    freq_encoding_map_edu,
    loaded_model,
) = load_artifacts()

def tenure_category(years):
    """Classify a numeric tenure value into one of four bucket labels.

    Upper bounds are exclusive: < 2 'New', < 5 'Intermediate',
    < 10 'Experienced'; everything else is 'Veteran'.
    """
    return (
        'New' if years < 2
        else 'Intermediate' if years < 5
        else 'Experienced' if years < 10
        else 'Veteran'
    )

# Streamlit App UI
st.title("Employee Attrition Prediction")
st.write("Enter employee details to predict if they will leave or stay.")

# User input fields
monthly_income = st.number_input("Monthly Income", min_value=1000, max_value=20000, value=6000)
total_working_years = st.number_input("Total Working Years", min_value=0, max_value=40, value=10)
overtime = st.selectbox("OverTime", ['Yes', 'No'])
years_at_company = st.number_input("Years At Company", min_value=0, max_value=40, value=5)
age = st.number_input("Age", min_value=18, max_value=65, value=30)
job_level = st.selectbox("Job Level", [1, 2, 3, 4, 5])
department = st.selectbox("Department", ['Research & Development', 'Sales', 'Human Resources'])
# Options come straight from the saved frequency maps, so every selectable
# value has an encoding.
job_role = st.selectbox("Job Role", list(freq_encoding_map_job.keys()))
stock_option_level = st.selectbox("Stock Option Level", [0, 1, 2, 3])
education_field = st.selectbox("Education Field", list(freq_encoding_map_edu.keys()))
job_involvement = st.selectbox("Job Involvement", [1, 2, 3, 4])

if st.button("Predict Attrition"):
    # Create a one-row DataFrame from the form input
    new_data = pd.DataFrame({
        'MonthlyIncome': [monthly_income],
        'TotalWorkingYears': [total_working_years],
        'OverTime': [overtime],
        'YearsAtCompany': [years_at_company],
        'Age': [age],
        'JobLevel': [job_level],
        'Department': [department],
        'JobRole': [job_role],
        'StockOptionLevel': [stock_option_level],
        'EducationField': [education_field],
        'JobInvolvement': [job_involvement]
    })

    # Data Preprocessing
    new_data['Department'] = new_data['Department'].replace({'Research & Development': 'Research_and_Development'})
    new_data['TotalWorkingYears'] = log_transformer_total.transform(new_data[['TotalWorkingYears']])
    new_data['YearsAtCompany'] = log_transformer_year.transform(new_data[['YearsAtCompany']])
    # NOTE(review): YearsAtCompany is already transformed at this point, so
    # tenure_category() buckets the transformed number, not the raw years the
    # user typed. If training bucketed raw years, compute this before the
    # transform — TODO confirm against the training notebook.
    new_data['Tenure_Category'] = new_data['YearsAtCompany'].apply(tenure_category)

    if 'Tenure_Category' in label_encoders:
        new_data['Tenure_Category'] = label_encoders['Tenure_Category'].transform(new_data['Tenure_Category'])
    else:
        # The previous one-hot fallback (get_dummies with drop_first=True on
        # one row) produced no columns, and the feature was back-filled with
        # 0. Keep that result, but surface it instead of hiding it.
        st.warning("No label encoder found for 'Tenure_Category'; the feature is set to 0.")
        new_data['Tenure_Category'] = 0

    new_data['JobRole_FreqEncoded'] = new_data['JobRole'].map(freq_encoding_map_job).fillna(0)
    new_data['EducationField_FreqEncoded'] = new_data['EducationField'].map(freq_encoding_map_edu).fillna(0)
    new_data['OverTime'] = label_encoders['OverTime'].transform(new_data['OverTime'])

    # FIX: no drop_first=True. On a one-row frame it removed the only
    # Department indicator, so both Department_* features were always 0 and
    # the Department select box had no influence on the prediction.
    new_data_ohe = pd.get_dummies(new_data, columns=['Department'], dtype=int)

    required_features = ['MonthlyIncome', 'TotalWorkingYears', 'OverTime', 'YearsAtCompany', 'Age',
                         'JobLevel', 'Department_Research_and_Development', 'Department_Sales',
                         'Tenure_Category', 'JobRole_FreqEncoded', 'StockOptionLevel',
                         'EducationField_FreqEncoded', 'JobInvolvement']

    # Select and order the model inputs; required columns that are absent
    # (e.g. the indicator of a department that was not chosen) become 0 and
    # all other columns are dropped.
    new_data_ohe = new_data_ohe.reindex(columns=required_features, fill_value=0)

    # Make Prediction
    prediction = loaded_model.predict(new_data_ohe)
    result = "The employee is likely to leave (Attrition: Yes)." if prediction[0] == 1 else "The employee is likely to stay (Attrition: No)."

    st.subheader("Prediction Result")
    st.success(result)


Writing app1.py


In [15]:
from pyngrok import ngrok

# Replace 'YOUR_AUTHTOKEN' with your actual ngrok authtoken
ngrok.set_auth_token("2t49imFFYgswbiILhxmRavW6AlI_5a5SfwjpV9f29CtGGGYiu")

# Run the Streamlit app in the background
!streamlit run app1.py &>/dev/null&

# Create a public URL using ngrok
try:
    public_url = ngrok.connect(8501)
    print(f"Streamlit app is running at {public_url}")
except Exception as e:
    print(f"Error: {e}")
    print("Trying to run with localtunnel")
    !streamlit run app1.py &>/content/logs.txt & npx localtunnel --port 8501

Streamlit app is running at NgrokTunnel: "https://1b45-34-125-155-82.ngrok-free.app" -> "http://localhost:8501"
