In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.metrics import mean_squared_error, r2_score


In [11]:
# Load the employee dataset; the path is relative to the current working directory.
csv_path = "deepseek_csv_employee.csv"
df = pd.read_csv(csv_path)

In [12]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0,age,gender,education_level,job_experience,income,bonus,work_hours_per_week,company_size,industry,stress_level,job_satisfaction,city_tier,commute_time_minutes,remote_work_ratio,days_since_last_promotion,attrition_risk
0,25,Male,Bachelor,2,45000,5000,45,Medium,Technology,7,6,2,60,0.2,180,0
1,32,Female,Master,5,65000,8000,42,Large,Finance,5,8,1,45,0.3,90,0
2,28,Female,Bachelor,3,50000,4000,40,Small,Healthcare,6,7,2,50,0.1,240,1
3,45,Male,PhD,15,120000,15000,50,Large,Technology,8,9,1,30,0.5,30,0
4,22,Male,High School,1,30000,2000,35,Small,Retail,4,5,3,75,0.0,365,1


In [13]:
# List each column's dtype; object-typed columns hold strings and are encoded further down.
df.dtypes

age                            int64
gender                        object
education_level               object
job_experience                 int64
income                         int64
bonus                          int64
work_hours_per_week            int64
company_size                  object
industry                      object
stress_level                   int64
job_satisfaction               int64
city_tier                      int64
commute_time_minutes           int64
remote_work_ratio            float64
days_since_last_promotion      int64
attrition_risk                 int64
dtype: object

In [14]:
# Count missing values per column (isna is the canonical name of the isnull alias).
df.isna().sum()

age                          0
gender                       0
education_level              0
job_experience               0
income                       0
bonus                        0
work_hours_per_week          0
company_size                 0
industry                     0
stress_level                 0
job_satisfaction             0
city_tier                    0
commute_time_minutes         0
remote_work_ratio            0
days_since_last_promotion    0
attrition_risk               0
dtype: int64

In [15]:
# Number of distinct values per column, to gauge the cardinality of each feature.
df.nunique()

age                          26
gender                        2
education_level               4
job_experience               18
income                       30
bonus                        30
work_hours_per_week          17
company_size                  3
industry                      5
stress_level                  6
job_satisfaction              8
city_tier                     3
commute_time_minutes         29
remote_work_ratio             9
days_since_last_promotion    29
attrition_risk                2
dtype: int64

In [16]:
# Label encoder shared by the integer-coding cells below.
# The one-hot encoder is instantiated in the cell that actually uses it, so the
# unused placeholder instance that used to be created here has been removed.
le = LabelEncoder()

In [17]:
# Replace the gender strings with integer codes. LabelEncoder numbers the
# classes in sorted order, so Female -> 0 and Male -> 1.
gender_codes = le.fit_transform(df["gender"])
df["gender"] = gender_codes

In [18]:
# Confirm that the gender column now holds integer codes instead of strings.
df.head()

Unnamed: 0,age,gender,education_level,job_experience,income,bonus,work_hours_per_week,company_size,industry,stress_level,job_satisfaction,city_tier,commute_time_minutes,remote_work_ratio,days_since_last_promotion,attrition_risk
0,25,1,Bachelor,2,45000,5000,45,Medium,Technology,7,6,2,60,0.2,180,0
1,32,0,Master,5,65000,8000,42,Large,Finance,5,8,1,45,0.3,90,0
2,28,0,Bachelor,3,50000,4000,40,Small,Healthcare,6,7,2,50,0.1,240,1
3,45,1,PhD,15,120000,15000,50,Large,Technology,8,9,1,30,0.5,30,0
4,22,1,High School,1,30000,2000,35,Small,Retail,4,5,3,75,0.0,365,1


In [19]:
# Ordinal encoding: education levels have a natural order, so map each label to its rank.
education_map = {"High School": 1, "Bachelor": 2, "Master": 3, "PhD": 4}
df["education_level_encoded"] = df["education_level"].map(education_map)

# Series.map yields NaN for any label that is missing from the mapping; fail
# fast here instead of letting NaN reach the models further down.
unmapped_education = df.loc[df["education_level_encoded"].isna(), "education_level"].unique()
assert len(unmapped_education) == 0, f"Unmapped education_level values: {list(unmapped_education)}"

In [20]:
# Ordinal encoding of company size (Small < Medium < Large).
# The hyphenated column name is kept as-is because the feature list later in
# the notebook refers to it by this exact name.
company_size_map = {"Small": 1, "Medium": 2, "Large": 3}
df["company_size-encoded"] = df["company_size"].map(company_size_map)

# Series.map yields NaN for labels missing from the mapping; surface them immediately.
unmapped_company_size = df.loc[df["company_size-encoded"].isna(), "company_size"].unique()
assert len(unmapped_company_size) == 0, f"Unmapped company_size values: {list(unmapped_company_size)}"

In [21]:
# Frequency of each industry category.
df["industry"].value_counts()

industry
Technology    9
Finance       7
Healthcare    6
Retail        4
Education     4
Name: count, dtype: int64

In [22]:
# Integer-code the industry labels.
# NOTE(review): this column is dropped again a few cells later in favour of
# one-hot columns, and refitting the shared `le` replaces the classes it
# learned for gender.
industry_codes = le.fit_transform(df["industry"])
df["industry_encoded"] = industry_codes

In [23]:
# Check the newly added *_encoded columns at the right-hand side of the frame.
df.head()

Unnamed: 0,age,gender,education_level,job_experience,income,bonus,work_hours_per_week,company_size,industry,stress_level,job_satisfaction,city_tier,commute_time_minutes,remote_work_ratio,days_since_last_promotion,attrition_risk,education_level_encoded,company_size-encoded,industry_encoded
0,25,1,Bachelor,2,45000,5000,45,Medium,Technology,7,6,2,60,0.2,180,0,2,2,4
1,32,0,Master,5,65000,8000,42,Large,Finance,5,8,1,45,0.3,90,0,3,3,1
2,28,0,Bachelor,3,50000,4000,40,Small,Healthcare,6,7,2,50,0.1,240,1,2,1,2
3,45,1,PhD,15,120000,15000,50,Large,Technology,8,9,1,30,0.5,30,0,4,3,4
4,22,1,High School,1,30000,2000,35,Small,Retail,4,5,3,75,0.0,365,1,1,1,3


In [24]:
# Standard scaler (zero mean, unit variance per column) used by the scaling cells below.
ss = StandardScaler()

In [25]:
# Standardise the numeric features in one vectorised call and store the
# results next to the originals under a "_scaled" suffix.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test-set statistics leak into the features.
scale_inputs = [
    "stress_level",
    "work_hours_per_week",
    "job_satisfaction",
    "job_experience",
    "days_since_last_promotion",
    "remote_work_ratio",
]
scale_outputs = [f"{name}_scaled" for name in scale_inputs]
df[scale_outputs] = ss.fit_transform(df[scale_inputs])

In [26]:
cols_to_scale = [
    "stress_level",
    "work_hours_per_week",
    "job_satisfaction",
    "job_experience",
    "days_since_last_promotion",
    "remote_work_ratio",
]

# Fit and apply the scaler one column at a time, writing each result to
# "<column>_scaled".
# NOTE(review): these are the same output columns the previous cell already
# created, and after the loop `ss` only remembers the statistics of the last
# column it was fitted on.
for col in cols_to_scale:
    column_scaled = ss.fit_transform(df[[col]])
    df[f"{col}_scaled"] = column_scaled[:, 0]

In [27]:
# Inspect the frame with the *_scaled columns appended.
df.head()

Unnamed: 0,age,gender,education_level,job_experience,income,bonus,work_hours_per_week,company_size,industry,stress_level,...,attrition_risk,education_level_encoded,company_size-encoded,industry_encoded,stress_level_scaled,work_hours_per_week_scaled,job_satisfaction_scaled,job_experience_scaled,days_since_last_promotion_scaled,remote_work_ratio_scaled
0,25,1,Bachelor,2,45000,5000,45,Medium,Technology,7,...,0,2,2,4,0.713256,0.590199,-0.57735,-0.843997,-0.101731,-0.466644
1,32,0,Master,5,65000,8000,42,Large,Finance,5,...,0,3,3,1,-0.66724,-0.099644,0.57735,-0.372197,-0.834191,-0.042422
2,28,0,Bachelor,3,50000,4000,40,Small,Healthcare,6,...,1,2,1,2,0.023008,-0.559539,0.0,-0.68673,0.386576,-0.890865
3,45,1,PhD,15,120000,15000,50,Large,Technology,8,...,0,4,3,4,1.403505,1.739936,1.154701,1.200468,-1.322497,0.806021
4,22,1,High School,1,30000,2000,35,Small,Retail,4,...,1,1,1,3,-1.357488,-1.709277,-1.154701,-1.001263,1.403882,-1.315087


In [28]:
# One-hot encode the nominal `industry` column (OneHotEncoder is imported in
# the notebook's import cell).
# The encoder is left at its default sparse output and densified with
# .toarray(): the `sparse=False` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed `sparse_output`), so this form runs on old and new
# versions alike.
ohe = OneHotEncoder()
industry_encoded = ohe.fit_transform(df[["industry"]]).toarray()
industry_df = pd.DataFrame(
    industry_encoded,
    columns=ohe.get_feature_names_out(["industry"]),
    index=df.index,  # keep rows aligned in pd.concat even if df's index is not 0..n-1
)

# Append the indicator columns and drop the original string column.
df = pd.concat([df.drop(columns="industry"), industry_df], axis=1)




In [29]:
# Remove the integer-coded industry column now that one-hot columns exist.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell re-runnable: a second execution no longer raises KeyError.
df = df.drop(columns=["industry_encoded"], errors="ignore")

In [30]:
# Preview the first rows of the columns at positions 15-29, i.e. the
# engineered (encoded / scaled / one-hot) columns.
df.head().iloc[:, 15:30]

Unnamed: 0,education_level_encoded,company_size-encoded,stress_level_scaled,work_hours_per_week_scaled,job_satisfaction_scaled,job_experience_scaled,days_since_last_promotion_scaled,remote_work_ratio_scaled,industry_Education,industry_Finance,industry_Healthcare,industry_Retail,industry_Technology
0,2,2,0.713256,0.590199,-0.57735,-0.843997,-0.101731,-0.466644,0.0,0.0,0.0,0.0,1.0
1,3,3,-0.66724,-0.099644,0.57735,-0.372197,-0.834191,-0.042422,0.0,1.0,0.0,0.0,0.0
2,2,1,0.023008,-0.559539,0.0,-0.68673,0.386576,-0.890865,0.0,0.0,1.0,0.0,0.0
3,4,3,1.403505,1.739936,1.154701,1.200468,-1.322497,0.806021,0.0,0.0,0.0,0.0,1.0
4,1,1,-1.357488,-1.709277,-1.154701,-1.001263,1.403882,-1.315087,0.0,0.0,0.0,1.0,0.0


In [31]:
# Model inputs, grouped by how each column was produced. The final order is
# ordinal codes, then standardised numerics, then industry indicators.
ordinal_features = ["education_level_encoded", "company_size-encoded"]
scaled_features = [
    "stress_level_scaled",
    "work_hours_per_week_scaled",
    "job_satisfaction_scaled",
    "job_experience_scaled",
    "days_since_last_promotion_scaled",
    "remote_work_ratio_scaled",
]
industry_features = [
    "industry_Education",
    "industry_Finance",
    "industry_Healthcare",
    "industry_Retail",
    "industry_Technology",
]
feature_columns = ordinal_features + scaled_features + industry_features

In [32]:
# Feature matrix shared by both tasks, plus one target per task:
# income for the regression and attrition_risk for the classification.
x = df.loc[:, feature_columns]
y_reg, y_clf = df["income"], df["attrition_risk"]

In [33]:
# 80/20 hold-out split for each task. Both calls receive the same feature
# matrix and the same seed, so they use identical settings.
split_options = {"test_size": 0.2, "random_state": 42}
x_treg, x_test_reg, y_treg, y_test_reg = train_test_split(x, y_reg, **split_options)
x_tclf, x_test_clf, y_tclf, y_test_clf = train_test_split(x, y_clf, **split_options)

In [34]:
# Logistic-regression classifier with default hyper-parameters, fitted on attrition_risk below.
log_reg = LogisticRegression()

In [35]:
# Fit the classifier on the training split and predict attrition for the
# held-out rows.
log_model = log_reg.fit(x_tclf, y_tclf)
y_pred_clf = log_model.predict(x_test_clf)
# The original variable was misspelled `y_perd_clf`; keep that name as an
# alias so any code still referring to it continues to work.
y_perd_clf = y_pred_clf

In [36]:
# Linear-regression model with default settings, fitted on income below.
lin_reg = LinearRegression()

In [37]:
# Fit the income regressor on the regression training split.
lin_model = lin_reg.fit(X=x_treg, y=y_treg)

In [38]:
# Predict income for the held-out regression test rows.
y_pred_lin = lin_model.predict(x_test_reg)

In [39]:


# Refit a logistic regression with a larger iteration budget and report its
# accuracy on both splits.
# NOTE(review): the test split is only 20% of a very small dataset, so a
# perfect score here is weak evidence of generalisation.
model = LogisticRegression(max_iter=1000)
model.fit(x_tclf, y_tclf)

train_accuracy = model.score(x_tclf, y_tclf)
print("Train accuracy:", train_accuracy)
test_accuracy = model.score(x_test_clf, y_test_clf)
print("Test accuracy:", test_accuracy)

Train accuracy: 1.0
Test accuracy: 1.0


In [41]:
# ---- Hold-out metrics for the income regression -------------------------
print("\nRegression Model Performance:")
mse = mean_squared_error(y_test_reg, y_pred_lin)
rmse = np.sqrt(mse)  # same units as income
r2 = r2_score(y_test_reg, y_pred_lin)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

# ---- Plain-language reading of the metrics ------------------------------
print("\nInterpretation:")
print(f"- Your model's predictions are off by about ${rmse:.2f} on average")
print(f"- Your model explains {r2:.1%} of the variance in income")

# Spread of the test target (np.std default, i.e. population standard
# deviation), used to put the RMSE on a relative scale.
target_std = np.std(y_test_reg)
rmse_in_std_units = rmse / target_std
print(f"- Target variable standard deviation: ${target_std:.2f}")
print(f"- RMSE is {rmse_in_std_units:.2f} standard deviations of the target")

# ---- Train vs. test comparison ------------------------------------------
train_r2 = r2_score(y_treg, lin_model.predict(x_treg))
print("\nModel Diagnostics:")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R²: {r2:.4f}")

# A train/test R² gap above 0.1 is flagged as overfitting; otherwise a test
# R² below 0.3 is flagged as underfitting.
r2_gap = train_r2 - r2
if r2_gap > 0.1:
    verdict = "⚠️ Possible overfitting: Train R² significantly higher than Test R²"
elif r2 < 0.3:
    verdict = "⚠️ Possible underfitting: Low R² on test set"
else:
    verdict = "✅ Model appears to be well-fitted"
print(verdict)


Regression Model Performance:
Mean Squared Error (MSE): 15415128.63
Root Mean Squared Error (RMSE): 3926.21
R-squared (R²): 0.93

Interpretation:
- Your model's predictions are off by about $3926.21 on average
- Your model explains 93.1% of the variance in income
- Target variable standard deviation: $14985.18
- RMSE is 0.26 standard deviations of the target

Model Diagnostics:
Train R²: 0.9965
Test R²: 0.9314
✅ Model appears to be well-fitted
