In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error,mean_squared_error


In [10]:
# Step 1: Load the data
# Read the raw employee records into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='employee.csv')

In [11]:
# Step 2: Preprocessing
# Handle missing values by carrying the previous row's value forward.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# NOTE: forward-fill cannot fill NaNs that appear before the first valid value
# in a column, so any leading NaNs remain.
data = data.ffill()

# Label Encoding for binary categorical variables: each column is mapped to
# integer codes. The single encoder is re-fitted per column, so after the loop
# it only remembers the classes of the last column.
label_encoder = LabelEncoder()
binary_categorical_cols = ['is_manager', 'is_education_computer_related', 'certifications']
for col in binary_categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# One-Hot Encoding: each listed column is replaced by one indicator column per
# category, named `<column>_<category>`.
data = pd.get_dummies(data, columns=['country', 'employment_status', 'job_title', 'education'])

# Convert timestamp to datetime and expose its calendar parts as numeric features.
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['year'] = data['timestamp'].dt.year
data['month'] = data['timestamp'].dt.month
data['day'] = data['timestamp'].dt.day

# Normalize numerical features to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split; fit on the training rows only if these columns are ever
# used as model inputs.
scaler = StandardScaler()
num_features = ['job_years', 'hours_per_week']
data[num_features] = scaler.fit_transform(data[num_features])


  data.fillna(method='ffill', inplace=True)


In [12]:
# Step 3: Feature Selection
# Model inputs are the encoded binary flags, the calendar parts of the
# timestamp, and every one-hot column generated for `country`.
base_features = [
    'is_manager',
    'is_education_computer_related',
    'certifications',
    'year',
    'month',
    'day',
]
country_mask = data.columns.str.startswith('country_')
country_features = data.columns[country_mask].tolist()
features = base_features + country_features

In [13]:
# Step 4: Split Data
# Separate the selected inputs from the salary target, then hold out 20% of
# the rows for evaluation; the fixed seed keeps the split reproducible.
X, y = data[features], data['salary']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Step 5: Linear Regression
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split (`fit` returns the
# fitted estimator, so construction and fitting can be chained).
linear_model = LinearRegression().fit(X_train, y_train)

# Score the held-out split.
y_pred_linear = linear_model.predict(X_test)
mae_linear = mean_absolute_error(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)

# Report MAE and MSE, one metric per line.
print(
    "Linear Regression Metrics:",
    f"Mean Absolute Error (MAE): {mae_linear}",
    f"Mean Squared Error (MSE): {mse_linear}",
    sep="\n",
)

Linear Regression Metrics:
Mean Absolute Error (MAE): 341361119849.2263
Mean Squared Error (MSE): 2.4936910852590504e+25


In [15]:
# Step 6: Ridge Regression & Lasso Regression models
from sklearn.linear_model import Ridge, Lasso


def _fit_and_report(estimator, header, X_fit, y_fit, X_eval, y_eval):
    """Fit `estimator`, score it on the evaluation split, and print its metrics.

    Parameters:
        estimator: unfitted regressor exposing `fit` and `predict`.
        header: line printed above the two metric lines.
        X_fit, y_fit: training inputs and targets passed to `fit`.
        X_eval, y_eval: held-out inputs and targets used for scoring.

    Returns:
        (predictions, mae, mse) computed on the evaluation split.
    """
    estimator.fit(X_fit, y_fit)
    predictions = estimator.predict(X_eval)
    mae = mean_absolute_error(y_eval, predictions)
    mse = mean_squared_error(y_eval, predictions)
    print(header)
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    return predictions, mae, mse


# Ridge (L2 penalty): fit, predict on the test split, report MAE & MSE.
ridge_model = Ridge(alpha=1.0)
y_pred_ridge, mae_ridge, mse_ridge = _fit_and_report(
    ridge_model, "Ridge Regression Metrics:", X_train, y_train, X_test, y_test
)

# Lasso (L1 penalty): same procedure, so the two results are directly comparable.
lasso_model = Lasso(alpha=1.0)
y_pred_lasso, mae_lasso, mse_lasso = _fit_and_report(
    lasso_model, "Lasso Regression Metrics:", X_train, y_train, X_test, y_test
)

Ridge Regression Metrics:
Mean Absolute Error (MAE): 909.3693701091759
Mean Squared Error (MSE): 1485196.6765126046
Lasso Regression Metrics:
Mean Absolute Error (MAE): 909.2216238008331
Mean Squared Error (MSE): 1486483.443935434


In [None]:
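# The MAE and MSE values for Ridge Regression and Lasso Regression are much smaller than those of Linear Regression, which means these two models perform better than Linear Regression on this test split.
# Note: the Linear Regression errors are implausibly large for a salary target, which likely points to a numerically unstable fit (e.g. collinear or near-constant one-hot columns) rather than a genuine gap between the models; this should be verified before drawing conclusions.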