In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at the path below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [12]:
# Read the preprocessed CSV from the shared Drive folder into a DataFrame.
dataset_csv = '/content/drive/Shareddrives/DATA240/Final_Project/Dataset/4_preprocessed_data_.csv'
data = pd.read_csv(dataset_csv)

In [13]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,Title,Company_Name,Location,Via,Min_Salary,Max_Salary,Programming_Languages,Visualization_Tools,Databases_and_Big_Data_Tools,ML_and_Data_Science_Libraries,Cloud_Services,Education,Experience,Soft_Skills,Work_Mode,job_position,level,State,Min_Salary_Outlier,Max_Salary_Outlier
0,Data Scientist,Apple,"Austin, TX",Careers At Apple,146975.0,197450.0,"sql, python",tableau,mysql,"tensorflow, pytorch, scikit-learn","docker, kubernetes",bachelor,5,"communication skills, collaboration, leadership",hybrid,Data Scientist,Unspecified,TX,False,False
1,Data Scientist,Select Minds LLC,"Dallas, TX",LinkedIn,146975.0,197450.0,"r, sql, python",tableau,spark,"tensorflow, pytorch, scikit-learn",aws,bachelor,3,communication skills,hybrid,Data Scientist,Unspecified,TX,False,False
2,Principal Data Scientist,MD Anderson,"Houston, TX",MD Anderson - MD Anderson Cancer Center,132728.0,234000.0,python,tableau,spark,"tensorflow, pytorch, scikit-learn",aws,bachelor,5,"decision-making, collaboration, collaboration,...",remote,Data Scientist,Senior-level,TX,False,False
3,Data Scientist - Causal Discovery,CPChem,"The Woodlands, TX",Careers At Chevron Phillips - Chevron Phillips...,146975.0,197450.0,"go, r, sql, python",tableau,spark,"tensorflow, pytorch, scikit-learn","microsoft azure, aws",master,3,"communication skills, decision-making",hybrid,Data Scientist,Unspecified,TX,False,False
4,Senior Data Scientist-Remote,Baker Hughes,"Houston, TX (+1 other)",Baker Hughes,104000.0,150000.0,"scala, r, python",tableau,spark,"tensorflow, keras",aws,master,5,communication skills,remote,Data Scientist,Senior-level,TX,False,False


In [14]:
# Remove the free-text Location column; later cells use the State column for
# geography instead.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
data = data.drop(columns=["Location"], errors="ignore")

In [15]:
# Count the null entries per column so gaps in the data are visible up front.
missing_values = data.isna().sum(axis=0)

# Leave the Series as the cell's last expression so the notebook renders it.
missing_values

Unnamed: 0,0
Title,0
Company_Name,0
Via,0
Min_Salary,0
Max_Salary,0
Programming_Languages,0
Visualization_Tools,0
Databases_and_Big_Data_Tools,0
ML_and_Data_Science_Libraries,0
Cloud_Services,0


In [16]:
from sklearn.multioutput import MultiOutputRegressor

# Columns relevant to salary prediction: the input features first, followed by
# the two salary targets (minimum and maximum).
_selected_features = [
    "Programming_Languages",
    "Visualization_Tools",
    "Databases_and_Big_Data_Tools",
    "ML_and_Data_Science_Libraries",
    "Cloud_Services",
    "Education",
    "Experience",
    "State",
    "job_position",
    "level",
    "Work_Mode",
]
_selected_targets = ["Min_Salary", "Max_Salary"]

selected_columns = _selected_features + _selected_targets

In [17]:
# Separate features and targets.
# Only the columns listed in `selected_columns` are kept, so identifier and
# free-text columns (and the target-derived *_Outlier flags) never reach the
# feature matrix.
target_columns = ["Min_Salary", "Max_Salary"]
feature_columns = [col for col in selected_columns if col not in target_columns]

X = data[feature_columns]
# Column order matters downstream: predictions come back as [min, max].
y = data[target_columns]

In [18]:
# Feature groups for preprocessing: string-valued columns are treated as
# categorical, while Experience is the only numeric feature.
_skill_columns = [
    "Programming_Languages",
    "Visualization_Tools",
    "Databases_and_Big_Data_Tools",
    "ML_and_Data_Science_Libraries",
    "Cloud_Services",
]
_profile_columns = [
    "Education",
    "State",
    "job_position",
    "level",
    "Work_Mode",
]

categorical_features = _skill_columns + _profile_columns
numerical_features = ["Experience"]



In [19]:
# Preprocessing: one-hot encode the categorical columns and standardize the
# numeric one. handle_unknown="ignore" lets prediction proceed when a category
# was not seen during fitting instead of raising.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
numerical_step = ("num", StandardScaler(), numerical_features)

preprocessor = ColumnTransformer(transformers=[categorical_step, numerical_step])

In [20]:
# Full model: shared preprocessing followed by a random forest wrapped in
# MultiOutputRegressor so both salary targets are predicted together.
base_forest = RandomForestRegressor(random_state=42)
multi_target_forest = MultiOutputRegressor(base_forest)

model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", multi_target_forest),
])

In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [22]:
# Fit the preprocessing + regression pipeline on the training split.
model_pipeline.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error, median_absolute_error, max_error
# Evaluate the fitted pipeline on the held-out test split.
# y_pred has one column per target, in the same order as y:
# column 0 -> Min_Salary, column 1 -> Max_Salary.
y_pred = model_pipeline.predict(X_test)

# Metrics for Min Salary
mae_min = round(mean_absolute_error(y_test["Min_Salary"], y_pred[:, 0]), 2)
# RMSE is derived from the MSE, so compute it in this cell rather than relying
# on a variable left over from an earlier kernel session.
mse_min = mean_squared_error(y_test["Min_Salary"], y_pred[:, 0])
rmse_min = round(np.sqrt(mse_min), 2)
r2_min = round(r2_score(y_test["Min_Salary"], y_pred[:, 0]), 2)
# MAPE is returned as a fraction; scale to percent for readability.
mape_min = round(mean_absolute_percentage_error(y_test["Min_Salary"], y_pred[:, 0]) * 100, 2)

# Metrics for Max Salary
mae_max = round(mean_absolute_error(y_test["Max_Salary"], y_pred[:, 1]), 2)
mse_max = mean_squared_error(y_test["Max_Salary"], y_pred[:, 1])
rmse_max = round(np.sqrt(mse_max), 2)
r2_max = round(r2_score(y_test["Max_Salary"], y_pred[:, 1]), 2)
mape_max = round(mean_absolute_percentage_error(y_test["Max_Salary"], y_pred[:, 1]) * 100, 2)

# Collect the rounded metrics per target for display.
results = {
    "Min Salary": {
        "MAE": mae_min,
        "RMSE": rmse_min,
        "R2": r2_min,
        "MAPE": mape_min
    },
    "Max Salary": {
        "MAE": mae_max,
        "RMSE": rmse_max,
        "R2": r2_max,
        "MAPE": mape_max
    }
}

# Last expression of the cell, so the notebook renders the dict.
results


{'Min Salary': {'MAE': 7793.27, 'RMSE': 16794.41, 'R2': 0.76, 'MAPE': 6.85},
 'Max Salary': {'MAE': 12910.34, 'RMSE': 26546.34, 'R2': 0.81, 'MAPE': 7.86}}

In [None]:
# Predict a salary range for one hand-written job profile and print it.
# Requires `model_pipeline` to have been fitted by an earlier cell.
# NOTE(review): the categorical columns are one-hot encoded as whole strings,
# so "python, sql" only matches if that exact string occurred in the training
# data; the sample rows shown earlier spell it "sql, python". Confirm the
# spelling, otherwise the value is treated as unseen and ignored.
profile = {
    "Programming_Languages": "python, sql",
    "Visualization_Tools": "tableau",
    "Databases_and_Big_Data_Tools": "mysql",
    "ML_and_Data_Science_Libraries": "scikit-learn",
    "Cloud_Services": "aws",
    "Education": "master",
    "Experience": 5,
    "State": "CA",
    "job_position": "Data Scientist",
    "level": "Entry-level",
    "Work_Mode": "onsite",
}

# Single-row frame: each value becomes a one-element column.
custom_input = pd.DataFrame({column: [value] for column, value in profile.items()})

custom_prediction = model_pipeline.predict(custom_input)

# Row 0 is the only prediction and holds [min, max]; round each to two decimals.
predicted_min_salary, predicted_max_salary = (
    round(value, 2) for value in custom_prediction[0]
)

print(f"Predicted Salary Range:\n- Min Salary: ${predicted_min_salary:,.2f}\n- Max Salary: ${predicted_max_salary:,.2f}")


Predicted Salary Range:
- Min Salary: $121,812.75
- Max Salary: $189,180.21
