1. clean data
2. one hot encode categorical features
3. apply scaler to numerical features
4. create model
5. hyperparameter tune the model using exhaustive search with metric of mse using cross validation
6. evaluate final model using cross validation (mse)

# **Imports**

In [151]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from itertools import combinations

# **Data Cleaning**

In [152]:
df_salaries_uncleaned = pd.read_csv("data/Salary_Data.csv")

# drop null entries
df_salaries_cleaned = df_salaries_uncleaned.dropna()

# standardize the degree names
df_salaries_cleaned["Education Level"] = df_salaries_cleaned["Education Level"].replace("Bachelor's Degree", "Bachelor's")
df_salaries_cleaned["Education Level"] = df_salaries_cleaned["Education Level"].replace("Master's Degree", "Master's")
df_salaries_cleaned["Education Level"] = df_salaries_cleaned["Education Level"].replace("phD", "PhD")

# normalize data types
df_salaries_cleaned["Age"] = df_salaries_cleaned["Age"].astype("int")
df_salaries_cleaned["Years of Experience"] = df_salaries_cleaned["Years of Experience"].astype("int")

# drop duplicate entries
df_salaries_cleaned = df_salaries_cleaned.drop_duplicates()

# show data summary
df_salaries_cleaned.info()
df_salaries_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1786 entries, 0 to 6631
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  1786 non-null   int32  
 1   Gender               1786 non-null   object 
 2   Education Level      1786 non-null   object 
 3   Job Title            1786 non-null   object 
 4   Years of Experience  1786 non-null   int32  
 5   Salary               1786 non-null   float64
dtypes: float64(1), int32(2), object(3)
memory usage: 83.7+ KB


Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32,Male,Bachelor's,Software Engineer,5,90000.0
1,28,Female,Master's,Data Analyst,3,65000.0
2,45,Male,PhD,Senior Manager,15,150000.0
3,36,Female,Bachelor's,Sales Associate,7,60000.0
4,52,Male,Master's,Director,20,200000.0


# **Data Preprocessing**

In [153]:
# separate categorical and numerical columns
categorical_columns = ["Gender", "Education Level", "Job Title"]
numerical_columns = ["Age", "Years of Experience"]

# # create training data
# X_train = df_salaries_cleaned[["Age", "Gender", "Education Level", "Job Title", "Years of Experience"]]
# y_train = df_salaries_cleaned["Salary"]

X_data = df_salaries_cleaned[["Age", "Gender", "Education Level", "Job Title", "Years of Experience"]]
y_data = df_salaries_cleaned["Salary"]

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

In [163]:
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    (StandardScaler(), numerical_columns),
    remainder="drop"
)

pipeline = make_pipeline(
    ct,
    LinearRegression()
)

pipeline.fit(X=X_train, y=y_train)

y_pred = pipeline.predict(X=X_test)

print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

import math
print(math.sqrt(mean_squared_error(y_test, y_pred)))

503620783.2494343
0.8231839739213134
22441.496903046245


# **K-Nearest Neighbors**

In [166]:
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    (StandardScaler(), numerical_columns),
    remainder="drop"
)

pipeline = make_pipeline(
    ct,
    KNeighborsRegressor(n_neighbors=10, metric="euclidean")
)

pipeline.fit(X=X_train, y=y_train)

# X_test = pd.DataFrame({"Age": [32],
#                        "Gender": ["Male"],
#                        "Education Level": ["Master's"],
#                        "Job Title": ["Director"],
#                        "Years of Experience": [20]})

y_pred = pipeline.predict(X=X_test)

print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

import math
print(math.sqrt(mean_squared_error(y_test, y_pred)))

mse = -cross_val_score(pipeline, X_train, y_train, cv=5, scoring="neg_mean_squared_error")
mse.mean()

cross_val_score(pipeline, X_data, y_data, cv=10, scoring="r2").mean()

426925151.18452513
0.8501110137305379
20662.16714636984


0.6351402609695793

In [156]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neighbors import KNeighborsRegressor

ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    (StandardScaler(), numerical_columns),
    remainder='drop'  # You can also specify 'passthrough' if you want to keep the remaining columns
)

# Create a Pipeline using make_pipeline with ColumnTransformer, SelectKBest, and KNeighborsRegressor
pipeline = make_pipeline(
    ct,
    SelectKBest(score_func=f_regression),
    KNeighborsRegressor(metric="euclidean")
)

# Define the hyperparameters grid
param_grid = {
    'selectkbest__k': [1, 2, 3, 4, 5],  # Number of features to select
    'kneighborsregressor__n_neighbors': [n for n in range(1, 30)]
}

# Initialize GridSearchCV with the pipeline and parameter grid
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')

# Perform grid search to find the best hyperparameters
grid_search.fit(X_data, y_data)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model
mse = -grid_search.best_score_
print("Best Mean Squared Error (MSE):", mse)

best_model.fit(X=X_train, y=y_train)
y_pred = best_model.predict(X=X_test)

print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

best_model

Best Hyperparameters: {'kneighborsregressor__n_neighbors': 27, 'selectkbest__k': 2}
Best Mean Squared Error (MSE): -0.6141897761228013
611354878.4996283
0.7853596520726114
