# **Imports**

In [None]:
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import export_graphviz

%matplotlib inline
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
!pip install opendatasets

In [None]:
# Imported here rather than in the top import cell because the package is
# only installed by the preceding cell.
import opendatasets as od

# Kaggle dataset holding the salary records used throughout this notebook.
dataset_url = "https://www.kaggle.com/datasets/mohithsairamreddy/salary-data/data"
od.download(dataset_url)

# **Data Cleaning**

In [None]:
# Load the raw data. The path is relative to the working directory (where the
# download cell is assumed to have created `salary-data/` — confirm), so the
# notebook is not tied to Colab's /content.
df_salaries_uncleaned = pd.read_csv("salary-data/Salary_Data.csv")

# Spelling variants of the same education level, mapped to one canonical label.
education_aliases = {
    "Bachelor's Degree": "Bachelor's",
    "Master's Degree": "Master's",
    "phD": "PhD",
}

# Build the cleaned frame in one chain so the raw frame is never mutated and
# re-running this cell always yields the same result.
df_salaries_cleaned = (
    df_salaries_uncleaned
    .dropna()                                   # drop rows with null entries
    .assign(**{
        'Education Level': lambda d: d['Education Level'].replace(education_aliases),
        'Age': lambda d: d['Age'].astype('int'),
        'Years of Experience': lambda d: d['Years of Experience'].astype('int'),
        # Previously this column was filled from 'Years of Experience' (copy-paste
        # slip). Derived from Salary instead; assumes Salary is annual — TODO confirm.
        'Monthly Salary': lambda d: d['Salary'] / 12,
    })
    .drop_duplicates()                          # drop duplicate entries
)

df_salaries_cleaned.info()
df_salaries_cleaned.head()

# **Data Visualizations**

In [None]:
# Figure 1: Overall Data Distribution with Histograms
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
# Top row: histograms of the numeric columns.
for ax, column in zip(axes[0], ['Age', 'Years of Experience', 'Salary']):
    df_salaries_cleaned[column].plot(kind='hist', ax=ax, title=column)
# Bottom row: category counts (job titles limited to the 20 most frequent).
df_salaries_cleaned['Gender'].value_counts().plot(kind='bar', ax=axes[1, 0], title='Gender')
df_salaries_cleaned['Education Level'].value_counts().plot(kind='bar', ax=axes[1, 1], title='Education')
df_salaries_cleaned['Job Title'].value_counts()[:20].plot(kind='bar', ax=axes[1, 2], title='Top 20 Job Titles')
fig.suptitle('Overall Data Distribution', fontsize=16)
fig.tight_layout()

# Figure 2: Data distribution - Salaries v. one factor
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 7))

# Top row: categorical factors, boxes ordered by ascending median salary.
for ax, xfactor in zip(axes[0], ['Gender', 'Education Level']):
    median_order = df_salaries_cleaned.groupby(xfactor)['Salary'].median().sort_values().index
    g = sns.boxplot(x=xfactor, y='Salary', data=df_salaries_cleaned, ax=ax, order=median_order)
    g.set(title=xfactor, xlabel=None)

# Bottom row: one box per integer value, so rotate the crowded tick labels.
# The rotation is set on the panel's own axes after drawing; plt.xticks() acts
# on the "current" axes, which is not the panel being drawn for 'Age'.
for ax, xfactor in zip(axes[1], ['Age', 'Years of Experience']):
    g = sns.boxplot(x=xfactor, y='Salary', ax=ax, data=df_salaries_cleaned)
    g.set(title=xfactor, xlabel=None)
    ax.tick_params(axis='x', rotation=90)

fig.suptitle('Data distribution - Salaries v. one factor', fontsize=16)
fig.tight_layout()

# Figure 3 & 4: Salary Distributions by Education Level and Years of Experience
g = sns.displot(data=df_salaries_cleaned, x='Salary', hue='Education Level', multiple='stack')
g.set(title='Salary Distribution By Education Level', xlabel=None)
plt.xticks(rotation=45)
g = sns.displot(data=df_salaries_cleaned, x='Salary', hue='Years of Experience', multiple='stack')
g.set(title='Salary Distribution By Years of Experience', xlabel=None)
p = plt.xticks(rotation=45)  # assigned to a name only to suppress the cell's repr output

# **Data Preprocessing**

In [None]:
# One-hot encode the categorical features.
categorical_columns = ['Gender', 'Education Level', 'Job Title']
# Densify with .toarray() rather than a constructor flag: the dense-output
# keyword was renamed between scikit-learn releases (`sparse` -> `sparse_output`),
# while the default sparse output plus .toarray() works on either side.
one_hot_encoder = OneHotEncoder()
encoded_categorical = one_hot_encoder.fit_transform(df_salaries_cleaned[categorical_columns]).toarray()

encoded_categorical_columns = one_hot_encoder.get_feature_names_out(categorical_columns)

encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoded_categorical_columns)

# Reassemble numeric features, encoded categoricals and the target. Indexes are
# reset so the pieces align row-by-row with the encoder output's RangeIndex.
numeric_columns = ['Age', 'Years of Experience']
target_column = ['Salary']
final_df = pd.concat([
    df_salaries_cleaned[numeric_columns].reset_index(drop=True),
    encoded_categorical_df,
    df_salaries_cleaned[target_column].reset_index(drop=True)
], axis=1)

X = final_df.drop('Salary', axis=1)
y = final_df['Salary']

# Split the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the numerical features. The scaler is fit on the training rows
# only, so test-set statistics do not leak into preprocessing; the test rows are
# transformed with the training mean/std. `X` itself is left unscaled.
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

# **Linear Regression**

### Model Training

In [None]:
# Fit ordinary least squares on the training split (`fit` returns the estimator).
linear_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out split, scored in the evaluation cell below.
y_pred_linear = linear_model.predict(X_test)

### Model Evaluation

In [None]:
# Score the held-out predictions with mean squared error and R^2.
linear_mse, linear_r2 = (
    mean_squared_error(y_test, y_pred_linear),
    r2_score(y_test, y_pred_linear),
)

for label, value in (
    ("Linear Regression MSE Score: ", linear_mse),
    ("Linear Regression R^2 Score: ", linear_r2),
):
    print(label, value)

### Model Visualizations

In [None]:
def plot_two_feature_regression(df, x_col, y_col, x_max, y_max, target_col='Salary'):
    """Fit `target_col ~ x_col + y_col` by OLS and plot the data with the fitted plane.

    The same 3D scene (faint black observations, blue grid of model
    predictions) is drawn three times from different camera angles, and the
    in-sample R^2 is shown as the figure title.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `x_col`, `y_col` and `target_col` as numeric columns.
    x_col, y_col : str
        The two predictor columns (plotted on the x and y axes).
    x_max, y_max : float
        Upper bounds of the prediction grid; each axis spans 0..max with
        numpy's default 50 `linspace` points.
    target_col : str, default 'Salary'
        Response column (plotted on the z axis).

    Returns
    -------
    (fig, model, r2) : the matplotlib figure, the fitted LinearRegression and
        its R^2 on the data it was fit to.
    """
    features = df[[x_col, y_col]].to_numpy()
    target = df[target_col]

    # Regular grid over the feature plane on which the fitted surface is drawn.
    grid_x, grid_y = np.meshgrid(np.linspace(0, x_max), np.linspace(0, y_max))
    grid_points = np.column_stack([grid_x.ravel(), grid_y.ravel()])

    model = LinearRegression().fit(features, target)
    predicted = model.predict(grid_points)
    r2 = model.score(features, target)

    fig = plt.figure(figsize=(12, 4))
    camera_views = [(28, 120), (4, 114), (60, 165)]  # (elevation, azimuth) per panel
    for position, (elev, azim) in enumerate(camera_views, start=1):
        ax = fig.add_subplot(1, 3, position, projection='3d')
        ax.plot(features[:, 0], features[:, 1], target, color='k', zorder=15,
                linestyle='none', marker='o', alpha=0.01)
        ax.scatter(grid_x.ravel(), grid_y.ravel(), predicted,
                   facecolor=(0, 0, 0, 0), s=5, edgecolor='#70b3f0')
        ax.set_xlabel(x_col, fontsize=12)
        ax.set_ylabel(y_col, fontsize=12)
        ax.set_zlabel(target_col, fontsize=12)
        ax.view_init(elev=elev, azim=azim)

    fig.suptitle('$R^2 = %.2f$' % r2, fontsize=20)
    fig.tight_layout()
    return fig, model, r2


# Ordinal encoding of education level for plotting. This works on a COPY: the
# shared cleaned frame keeps its string labels (the preprocessing cell one-hot
# encodes them), and the notebook-level X / y used for modelling are not rebound.
education_rank = {"High School": 1, "Bachelor's": 2, "Master's": 3, "PhD": 4}
df_salaries_cleaned_edulvl = df_salaries_cleaned.copy()
df_salaries_cleaned_edulvl['Education Level'] = df_salaries_cleaned_edulvl['Education Level'].map(education_rank)

plt.style.use('default')

# Multivariate Salary Model with Education Level and Years of Experience
fig, model, r2 = plot_two_feature_regression(
    df_salaries_cleaned_edulvl, 'Education Level', 'Years of Experience', x_max=4, y_max=35)

# Multivariate Salary Model with Age and Years of Experience
fig, model, r2 = plot_two_feature_regression(
    df_salaries_cleaned, 'Age', 'Years of Experience', x_max=62, y_max=35)

# **K-Nearest Neighbors**

### Model Training

In [None]:
# TODO: KNN model training is not implemented yet.



### Model Evaluation

In [None]:
# TODO: KNN model evaluation is not implemented yet.

### Model Visualizations

In [None]:
# TODO: KNN model visualizations are not implemented yet.

# **Random Forest**

### Model Training

In [None]:
# Train a 100-tree forest on the training split; the fixed seed keeps runs
# reproducible (`fit` returns the estimator).
random_forest_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the held-out split, scored in the evaluation cell below.
y_pred_rf = random_forest_model.predict(X_test)

### Model Evaluation

In [None]:
# Random Forest Model Evaluation (held-out split)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

print("Random Forest MSE Score: ", rf_mse)
print("Random Forest R^2 Score: ", rf_r2)

# Cross-validate on the full modelling dataset. It is rebuilt from the split
# pieces rather than read from the notebook-level `X` / `y`, because the
# visualization cells above rebind those names to unrelated plotting arrays.
X_full = pd.concat([X_train, X_test])
y_full = pd.concat([y_train, y_test])

# scikit-learn scorers follow "greater is better", so MSE comes back negated;
# flip the sign so the reported values are genuine (positive) MSEs.
# cross_val_score works on clones of the estimator; the fitted model is untouched.
cv_scores = -cross_val_score(
    random_forest_model, X_full, y_full, cv=5, scoring="neg_mean_squared_error")

# Cross-validation scores
print("Random Forest Cross-Validation MSE Scores:", cv_scores)
print("Random Forest Mean CV MSE:", cv_scores.mean())

### Model Visualizations

In [None]:
# Sample Decision Tree: render the first tree of the forest, truncated to depth 3.
single_tree = random_forest_model.estimators_[0]
dot_data = export_graphviz(single_tree, out_file=None,
                           # plain list: some scikit-learn releases validate this
                           # argument strictly and reject a pandas Index
                           feature_names=list(X_train.columns),
                           filled=True, rounded=True,
                           special_characters=True, max_depth=3)

graph = graphviz.Source(dot_data)
# Leave the object as the cell's last expression so it renders inline in the
# notebook; graph.view() would instead try to launch an external viewer, which
# is not available on a hosted/headless kernel.
graph

In [None]:
# Get feature importances (one value per training feature)
importances = random_forest_model.feature_importances_

# Pair each importance with its column name and sort, most important first.
# Names come from X_train (what the forest was fit on), not the notebook-level
# `X`, which the visualization cells rebind to a plain array.
feature_importance_df = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': importances
    })
    .sort_values(by='Importance', ascending=False)
)

# Select the top 10 most important features
top_features = feature_importance_df.head(10)

# Plotting
plt.figure(figsize=(30, 6))
plt.title('Top 10 Feature Importances Based on Random Forest Model')
plt.bar(top_features['Feature'], top_features['Importance'], color='green')
plt.ylabel('Importance')
plt.xlabel('Features')
plt.show()