# Salary Predictions Based on Job Descriptions

# Part 1 - DEFINE

### ---- 1 Define the problem ----

Write the problem in your own words here

In [4]:
#import your libraries
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder

#etc
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
#your info here
__author__ = "Alex EBE"
__email__ = "alexauguste01@gmail.com"

ImportError: cannot import name '_safe_indexing'

## Part 2 - DISCOVER

### ---- 2 Load the data ----

In [None]:
#load the data into a Pandas dataframe
features = pd.read_csv('data/train_features.csv')
salaries = pd.read_csv('data/train_salaries.csv')

In [None]:
features.head(20)

In [None]:
features.shape

In [None]:
salaries.head(10)

In [None]:
salaries.info()

In [None]:
features.info()

The dataset has :

- 2 numerical variables - `yearsExperience` and `milesFromMetropolis`;
- 6 categorical variables - `jobId`, `companyId`, `jobType`, `degree`, `major` and `industry`
- No missing values

### ---- 3 Clean the data ----

#### Removing duplicate

In [None]:
#look for duplicate data, invalid data (e.g. salaries <=0), or corrupt data and remove it
duplicate = features.duplicated(subset=['jobId'])
duplicated = duplicate[duplicate == True]
duplicated

#### Removing invalid data

In [None]:
#Invalid data
invalid_salaries = salaries.loc[salaries['salary']<=0,:]
invalid_job_offers = features.loc[invalid_salaries.index,:]
invalid_salaries #invalid data in outcome dataset

In [None]:
invalid_job_offers #invalid data in feature dataset

In [None]:
salaries.drop(invalid_salaries.index, inplace=True)
features.drop(invalid_salaries.index, inplace=True)

#### Reset indexing

In [None]:
salaries.reset_index(inplace=True, drop=True)
features.reset_index(inplace=True, drop=True)

#### Remove irrelevant features

In [None]:
features.drop(['jobId', 'companyId'], axis=1, inplace=True)
salaries.drop(['jobId'], axis=1, inplace=True)

In [None]:
features.info()

In [None]:
numerical_features = ['yearsExperience', 'milesFromMetropolis']
categorical_features = ['jobType', 'degree', 'major', 'industry']

### ---- 4 Take a quick look at the data structure ----

In [None]:
%%javascript
    IPython.OutputArea.auto_scroll_threshold = 9999

In [None]:
n = len(categorical_features)
plt.figure(figsize=(15,10))
#fig, axes =plt.subplots(len(categorical_features), 1)
for idx, feature in enumerate(categorical_features):
    plt.subplot(2, 2, idx + 1)
    sns.countplot(x=categorical_features[idx], data=features, palette='rainbow')
plt.show()

In [None]:
plt.figure(figsize=(20,10))
features.hist(bins=50, figsize=(20,10))

- **The distributions of years experience(`yearsExperience`) and miles form metropolis(`milesFromMetropolis`) are uniform**
- **The dataset is balanced regarding almost all features except `major` (more that 50% of the population has no major) and `degree` (`HIGH_SCHOOL` and `NONE` are the modes), but this reflects the reality that [around 50% of the US population attend to college](https://www.census.gov/newsroom/press-releases/2020/educational-attainment.html)** 

### ---- 6 Create a  test set ----

Before going further in data analysis, let's set aside part of the data. Although this decision seems hasty, it is useful to prevent us from data snooping bias.

We just pick some instances randomly, typically 20% of the dataset. Although the dataset is strongly imbalanced regarding the `major` feature, the population size prevents us from introducing sampling bias.

In [None]:
features_train, features_test, salaries_train, salaries_test = train_test_split(features, salaries, test_size=0.2, random_state=50)

### ---- 7 Explore the data (EDA) ----

In [None]:
#summarize each feature variable
#summarize the target variable
#look for correlation between each feature and the target
#look for correlation between features

In [None]:
features_train.describe(include='all')

In [None]:
salaries_train.describe(include='all')

In [None]:
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize =(20,5))
ax1.scatter(features_train['yearsExperience'], salaries_train['salary'])
ax1.set_title('Salary and yearsExperience')
ax2.scatter(features_train['milesFromMetropolis'], salaries_train['salary'])
ax2.set_title('Salary and milesFromMetropolis')
plt.show()

##### Possible linear relationship between `salary` and `yearsExperience`

#### Distribution of salary

In [None]:
sns.distplot(salaries_train['salary'], bins=60)
plt.vlines([116, 114, 88, 141], colors=['r', 'k', 'g', 'y'], ymin=0, ymax=0.01)
#plt.vlines(114, colors='k', ymin=0, ymax=0.01)

##### The `salary` variable follows a normal distribution

In [None]:
corr_matrix = features_train.corr()
corr_matrix

##### There's no linear correlation between the numerical features

#### Variability of salary by `jobType`, `degree`, `major` and `industry`

In [None]:
#f, ax = plt.subplots(2, 2, sharey=True)
plt.figure(figsize=(15,10))
for i in range(4):
    plt.subplot(2, 2, i+1)
    sns.boxplot(x=features_train[categorical_features[i]], y=salaries_train['salary'], palette='rainbow')

### ---- 9 Establish a baseline ----

#### Metric

We use MSE as metric

#### Baseline

Let's create a simple model which for a given job offer, use the average salary of the feature **jobType** as prediction. We make this choice because **jobType** feature shows more variability for the target feature.

In [None]:
def set_predictor():
    avg = {}
    for jobType in features_train['jobType'].unique():
        idx = features_train.loc[features_train['jobType']==jobType].index
        avg[jobType] = salaries_train.loc[idx,'salary'].mean() 
    return avg

In [None]:
set_predictor()

In [None]:
def baseline_predict(X):
    avg_salary_by_jobType = set_predictor()
    return X['jobType'].map(avg_salary_by_jobType)

In [None]:
baseline_salaries_pred = baseline_predict(features_test)

In [None]:
baseline_salaries_pred

#### Baseline performance measure

In [None]:
baseline_score = sk.metrics.mean_squared_error(salaries_test['salary'], baseline_salaries_pred)

In [None]:
baseline_score

**Baseline MSE : 962.47** 

### ---- 10 Hypothesize solution ----

***We propose 3 models to improve the baseline model MSE:***
- ***Ridge regression, because there is a linear relationship between the features and the target ;***
- ***Voting ensemble by combining multiple linear regression;***
- ***Random forest*** 



***The categorical features will be changed to dummy variables***

## Part 3 - DEVELOP

You will cycle through creating features, tuning models, and training/validing models (steps 7-9) until you've reached your efficacy goal

#### Your metric will be MSE and your goal is:
 - <360 for entry-level data science roles
 - <320 for senior data science roles

### ---- 11 Engineer features  ----

***This is feature engineering steps we take:***

- ***Standardizing numerical features;***

- ***One hot encoding categorical features.***

In [None]:
full_pipeline = ColumnTransformer([
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(sparse=False, drop='first'), categorical_features),
    ])

In [None]:
transformed_features_train = full_pipeline.fit_transform(features_train)
transformed_features_train.shape

### ---- 12 Create models ----

Ridge Regression

In [69]:
ridge_parameters =  {"alpha": [1e-8, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 1e-2, 1e-1, 1.0, 1e2, 1e3, 1e4, 5e4, 1e5, 6e5, 1e6]}
ridge_grid_search_cv = GridSearchCV(Ridge(), ridge_parameters, n_jobs=-1, verbose=2, cv=5)
ridge_grid_search_cv.fit(transformed_features_train, salaries_train)

Fitting 5 folds for each of 17 candidates, totalling 85 fits


GridSearchCV(cv=5, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [1e-08, 1e-06, 1e-05, 5e-05, 0.0001, 0.0005,
                                   0.001, 0.01, 0.1, 1.0, 100.0, 1000.0,
                                   10000.0, 50000.0, 100000.0, 600000.0,
                                   1000000.0]},
             verbose=2)

Voting Ensemble

In [36]:
rf_parameters =  {"max_depth": [20, 23, 25], "max_features":[20, 23, 25, 27],
                            "max_leaf_nodes":[3000, 3500, 4000], }
rf_grid_search_cv = GridSearchCV(RandomForestRegressor(random_state=42), rf_parameters, n_jobs=-1, 
                                            verbose=5, cv=5, scoring='neg_mean_squared_error', refit=True)
rf_grid_search_cv.fit(transformed_features_train, salaries_train['salary'].values)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [20, 23, 25],
                         'max_features': [20, 23, 25, 27],
                         'max_leaf_nodes': [3000, 3500, 4000]},
             scoring='neg_mean_squared_error', verbose=5)

In [37]:
rf_grid_search_cv.best_estimator_

RandomForestRegressor(max_depth=20, max_features=20, max_leaf_nodes=4000,
                      random_state=42)

In [39]:
rf_grid_search_cv.best_score_

-375.2144778796933

In [143]:
ridge_grid_search_cv.best_estimator_

Ridge()

In [40]:
decision_tree_grid_search_cv.best_estimator_

DecisionTreeRegressor(max_depth=10, max_features=15, max_leaf_nodes=35,
                      random_state=42)

In [43]:
decision_tree_grid_search_cv.best_score_

0.555103005189002

Random Forest

### ---- 13 Test models ----

In [41]:
transformed_features_test = full_pipeline.fit_transform(features_test)

Ridge regression

In [137]:
ridge_salaries_pred = ridge_grid_search_cv.predict(transformed_features_test)

In [138]:
ridge_score = sk.metrics.mean_squared_error(salaries_test['salary'], ridge_salaries_pred)

In [139]:
ridge_score

383.0554213546274

Decision Tree

In [44]:
decision_tree_salaries_pred = decision_tree_grid_search_cv.predict(transformed_features_test)

In [45]:
decision_tree_score = sk.metrics.mean_squared_error(salaries_test['salary'], decision_tree_salaries_pred)

In [46]:
decision_tree_score

659.9739226097992

In [1]:
#do 5-fold cross validation on models and measure MSE

### ---- 10 Select best model  ----

In [None]:
#select the model with the lowest error as your "prodcuction" model

## Part 4 - DEPLOY

### ---- 11 Automate pipeline ----

In [None]:
#write script that trains model on entire training set, saves model to disk,
#and scores the "test" dataset

### ---- 12 Deploy solution ----

In [16]:
#save your prediction to a csv file or optionally save them as a table in a SQL database
#additionally, you want to save a visualization and summary of your prediction and feature importances
#these visualizations and summaries will be extremely useful to business stakeholders

### ---- 13 Measure efficacy ----

We'll skip this step since we don't have the outcomes for the test data