<a href="https://www.kaggle.com/code/aminshaker/stackoverflow-2022-salary-prediction?scriptVersionId=180931348" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 1. Import necessary packages

In [None]:
import joblib
import warnings
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt

from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Render Plotly figures with the 'notebook' renderer.
pio.renderers.default = 'notebook'

# NOTE(review): this silences every warning for the rest of the session,
# including pandas chained-assignment and library deprecation warnings.
warnings.filterwarnings('ignore')

# 2. Import dataset

In [None]:
# Load the public survey results from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/stackoverflow-developer-suvery-2022/survey_results_public.csv')
# Display one random row as a quick look at the columns.
df.sample(1)

# 3. EDA

In [None]:
# Column dtypes and non-null counts; 'deep' also measures object-column memory.
df.info(memory_usage='deep')

In [None]:
# List every column name available in the survey data.
df.columns

# 4. Visualization


In [None]:
def plot_bar_chart(df, column, line=""):
    """Show a bar chart of the 12 most frequent values of ``df[column]``.

    Args:
        df: DataFrame holding the column to summarise.
        column: Name of the column whose value frequencies are plotted.
        line: Optional sequence of x-axis labels, applied positionally to the
            counts (most frequent value first). When left as ``""`` (or
            ``None``), the column's own values are used as labels.
    """
    # Count once; value_counts() orders by descending frequency.
    counts = df[column].value_counts().head(12)

    # Check the "not provided" sentinel explicitly: `line == ""` is evaluated
    # element-wise for array-like labels and is ambiguous inside an `if`.
    if line is None or (isinstance(line, str) and line == ""):
        line = counts.index

    fig = px.bar(x=line, y=counts)
    fig.show()

def plot_pie_chart(df, column, line=""):
    """Show a pie chart of the 12 most frequent values of ``df[column]``.

    Args:
        df: DataFrame holding the column to summarise.
        column: Name of the column whose value frequencies are plotted.
        line: Optional sequence of slice names, applied positionally to the
            counts (most frequent value first). When left as ``""`` (or
            ``None``), the column's own values are used as names.
    """
    # Count once; value_counts() orders by descending frequency.
    counts = df[column].value_counts().head(12)

    # Check the "not provided" sentinel explicitly: `line == ""` is evaluated
    # element-wise for array-like labels and is ambiguous inside an `if`.
    if line is None or (isinstance(line, str) and line == ""):
        line = counts.index

    fig = px.pie(names=line, values=counts)
    fig.show()

In [None]:
# Bar chart of the 12 countries with the most respondents.
plot_bar_chart(df, 'Country')

In [None]:
# Short display labels for the MainBranch answers.
# NOTE(review): the labels are matched positionally against value_counts()
# (descending frequency), so this order must mirror the actual ranking — confirm.
line = ['developer', 'student', 'half-developer', 'hobby', 'used-developer', 'neither']

plot_bar_chart(df, 'MainBranch', line)
plot_pie_chart(df, 'MainBranch', line)

In [None]:
# Distribution of the most common Employment answers, as bar and pie charts.
plot_bar_chart(df, 'Employment')
plot_pie_chart(df, 'Employment')

In [None]:
# Distribution of the most common EdLevel answers, as bar and pie charts.
plot_bar_chart(df, 'EdLevel')
plot_pie_chart(df, 'EdLevel')

In [None]:
# Inspect the raw DevType column before it is split on ';' below.
df.DevType

In [None]:
# DevType holds ';'-separated roles per respondent. Drop missing answers,
# split each answer and explode to one row per individual role; this
# replaces the hand-written nested append loop with vectorised pandas calls.
dev_roles = df.DevType.dropna().str.split(';').explode()

dev_type_df = dev_roles.to_frame(name='DevType').reset_index(drop=True)
dev_type_df.value_counts()

In [None]:
# Most common individual developer roles, as bar and pie charts.
plot_bar_chart(dev_type_df, 'DevType')
plot_pie_chart(dev_type_df, 'DevType')

# 5. Feature engineering

In [None]:
# Keep only the modelling columns and rename the compensation column.
# Chaining rename() returns a new frame instead of mutating a slice of the
# full survey data in place (the pattern behind SettingWithCopyWarning).
df = df[['Country', 'EdLevel', 'YearsCodePro', 'ConvertedCompYearly']].rename(
    columns={'ConvertedCompYearly': 'Salary'}
)
df.sample(3)

In [None]:
# Dtypes, non-null counts and deep memory usage of the reduced frame.
df.info(memory_usage='deep')

In [None]:
# Number of missing values per column before cleaning.
df.isnull().sum()

In [None]:
# Drop rows with any missing value. Rebinding to the returned frame avoids
# an in-place mutation of a frame that was itself sliced from the survey data.
df = df.dropna()

In [None]:
# Confirm the row count and dtypes after dropping incomplete rows.
df.info()

In [None]:
# Missing values per column after dropna.
df.isnull().sum()

In [None]:
# Respondents per country, used to choose the cutoff for rare countries.
df.Country.value_counts()

In [None]:
def shorten_categories(categories, cutoff, other_label='Other'):
    """Build a mapping that pools infrequent categories under one label.

    Args:
        categories: Series of counts indexed by category, such as the result
            of ``Series.value_counts()``.
        cutoff: Minimum count (inclusive) for a category to keep its own name.
        other_label: Replacement for categories whose count is below
            ``cutoff``. Defaults to ``'Other'``.

    Returns:
        dict mapping every category in ``categories`` to itself or to
        ``other_label``.
    """
    # Iterate (category, count) pairs directly instead of indexing by position.
    return {
        category: category if count >= cutoff else other_label
        for category, count in categories.items()
    }

In [None]:
# Pool every country with fewer than 199 respondents under 'Other'.
country_counts = df['Country'].value_counts()
country_map = shorten_categories(country_counts, 199)
df['Country'] = df['Country'].map(country_map)
df['Country'].value_counts()

In [None]:
# Salary distribution per country before outlier removal.
fig = px.box(df, x='Country', y='Salary')
fig.show()

In [None]:
# Remove outliers: keep salaries within [10000, 200000] (inclusive) and drop
# the pooled 'Other' countries.
salary_in_range = df.Salary.between(10000, 200000)

# .copy() makes the result an independent frame, so the column assignments
# further down do not write into a slice of the unfiltered data.
df = df[salary_in_range & (df.Country != 'Other')].copy()
df.info()

In [None]:
# Spot-check five random rows after filtering.
df.sample(5)

In [None]:
# Salary distribution per country after outlier removal.
px.box(df, x='Country', y='Salary')

In [None]:
# Raw YearsCodePro answers; these include the text values handled by clean_experience.
df.YearsCodePro.unique()

In [None]:
def clean_experience(x):
    """Convert a YearsCodePro survey answer into a number of years.

    The two textual answers are mapped to fixed values ('More than 50 years'
    to 50, 'Less than 1 year' to 0.5). Any other answer is parsed with
    ``float``.
    """
    special_answers = {
        'More than 50 years': 50,
        'Less than 1 year': 0.5,
    }

    if x in special_answers:
        return special_answers[x]

    return float(x)

# Convert the survey answers to numeric years of professional experience.
df.YearsCodePro = df.YearsCodePro.apply(clean_experience)
# unique must be called; the bare attribute only displays the bound method.
df.YearsCodePro.unique()

In [None]:
# Raw EdLevel answers, before they are grouped by clean_education.
df.EdLevel.unique()

In [None]:
def clean_education(x):
    """Collapse a raw EdLevel answer into one of four education groups.

    Rules are checked in order and the first match wins. Answers matching
    no rule fall into 'Less than a Bachelors'.
    """
    rules = (
        (('Bachelor’s degree',), 'Bachelor’s degree'),
        (('Master’s degree',), 'Master’s degree'),
        (('Professional degree', 'Other doctoral'), 'Post grad'),
    )

    for keywords, group in rules:
        if any(keyword in x for keyword in keywords):
            return group

    return 'Less than a Bachelors'

# Replace each raw EdLevel answer with its education group.
df['EdLevel'] = df['EdLevel'].apply(clean_education)
df['EdLevel'].unique()

In [None]:
# Spot-check five random rows of the cleaned data.
df.sample(5)

In [None]:
# Final dtypes and row count of the modelling frame.
df.info()

# 6. Model Training

In [None]:
# Separate the target (Salary) from the feature columns.
y = df['Salary']
X = df.drop(columns=['Salary'])

X.head()

In [None]:
# Preprocessing: standardise the numeric feature and one-hot encode the
# categorical ones. Categories unseen during fit are ignored at transform time.
numerical_pipeline = Pipeline(steps=[('Scaler', StandardScaler())])
categorical_pipeline = Pipeline(
    steps=[('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))]
)

transformer = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, ['YearsCodePro']),
        ('categorical', categorical_pipeline, ['EdLevel', 'Country']),
    ]
)

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1234)

# Shapes of the resulting train/test partitions.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Baseline model: linear regression on the preprocessed features.
ml_pipeline_1 = Pipeline(
    steps=[
        ('transformer', transformer),
        ('lr', LinearRegression()),
    ]
)
ml_pipeline_1.fit(X_train, y_train)

# Root-mean-squared error on the held-out rows.
y_prediction = ml_pipeline_1.predict(X_test)
error = np.sqrt(mean_squared_error(y_test, y_prediction))

print("${:,.02f}".format(error))

In [None]:
# Decision tree regressor. random_state is fixed (same seed as the train/test
# split) so the reported RMSE is reproducible between runs.
ml_pipeline_2 = Pipeline([
    ('transformer', transformer),
    ('dt', DecisionTreeRegressor(random_state=1234)),
])

ml_pipeline_2.fit(X_train, y_train)
y_prediction = ml_pipeline_2.predict(X_test)

# Root-mean-squared error on the held-out rows.
error = np.sqrt(mean_squared_error(y_test, y_prediction))

print("${:,.02f}".format(error))

In [None]:
# Random forest regressor. random_state is fixed (same seed as the train/test
# split) so bootstrap sampling, and therefore the RMSE, is reproducible.
ml_pipeline_3 = Pipeline([
    ('transformer', transformer),
    ('rf', RandomForestRegressor(random_state=1234)),
])

ml_pipeline_3.fit(X_train, y_train)
y_prediction = ml_pipeline_3.predict(X_test)

# Root-mean-squared error on the held-out rows.
error = np.sqrt(mean_squared_error(y_test, y_prediction))

print("${:,.02f}".format(error))

In [None]:
# Gradient-boosted trees (XGBoost) on the preprocessed features.
ml_pipeline_4 = Pipeline(
    steps=[
        ('transformer', transformer),
        ('xgb', XGBRegressor()),
    ]
)
ml_pipeline_4.fit(X_train, y_train)

# Root-mean-squared error on the held-out rows.
y_prediction = ml_pipeline_4.predict(X_test)
error = np.sqrt(mean_squared_error(y_test, y_prediction))

print("${:,.02f}".format(error))

In [None]:
# Rows per education group, in ascending order of frequency.
df.EdLevel.value_counts().sort_values()

In [None]:
# One hand-written sample to predict. A plain nested list is used instead of
# np.array: a NumPy array holding strings and a number coerces every value to
# a string, so YearsCodePro would reach the pipeline as '20', not a number.
X_new = [['Iran, Islamic Republic of...', 'Master’s degree', 20.0]]
columns = ['Country', 'EdLevel', 'YearsCodePro']

new_df = pd.DataFrame(X_new, columns=columns)
new_df

In [None]:
# Predicted salary for the hand-written sample, using the XGBoost pipeline.
ml_pipeline_4.predict(new_df)

# 7. Save model

In [None]:
# Persist the fitted pipeline (preprocessing + XGBoost model) to 'xgb.joblib'.
joblib.dump(ml_pipeline_4, 'xgb.joblib')