In [None]:
# Import Pandas and Matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import requests
# Load the survey results CSV (path is relative to the working directory) into a pandas DataFrame
df = pd.read_csv("survey_results_public.csv")

In [None]:
# Confirm the dataframe has the correct number of rows and columns.
# .shape is an attribute holding the (n_rows, n_columns) tuple.
df.shape

In [None]:
# Select the columns we want to use and give the compensation column a shorter name
df = df[['Country', 'EdLevel', 'YearsCodePro', 'Employment', 'ConvertedCompYearly']]
df = df.rename(columns={'ConvertedCompYearly': 'Salary'})
# head is a method: it must be called to return the first rows for display
df.head()

In [None]:
# Shorten the long official country names to common abbreviations in a single pass
df['Country'] = df['Country'].replace({
    'United States of America': 'USA',
    'United Kingdom of Great Britain and Northern Ireland': 'UK',
    'Russian Federation': 'Russia',
})

In [None]:
# Keep only the rows that have a Salary value
df = df.dropna(subset=["Salary"])
df.head()

In [None]:
# Check Info: print the column dtypes and non-null counts of the current frame
df.info()

In [None]:
# Drop rows where any value is NaN
df = df.dropna()
# Count the null values left in each column (every count should be zero after dropna)
df.isnull().sum()

In [None]:
# Keep only rows whose Employment is exactly 'Employed full-time' (this removes
# every other employment answer, not just part-timers), then drop the column
# because it no longer carries any information.
df = df.loc[df['Employment'].eq('Employed full-time')].drop(columns='Employment')
df.info()


In [None]:
# Count data points from each country (used to decide which countries have enough rows to keep)
df['Country'].value_counts()

In [None]:
# Function to remove low count data points
def shorten_categories(categories, cutoff):
    """Build a mapping that folds rare categories into 'Other'.

    Parameters
    ----------
    categories : pandas.Series or dict
        Category label -> number of occurrences, e.g. the result of
        ``Series.value_counts()``.
    cutoff : int or float
        Minimum count a category needs in order to keep its own label.

    Returns
    -------
    dict
        Maps every label in ``categories`` to itself when its count is
        at least ``cutoff``, otherwise to the string 'Other'.
    """
    # .items() yields (label, count) pairs for both a Series and a dict,
    # so no positional indexing into .index / .values is needed.
    return {
        label: (label if count >= cutoff else 'Other')
        for label, count in categories.items()
    }

In [None]:
# Countries with fewer than 600 rows are relabelled 'Other'
country_map = shorten_categories(df.Country.value_counts(), 600)
df['Country'] = df['Country'].map(country_map)
# Re-count to check the grouping
df['Country'].value_counts()

In [None]:
# Box plot of Salary grouped by Country, with the logo image placed in the top-right corner
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
im = plt.imread('logo.png')
df.boxplot('Salary', 'Country', ax=ax)
fig.suptitle('Salary (US$) V Country')
ax.set_title('')  # blank the axes title so only the figure-level title shows
ax.set_ylabel('Salary (US$)')
ax.set_xlabel('Country')
plt.setp(ax.get_xticklabels(), rotation=90)
# Extra axes (left, bottom, width, height in figure fractions) that only holds the logo
newax = fig.add_axes([0.8, 0.8, 0.2, 0.2], anchor='NE', zorder=1)
newax.imshow(im)
newax.set_axis_off()
plt.show()

In [None]:
# Keep salaries in the closed range [10000, 250000] and drop the catch-all 'Other' country group
df = df[df["Salary"].between(10000, 250000)]
df = df[df['Country'] != 'Other']

In [None]:
# Same Salary-by-Country box plot, redrawn on the current contents of df
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
im = plt.imread('logo.png')
df.boxplot('Salary', 'Country', ax=ax)
fig.suptitle('Salary (US$) V Country')
ax.set_title('')  # blank the axes title so only the figure-level title shows
ax.set_ylabel('Salary (US$)')
ax.set_xlabel('Country')
plt.setp(ax.get_xticklabels(), rotation=90)
# Extra axes (left, bottom, width, height in figure fractions) that only holds the logo
newax = fig.add_axes([0.8, 0.8, 0.2, 0.2], anchor='NE', zorder=1)
newax.imshow(im)
newax.set_axis_off()
plt.show()

In [None]:
# List the distinct YearsCodePro answers to see which ones are not plain numbers
df["YearsCodePro"].unique()

In [None]:
def clean_experience(x):
    """Convert a YearsCodePro answer to a number of years.

    Parameters
    ----------
    x : str or number
        Either one of the two open-ended answers
        ('More than 50 years', 'Less than 1 year') or a value
        accepted by ``float()``.

    Returns
    -------
    float
        50.0 for 'More than 50 years', 0.5 for 'Less than 1 year',
        otherwise ``float(x)``.

    Raises
    ------
    ValueError
        If ``x`` is any other string that ``float()`` cannot parse.
    """
    if x == 'More than 50 years':
        return 50.0  # float, so every branch returns the same type
    if x == 'Less than 1 year':
        return 0.5
    return float(x)

# Convert every YearsCodePro answer to a numeric value
df['YearsCodePro'] = df['YearsCodePro'].apply(clean_experience)

In [None]:
# List the distinct EdLevel answers before they are grouped
df["EdLevel"].unique()

In [None]:

def clean_education(x):
    """Collapse a detailed EdLevel answer into one of four coarse groups.

    The checks are substring matches tried in order; the first match wins
    and anything unmatched is labelled 'Less than a Bachelors'.
    """
    groups = (
        (('Bachelor’s degree',), 'Bachelor’s degree'),
        (('Master’s degree',), 'Master’s degree'),
        (('Professional degree', 'Other doctoral'), 'Post grad'),
    )
    for needles, label in groups:
        if any(needle in x for needle in needles):
            return label
    return 'Less than a Bachelors'

# Replace each EdLevel answer with its coarse education group
df['EdLevel'] = df['EdLevel'].apply(clean_education)

In [None]:
# Check which education groups are present after cleaning
df["EdLevel"].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the education groups as integer codes; the fitted encoder is kept
# so the same mapping can be applied to new inputs later
le_education = LabelEncoder()
df['EdLevel'] = le_education.fit_transform(df['EdLevel'])
df["EdLevel"].unique()
# le_education.classes_ holds the original labels, indexed by their integer code

In [None]:

# Encode the country names as integer codes with their own encoder
le_country = LabelEncoder()
df['Country'] = le_country.fit_transform(df['Country'])
df["Country"].unique()

In [None]:
# Separate the target (Salary) from the feature columns
y = df["Salary"]
X = df.drop(columns="Salary")

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline model: linear regression fitted on all rows of X.
# NOTE(review): if Country and EdLevel are integer label codes at this point,
# a linear model will treat them as ordered magnitudes — confirm this is intended.
linear_reg = LinearRegression()
linear_reg.fit(X, y.values)

In [None]:
# Predict salaries for the rows in X with the linear model
y_pred = linear_reg.predict(X)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
# Root-mean-squared error between the true salaries and the current predictions,
# printed as a dollar amount.
# NOTE(review): if y_pred was produced from the training rows this is an in-sample error.
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"${error:,.02f}")

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with a fixed random_state so repeated runs build the same tree
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X, y.values)

In [None]:
# Predict salaries for the rows in X with the decision tree (overwrites the previous y_pred)
y_pred = dec_tree_reg.predict(X)

In [None]:
# Root-mean-squared error of the current predictions, printed as a dollar amount
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"${error:,.02f}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with a fixed random_state for reproducible results
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X, y.values)

In [None]:
# Predict salaries for the rows in X with the random forest (overwrites the previous y_pred)
y_pred = random_forest_reg.predict(X)

In [None]:
# Root-mean-squared error of the current predictions, printed as a dollar amount
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"${error:,.02f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths; None places no limit on depth
max_depth = [None, 2,4,6,8,10,12]
parameters = {"max_depth": max_depth}

# Cross-validated search over max_depth, scored by negative mean squared error
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')
gs.fit(X, y.values)

In [None]:
# Take the best tree found by the grid search
regressor = gs.best_estimator_

# NOTE(review): with GridSearchCV's default refit=True the best estimator is
# already fitted on the full data, so this extra fit looks redundant — confirm.
regressor.fit(X, y.values)
y_pred = regressor.predict(X)
# Root-mean-squared error of the tuned tree on the rows in X
error = np.sqrt(mean_squared_error(y, y_pred))
print("${:,.02f}".format(error))

In [None]:
# Display the feature matrix to recall which columns the model expects and in what order
X

In [None]:
# country, edlevel, yearscode
# One hand-written sample row for a prediction check.
# NOTE(review): this rebinds X, so any earlier value of X is no longer reachable under that name.
X = np.array([["USA", 'Master’s degree', 15 ]])
X

In [None]:
# Replace the text labels in columns 0 and 1 with the codes from the fitted encoders
X[:, 0] = le_country.transform(X[:,0])
X[:, 1] = le_education.transform(X[:,1])
# Cast the whole array to float so the model receives numeric input
X = X.astype(float)
X

In [None]:
# Predict the salary for the encoded sample row
y_pred = regressor.predict(X)
y_pred

In [None]:
import pickle

In [None]:
# Bundle the model with both label encoders and write them to a single pickle file
data = {"model": regressor, "le_country": le_country, "le_education": le_education}
with open('model.pkl', 'wb') as file:
    pickle.dump(data, file)

In [None]:
# Read the bundle back from disk.
# NOTE: pickle.load can execute arbitrary code — only load pickle files you trust.
with open('model.pkl', 'rb') as file:
    data = pickle.load(file)

# Unpack the model and the two encoders (le_country / le_education are rebound to the loaded copies)
regressor_loaded = data["model"]
le_country = data["le_country"]
le_education = data["le_education"]

In [None]:
# Predict with the reloaded model on the current X to check the saved bundle works
y_pred = regressor_loaded.predict(X)
y_pred