In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read data_eda.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data_eda.csv')

### Choose Relevant Columns

In [None]:
# Display the available column names before selecting the modelling columns.
df.columns

In [None]:
# Keep the target (avg_salary) plus the candidate predictor columns.
# Every column is listed exactly once: repeating a name (as 'excel' was)
# yields a duplicated, perfectly collinear feature in the design matrix.
model_columns = [
    'avg_salary', 'Rating', 'Size', 'Type of ownership', 'Industry', 'Sector',
    'Revenue', 'num_comp', 'hourly', 'employer_provided', 'job_state',
    'same_state', 'age', 'python', 'aws', 'excel', 'spark', 'job_simp',
    'seniority', 'desc_len',
]
df_model = df[model_columns]

### Get Dummy Data Variables

In [None]:
# One-hot encode the categorical columns of the selected frame.
df_dum = pd.get_dummies(data=df_model)

### Train, Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the avg_salary target from the predictor columns.
y = df_dum['avg_salary'].values
X = df_dum.drop(columns='avg_salary')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Show the feature values of the second test row as a plain list.
list(X_test.iloc[1])

### Multiple Linear Regression

In [None]:
import statsmodels.api as sm

# Fit an OLS model on the full data set and display its regression summary.
# The intercept column is added to a separate frame (X_sm) so that X itself is
# left untouched; rebinding X here would leak a 'const' column into any later
# re-run of the train/test split cell.
X_sm = sm.add_constant(X)
model = sm.OLS(y.astype(float), X_sm.astype(float))
model.fit().summary()

In [None]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score

In [None]:
# Baseline model: ordinary least squares fitted on the training split.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
# 3-fold cross-validated negative MAE for the linear model (one score per fold).
cross_val_score(
    estimator=lm,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=3,
)

### Lasso Regression

In [None]:
# Lasso regression with a fixed regularisation strength, fitted on the training split.
lml = Lasso(alpha=0.13)
lml.fit(X=X_train, y=y_train)

In [None]:
# 3-fold cross-validated negative MAE for the Lasso model (one score per fold).
cross_val_score(
    estimator=lml,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=3,
)

In [None]:
# Sweep the Lasso regularisation strength over 0.01 .. 0.99 and record the
# mean 3-fold cross-validated negative MAE for each value.
alpha = []
error = []

for i in range(1, 100):
    alpha_value = i / 100
    # Use a throwaway estimator instead of rebinding `lml`: cross_val_score
    # fits clones, so reusing the name would leave `lml` pointing at an
    # unfitted model and break the later `lml.predict(X_test)` call.
    lasso_candidate = Lasso(alpha=alpha_value)
    cv_scores = cross_val_score(
        lasso_candidate, X_train, y_train,
        scoring='neg_mean_absolute_error', cv=3,
    )
    alpha.append(alpha_value)
    error.append(np.mean(cv_scores))

plt.plot(alpha, error)

In [None]:
# Pair each alpha with its mean CV score. Note that this rebinds `error` from
# a list of scores to a tuple of (alpha, score) pairs, so the cell is not
# safe to run twice in a row.
error = tuple((a, e) for a, e in zip(alpha, error))

In [None]:
# Tabulate the (alpha, score) pairs so the best alpha can be looked up.
df_err = pd.DataFrame(data=error, columns=['alpha', 'error'])

In [None]:
# Scores are negative MAE, so the largest value marks the best alpha.
df_err.loc[df_err['error'] == max(df_err['error'])]

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the cross-validation score (and the grid search that
# reuses `rf`) is reproducible across kernel restarts; 42 matches the seed
# used for the train/test split.
rf = RandomForestRegressor(random_state=42)
np.mean(cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error', cv=3))

### Hyperparameter Tuning: GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
parameters = {
    'n_estimators': range(10, 300, 10),
    'criterion': ("squared_error", "absolute_error"),
    # 1.0 means "use all features at each split", which is what the removed
    # 'auto' option meant for RandomForestRegressor; 'auto' is rejected by
    # scikit-learn >= 1.3.
    'max_features': (1.0, 'sqrt', 'log2'),
}

In [None]:
# Exhaustive search over `parameters`, scored by 3-fold negative MAE.
gs = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    cv=3,
)
gs.fit(X_train, y_train)

In [None]:
# Only a cell's last expression is displayed, so return both values as a
# tuple instead of silently discarding the best score.
gs.best_score_, gs.best_estimator_

### Test Ensembles

In [None]:
# Predict on the held-out test split with each fitted model.
tpred_lm = lm.predict(X=X_test)
tpred_lml = lml.predict(X=X_test)
tpred_rf = gs.best_estimator_.predict(X=X_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Test-set MAE of the linear regression model.
mean_absolute_error(y_true=y_test, y_pred=tpred_lm)

In [None]:
# Test-set MAE of the Lasso model.
mean_absolute_error(y_true=y_test, y_pred=tpred_lml)

In [None]:
# Test-set MAE of the tuned random forest.
mean_absolute_error(y_true=y_test, y_pred=tpred_rf)

In [None]:
# Test-set MAE of a simple ensemble: the element-wise average of the Lasso
# and random-forest predictions.
mean_absolute_error(
    y_true=y_test,
    y_pred=(tpred_lml + tpred_rf) / 2,
)

In [None]:
import pickle

In [None]:
# Persist the tuned model. The context manager guarantees the file handle is
# flushed and closed; a bare open(...) passed to pickle.dump is never closed.
pickl = {'model': gs.best_estimator_}
with open('model_file' + ".p", "wb") as model_out:
    pickle.dump(pickl, model_out)

In [None]:
# Reload the pickled dictionary and pull the estimator back out.
# Only unpickle files from a trusted source: pickle.load can execute code.
file_name = "model_file.p"
with open(file_name, 'rb') as pickled:
    data = pickle.load(pickled)
model = data['model']

In [None]:
# Sanity-check the reloaded model on the second test row. Selecting with a
# list keeps a one-row DataFrame, so the column names the model was fitted
# with are preserved (a raw reshaped array drops them and triggers
# scikit-learn's feature-name warning).
model.predict(X_test.iloc[[1]])