In [28]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
%matplotlib inline
sns.set_style('white')

In [147]:
# https://www.kaggle.com/ludobenistant/hr-analytics
df = pd.read_csv("./data/hr.csv")
df.head(3)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium


In [137]:
# Tanslate salary into a scale from 1-3 (1 = low, 3 = high)
df.replace({"salary" : {"low": 1, "medium": 2, "high" : 3 }}, inplace=True)

# Create new features
df["_statisfaction_evaluation"] = df["satisfaction_level"] * df["last_evaluation"]
df["_salary_satisfaction"] = df["salary"] * df["satisfaction_level"]
df["_work_accident_2"] = df["Work_accident"] ** 2
df["_work_accident_3"] = df["Work_accident"] ** 3

# Drop text features
del df["sales"]

In [138]:
# Create our train and test data
trainsize = int(df.shape[0] / 2)
df_train = df.iloc[:trainsize, :].copy()
df_test = df.iloc[trainsize:, :].copy()

# Create our Xs and Ys
Y_train = df_train["left"].values.reshape(-1, 1)
X_train = df_train.loc[:, ~(df_train.columns).isin(["left"])]

Y_test = df_test["left"].values.reshape(-1, 1)
X_test = df_test.loc[:, ~(df_train.columns).isin(["left"])]

In [139]:
# Take a look at our new dataframe
df.head(3)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,_statisfaction_evaluation,_salary_satisfaction,_work_accident_2,_work_accident_3
0,0.38,0.53,2,157,3,0,1,0,1,0.2014,0.38,0,0
1,0.8,0.86,5,262,6,0,1,0,2,0.688,1.6,0,0
2,0.11,0.88,7,272,4,0,1,0,2,0.0968,0.22,0,0


In [144]:
# Vanilla regression
vanilla = linear_model.LinearRegression()
vanillafit = vanilla.fit(X_train, Y_train)

# See the results for the training set
print('R-squared for vanilla regression:', vanilla.score(X_train, Y_train))
print('\nParameter estimates for vanilla regression:')
print(np.append(vanillafit.coef_, vanillafit.intercept_))

# See the results for the test set
print('R-squared for vanilla regression (test):', vanilla.score(X_test, Y_test))

R-squared for vanilla regression: 0.287407914726

Parameter estimates for vanilla regression:
[ -2.04922669e+00  -9.18973591e-01  -2.86864997e-02   6.63591725e-04
   1.05836728e-01  -4.85796725e-02  -7.43014813e-02  -1.77616812e-01
   1.54241385e+00   1.66669587e-01  -4.85796725e-02  -4.85796725e-02
   1.25788445e+00]
R-squared for vanilla regression (test): 0.011000873772


In [145]:
# Ridge regression
ridg = linear_model.Ridge(alpha=10, fit_intercept=False)
ridgfit = ridg.fit(X_train, Y_train)

# See the results for the training set
print('R-squared for ridge:', ridg.score(X_train, Y_train))
print('\nParameter estimates for ridge:')
print(np.append(ridgfit.coef_, ridgfit.intercept_))

# See the results for the test set
print('R-squared for ridge (test):', ridg.score(X_test, Y_test))

R-squared for ridge: 0.246170464459

Parameter estimates for ridge:
[-0.2315697   0.22917024 -0.04220256  0.000784    0.12673408 -0.04733489
 -0.06520805  0.02697761 -0.18224349 -0.12584151 -0.04733489 -0.04733489
  0.        ]
R-squared for ridge (test): -0.117064911806


In [146]:
# Lasson regression
lass = linear_model.Lasso(alpha=0.35)
lassfit = lass.fit(X_train, Y_train)

# See the results for the training set
print('R-squared for lasso:', lass.score(X_train, Y_train))
print('\nParameter estimates for lasso:')
print(np.append(lassfit.coef_, lassfit.intercept_))

# See the results for the test set
print('R-squared for lasso (test):', lass.score(X_test, Y_test))

R-squared for lasso: 0.00591336128193

Parameter estimates for lasso:
[-0.         -0.          0.          0.00055004  0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.          0.15595495]
R-squared for lasso (test): -0.0156230999061


## Questions

- What does a negative r-squared mean?
- It looks like the best model is vanilla regression, is that correct?
- All 3 models perform really poorly. What other features should we build?



In [163]:
# Should we try this with a different dataset?
# http://share.mailcharts.com/3g1g1K1y3O1Q

# https://ucr.fbi.gov/crime-in-the-u.s/2013/crime-in-the-u.s.-2013/tables/table-8/table-8-state-cuts/table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls
col_names = ["city", "population", "violent crime", "murder", "rape 1", "rape 2", "robbery", "assault", "property crime", "burglary", "larceny", "motor theft", "arson"]
df = pd.read_csv("./data/ny_crime.csv", names=col_names, header=0)
df.head(3)

Unnamed: 0,city,population,violent crime,murder,rape 1,rape 2,robbery,assault,property crime,burglary,larceny,motor theft,arson
0,Adams Village,1861,0,0,,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0,,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0,,0,0,3,16,1,15,0,0.0
