In [72]:
import numpy as np
import pandas as pd
import scipy.linalg as LA
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.image as mpimg
import sklearn
from PIL import Image
from scipy.misc import imread
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.linear_model import Ridge, LassoCV, Lasso
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

In [73]:
# Read the labelled training rows and the unlabelled test rows from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Stack the feature columns (RESOURCE through ROLE_CODE, label-based inclusive slice)
# of both frames, train rows first, so preprocessing is applied to them together.
all_data = pd.concat([frame.loc[:, 'RESOURCE':'ROLE_CODE'] for frame in (train, test)])

# Preview the raw training data.
train.head()

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [74]:
# Keep a log1p-transformed copy of the label in a separate "Action" column.
# The modelling target `y` defined at the end of this cell is the raw ACTION column.
train["Action"] = np.log1p(train["ACTION"])

# Columns whose dtype is not "object" are treated as numeric features.
column_dtypes = all_data.dtypes
numeric_feats = column_dtypes[column_dtypes != "object"].index

# Skewness of every numeric feature, measured on the training rows only (NaNs dropped).
feature_skewness = train[numeric_feats].apply(lambda column: skew(column.dropna()))

# Features with skewness above 0.75 are log1p-transformed in the combined frame.
# NOTE(review): the column names suggest these are categorical ID codes -- confirm
# that a log transform is meaningful for them.
skewed_feats = feature_skewness[feature_skewness > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-hot encode any non-numeric columns (numeric columns pass through unchanged).
all_data = pd.get_dummies(all_data)

# Replace missing values with the mean of their column.
all_data = all_data.fillna(all_data.mean())

# Split the combined frame back into train/test matrices for sklearn:
# the first train.shape[0] rows are the training rows, the remainder the test rows.
n_train_rows = train.shape[0]
X_train = all_data.iloc[:n_train_rows]
X_test = all_data.iloc[n_train_rows:]
y = train["ACTION"]

In [75]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,10.580353,11.355991,117961,118300,123472,11.677643,11.677651,290919,11.677668
1,9.751734,7.340187,117961,118343,123125,11.68298,11.68298,308574,11.683006
2,10.511213,9.579003,118219,118220,117884,11.677422,12.498567,19721,11.677431
3,10.495045,8.593599,117961,118343,119993,11.681165,12.392486,290919,11.681173
4,10.661509,8.683724,117929,117930,119569,11.689598,11.727496,19793,11.689615


In [76]:
# Preview the first rows of the test feature matrix.
X_test.head()

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,11.274249,11.194578,118079,118080,117878,11.677422,11.679947,19721,11.677431
1,10.612631,8.384576,117961,118327,118507,11.685735,11.71185,118398,11.685752
2,11.231146,7.781556,117961,118300,119488,11.679905,12.616641,249618,11.67993
3,10.674059,9.902837,117961,118225,118403,11.701676,11.821792,118960,11.701685
4,10.64766,10.820098,117961,118343,119598,11.682018,12.611994,118424,11.682044


In [77]:
# Number of training labels; should equal the number of rows in X_train.
y.shape

(32769,)

We tried several `LogisticRegression` solvers, namely `lbfgs`, `sag`, and `newton-cg`.

In [78]:
# Fit a logistic-regression classifier on the training matrix;
# fit() returns the fitted estimator itself.
log = LogisticRegression(solver='lbfgs').fit(X_train, y)

# Predicted class labels (not probabilities) for the test rows.
predictions = log.predict(X_test)

# Build the submission table keyed by the test-set id and write it to disk.
results = pd.DataFrame({"Id": test["id"], "Action": predictions})
results.to_csv("LogisticResults.csv", index=False)

In [79]:
# Show only the first rows of the submission table instead of dumping the whole frame.
results.head(10)

Unnamed: 0,Action,Id
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
5,1,6
6,1,7
7,1,8
8,1,9
9,1,10


In [80]:
# Mean accuracy of the classifier on the same training data it was fitted on
# (this is not a held-out estimate).
log.score(X_train, y)

0.94210992096188473

Our attempt at one-hot encoding:

In [81]:
# Attempt at one-hot encoding.
# NOTE(review): this encodes train['Action'] (the label-derived column), not the
# feature columns, and the encoded matrix is not consumed by any later cell visible here.
encoder = sklearn.preprocessing.OneHotEncoder()
label_encoder = sklearn.preprocessing.LabelEncoder()

# Map the distinct values of train['Action'] to integer codes and overwrite the column.
data_label_encoded = label_encoder.fit_transform(train['Action'])
train['Action'] = data_label_encoded

# DataFrame.as_matrix() was removed in pandas 1.0 and raises AttributeError there;
# .values yields the same 2-D ndarray and exists in both old and new pandas versions.
data_feature_one_hot_encoded = encoder.fit_transform(train[['Action']].values)

#data_feature_one_hot_encoded[:,1]

In [82]:
# Gradient-boosted regressor fitted directly on the 0/1 ACTION target, so its
# predictions are continuous scores rather than class labels.
model_xgb = xgb.XGBRegressor(
    n_estimators=10000,
    max_depth=20,
    learning_rate=0.2,
)
model_xgb.fit(X_train, y)

# Score every test row with the fitted model.
xgb_preds = model_xgb.predict(X_test)

# Submission table: the test-set id column first, then the predicted score.
results = test[["id"]].copy()
results["ACTION"] = xgb_preds
results.to_csv("XGBresults2.csv", index=False)

In [83]:
# Show only the first rows of the training frame (now including the encoded
# "Action" column) instead of dumping the whole frame.
train.head(10)

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE,Action
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908,1
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539,1
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880,1
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322,1
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325,1
5,0,45333,14561,117951,117952,118008,118568,118568,19721,118570,0
6,1,25993,17227,117961,118343,123476,118980,301534,118295,118982,1
7,1,19666,4209,117961,117969,118910,126820,269034,118638,126822,1
8,1,31246,783,117961,118413,120584,128230,302830,4673,128231,1
9,1,78766,56683,118079,118080,117878,117879,304519,19721,117880,1


In [84]:
# Show only the first rows of the XGBoost submission table instead of dumping the whole frame.
results.head(10)

Unnamed: 0,id,ACTION
0,1,1.002400
1,2,0.985969
2,3,1.001093
3,4,0.998022
4,5,1.006230
5,6,0.999932
6,7,0.867902
7,8,0.999882
8,9,0.835284
9,10,1.003176


In [85]:
# R^2 of the regressor on the same training data it was fitted on
# (this is not a held-out estimate).
model_xgb.score(X_train, y)

0.99993627361295767

Best XGBoost submission score so far: 0.8284