In [33]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge


In [None]:

df = pd.read_csv("../cleaned_train.csv")
df = df.drop(columns = ['Unnamed: 0'])

In [2]:
X_train, Locked_X, y_train, Locked_y = train_test_split(df.drop(columns = ['totals.transactionRevenue', 'class_pred', 'fullVisitorId']), df[['totals.transactionRevenue', 'class_pred', 'fullVisitorId']], test_size=0.33, shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, shuffle = False)
print("X_train: " + str(X_train.shape))
print("X_test: " + str(X_test.shape))
print("y_train: " + str(y_train.shape))
print("y_test: " + str(y_test.shape))
print("Locked_X: " + str(Locked_X.shape))
print("Locked_y: " + str(Locked_y.shape))


print("sum locked: ", sum(Locked_y['class_pred']))
print("sum train: ", sum(y_train['class_pred']))
print("sum test: ", sum(y_test['class_pred']))

X_train: (766871, 57)
X_test: (377714, 57)
y_train: (766871, 3)
y_test: (377714, 3)
Locked_X: (563752, 57)
Locked_y: (563752, 3)
('sum locked: ', 6247.0)
('sum train: ', 8406.0)
('sum test: ', 3861.0)


# Fit PCA

In [3]:
#PCA keeping 95% of variance
pca = PCA(.95)

pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print(pca.singular_values_)  



[1552.81098771 1200.75632511 1031.19128675  873.06689032  849.09972136
  555.84804885  546.88567088  528.94351139  495.55638516  450.3575826
  439.24008569  425.66397314  401.99561355  381.43843277  372.28267389
  365.01160136  356.12259497  348.600143    325.31897493  310.82613346
  263.56327102]


# Fit Random Forest

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [25]:
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf = .1, random_state=0, class_weight='balanced')
# rf.fit(X_train, y_train['class_pred'])
# y_pred = rf.predict(X_test)
print(cross_val_score(rf, X_train, y_train['class_pred'], cv=3))

rf.fit(X_train, y_train['class_pred'])
y_pred = rf.predict(X_test)
print("Confusion matrix no PCA: ", confusion_matrix(y_test['class_pred'], y_pred))
print("f1-score no PCA: ", f1_score(y_test['class_pred'], y_pred))

[0.88504601 0.88452571 0.86840777]
('Confusion matrix no PCA: ', array([[330147,  43706],
       [   111,   3750]]))
('f1-score no PCA: ', 0.14615039850341993)


In [19]:
# print("Confusion matrix no PCA: ", confusion_matrix(y_test['class_pred'], y_pred))
# print("f1-score no PCA: ", f1_score(y_test['class_pred'], y_pred))

('Confusion matrix no PCA: ', array([[338678,  35175],
       [    78,   3783]]))
('f1-score no PCA: ', 0.17669726056190008)


In [26]:
rf_pca = RandomForestClassifier(n_estimators=100, min_samples_leaf = .1, random_state=0, class_weight='balanced')
print(cross_val_score(rf_pca, X_train_pca, y_train['class_pred'], cv=3))

rf_pca.fit(X_train_pca, y_train['class_pred'])
y_pred_pca = rf_pca.predict(X_test_pca)


print("Confusion matrix PCA: ", confusion_matrix(y_test['class_pred'], y_pred))
print("f1-score PCA: ", f1_score(y_test['class_pred'], y_pred))

[0.90786076 0.90864316 0.90481686]
('Confusion matrix PCA: ', array([[330147,  43706],
       [   111,   3750]]))
('f1-score PCA: ', 0.14615039850341993)


# Fit Lasso/Ridge/RF regression ensemble

In [28]:
X_Reg = X_test[y_pred_pca ==1]
y_Reg = y_test[y_pred_pca == 1]

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_Reg, y_Reg, test_size=0.33, shuffle = False)

In [49]:
#or train on same training as classification
X_train_reg = X_train[y_train['class_pred']==1]
y_train_reg = y_train[y_train['class_pred']==1]

X_test_reg = X_test[y_pred_pca ==1]
y_test_reg = y_test[y_pred_pca == 1]

In [50]:
rf = RandomForestRegressor(n_estimators = 100, max_features = 'log2', min_samples_leaf= 0.001) 
ridge = Ridge(alpha = .202)
lasso = Lasso(alpha = 1e-15)

rf.fit(X_train_reg, y_train_reg['totals.transactionRevenue'])
ridge.fit(X_train_reg, y_train_reg['totals.transactionRevenue'])
lasso.fit(X_train_reg, y_train_reg['totals.transactionRevenue'])

rf_pred = rf.predict(X_test_reg)
ridge_pred = ridge.predict(X_test_reg)
lasso_pred = lasso.predict(X_test_reg)



In [51]:
final_pred = pd.DataFrame({"random_forest": rf_pred, "ridge": ridge_pred, "lasso": lasso_pred}).mean(axis=1)

In [52]:
import math
mse_comb=mean_squared_error(y_test_reg['totals.transactionRevenue'], final_pred)
rmse_comb=math.sqrt(mse_comb)
print("Combined: root mean squared error", rmse_comb)

mse_rf=mean_squared_error(y_test_reg['totals.transactionRevenue'], rf_pred)
rmse_rf=math.sqrt(mse_rf)
print("RF: root mean squared error", rmse_rf)

mse_ridge=mean_squared_error(y_test_reg['totals.transactionRevenue'], ridge_pred)
rmse_ridge=math.sqrt(mse_ridge)
print("Ridge: root mean squared error", rmse_ridge)

mse_lasso=mean_squared_error(y_test_reg['totals.transactionRevenue'], lasso_pred)
rmse_lasso=math.sqrt(mse_lasso)
print("Lasso:root mean squared error", rmse_lasso)



('Combined: root mean squared error', 282378064.74971926)
('RF: root mean squared error', 167974414.1060032)
('Ridge: root mean squared error', 339250199.055082)
('Lasso:root mean squared error', 401394158.36334455)


In [54]:
import numpy as np
np.sort([rmse_comb, rmse_rf, rmse_ridge, rmse_lasso])

array([1.67974414e+08, 2.82378065e+08, 3.39250199e+08, 4.01394158e+08])

In [59]:
#Combine results like Kaggle wants

#y_test['final_prediction'] = y_pred_pca

total_pred = []
cnt = 0
for x in y_pred_pca: 
    if x == 0:
        total_pred.append(x)
    else:
        total_pred.append(final_pred[cnt])
        cnt +=1
        
y_test['final_prediction'] = total_pred

In [61]:
mse=mean_squared_error(y_test['totals.transactionRevenue'], y_test['final_prediction'])
rmse=math.sqrt(mse)
print("Final prediction per entry: ", rmse)

('Final prediction per entry: ', 90723344.43640944)


In [64]:
list(y_test)

['totals.transactionRevenue',
 'class_pred',
 'fullVisitorId',
 'final_prediction']

# Combine by User for Kaggle

In [76]:
group = y_test.groupby('fullVisitorId')

dfpredictions = group.apply(lambda x: x['final_prediction'].unique())
dftrue = group.apply(lambda x: x['totals.transactionRevenue'].unique())




In [88]:
dftrue.rename("true")
concatenated = pd.concat([dfpredictions, dftrue], axis=1)
concatenated.rename(columns={0: "predicted", 1: "true"},inplace = True)
concatenated.head()

Unnamed: 0_level_0,predicted,true
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1
4823595352351,[0.0],[0.0]
17196000342279,[0.0],[0.0]
18672749561458,[0.0],[0.0]
58970809397690,[0.0],[0.0]
59381693533730,[0.0],[0.0]


In [106]:
def sumSeries(df, col):
    newCol = []
    for x in df[col]:
#         print(np.sum(x) + 1)
        if np.sum(x) <0:
            newCol.append(math.log( 1))
        else:
            newCol.append(math.log(np.sum(x) + 1))
    return newCol

concatenated['predicted'] = sumSeries(concatenated, 'predicted')
print("done 1")
concatenated['true'] = sumSeries(concatenated, 'true')



done 1


In [107]:
mse=mean_squared_error(concatenated['true'],concatenated['predicted'])
rmse=math.sqrt(mse)
print(rmse)

5.4269493478


In [105]:
dfpredictions.iloc[68:70]

fullVisitorId
643479242968736    [0.0, -2403686.6071963185]
648792487151265                         [0.0]
dtype: object

In [None]:
df2.head()