In [179]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn import ensemble

In [180]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [181]:
categorical_columns = ["Product_ID","Gender","Age","Occupation","City_Category","Stay_In_Current_City_Years",
                       "Marital_Status","Product_Category_1","Product_Category_2","Product_Category_3"]

In [182]:
train_y = np.array(train["Purchase"])

In [183]:
train_X = train.copy()
test_X = test.copy()

In [184]:
train_X = train_X.fillna(-999)
test_X = test_X.fillna(-999)

In [185]:
#adding count feature 
from collections import Counter

#change directly in df
def count_feature(df_train=None, df_test=None , column=None):
    count_dict = Counter(df_train[column]) 
    
    if 'count_'+column not in df_train.columns:
        df_train['count_'+column] = df_train[column].map(count_dict)
        df_test['count_'+column] = df_test[column].map(count_dict)
    else:
        print("feature already present :{}".format('count_'+column))

In [186]:
count_feature(train_X,test_X, "User_ID" )
count_feature(train_X,test_X, "Product_ID" )
count_feature(train_X,test_X, "Occupation" )
count_feature(train_X,test_X, "Marital_Status" )
count_feature(train_X,test_X,  "Product_Category_1")
count_feature(train_X,test_X,  "Product_Category_2")
count_feature(train_X,test_X,  "Product_Category_3")

In [187]:
train_X.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,count_User_ID,count_Product_ID,count_Occupation,count_Marital_Status,count_Product_Category_1,count_Product_Category_2,count_Product_Category_3
0,1000001,P00069042,F,0-17,10,A,2,0,3,-999.0,-999.0,8370,35,227,12930,324731,20213,173638,383247
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200,35,581,12930,324731,140378,16466,18428
2,1000001,P00087842,F,0-17,10,A,2,0,12,-999.0,-999.0,1422,35,102,12930,324731,3947,173638,383247
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,-999.0,1057,35,341,12930,324731,3947,55108,383247
4,1000002,P00285442,M,55+,16,C,4+,0,8,-999.0,-999.0,7969,77,203,25371,324731,113925,173638,383247


In [188]:
#manual method in aggregate 

def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_


#X_train.groupby('User_ID').agg({'Product_Category_1' : ['min', 'max', 'mean', percentile(0.25), percentile(0.75)])

In [189]:
temp = train_X.groupby('Product_ID').agg({'Purchase' : ['min', 'max', 'mean', percentile(0.25), percentile(0.75) ]})

temp.columns = ['Product_Purchase_min', "Product_Purchase_max", "Product_Purchase_mean",
                "Product_Purchase_per_25", "Product_Purchase_per_75"]

X_train = pd.merge(train_X,temp.reset_index(), how='left', left_on="Product_ID", right_on="Product_ID")
X_test =  pd.merge(test_X,temp.reset_index(), how='left', left_on="Product_ID", right_on="Product_ID")

In [190]:
temp = train_X.groupby('User_ID').agg({'User_ID' : ['min', 'max', 'mean', percentile(0.25), percentile(0.75) ]})

temp.columns = ['User_Purchase_min', "User_Purchase_max", "User_Purchase_mean",
                "User_Purchase_per_25", "User_Purchase_per_75"]

X_train = pd.merge(train_X,temp.reset_index(), how='left', left_on="User_ID", right_on="User_ID")
X_test =  pd.merge(test_X,temp.reset_index(), how='left', left_on="User_ID", right_on="User_ID")

In [191]:
X_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,...,count_Occupation,count_Marital_Status,count_Product_Category_1,count_Product_Category_2,count_Product_Category_3,User_Purchase_min,User_Purchase_max,User_Purchase_mean,User_Purchase_per_25,User_Purchase_per_75
0,1000001,P00069042,F,0-17,10,A,2,0,3,-999.0,...,12930,324731,20213,173638,383247,1000001,1000001,1000001,1000001.0,1000001.0
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,...,12930,324731,140378,16466,18428,1000001,1000001,1000001,1000001.0,1000001.0
2,1000001,P00087842,F,0-17,10,A,2,0,12,-999.0,...,12930,324731,3947,173638,383247,1000001,1000001,1000001,1000001.0,1000001.0
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,...,12930,324731,3947,55108,383247,1000001,1000001,1000001,1000001.0,1000001.0
4,1000002,P00285442,M,55+,16,C,4+,0,8,-999.0,...,25371,324731,113925,173638,383247,1000002,1000002,1000002,1000002.0,1000002.0


In [192]:
#def label_encoding(df_train=None, df_test=None , var=None):
#    lb = preprocessing.LabelEncoder()
#    full_var_data = pd.concat((df_train[var],df_test[var]),axis=0)
#    lb.fit( full_var_data )
#    df_train[var] = lb.transform(df_train[var])
#    df_test[var] = lb.transform(df_test[var])


#label_encoding(train_X,test_X, "Product_ID" )
#label_encoding(train_X,test_X, "Gender" )
#label_encoding(train_X,test_X, "City_Category" )
#label_encoding(train_X,test_X, "Occupation" ) 

In [193]:
train_X.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
count_User_ID                   int64
count_Product_ID                int64
count_Occupation                int64
count_Marital_Status            int64
count_Product_Category_1        int64
count_Product_Category_2        int64
count_Product_Category_3        int64
dtype: object

In [194]:
categorical_columns = ["Product_ID","Gender","Age","Occupation","City_Category","Stay_In_Current_City_Years",
                       "Marital_Status","Product_Category_1","Product_Category_2","Product_Category_3"]

In [195]:
#Occupation Marital_Status  Product_Category_1
train_X["Occupation"] = train_X["Occupation"].astype('object')
train_X["Marital_Status"] = train_X["Marital_Status"].astype('object')
train_X["Product_Category_1"] = train_X["Product_Category_1"].astype('object')

In [196]:
for var in categorical_columns:
    lb = preprocessing.LabelEncoder()
    print(var)
    full_var_data = pd.concat((train_X[var],test_X[var]),axis=0)
    lb.fit( full_var_data )
    train_X[var] = lb.transform(train_X[var])
    test_X[var] = lb.transform(test_X[var])

Product_ID
Gender
Age
Occupation
City_Category
Stay_In_Current_City_Years
Marital_Status
Product_Category_1
Product_Category_2
Product_Category_3


In [203]:
train_X.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [198]:
## Dropping the unnecessary columns from IDVs ##
train_X = np.array( train_X.drop(['Purchase'],axis=1) )

In [199]:
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.05
params["seed"] = 0
plst = list(params.items())

In [233]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


num_rounds = 5667 

#pass ndarray
def manual_cv(X, y, X_test):
    kf = KFold(n_splits=3,shuffle=True,random_state=123)
    
    fold_score = {}
    k = 0
    
    y_test_fold = {}
    model_fold= {}
    for train_index, test_index in kf.split(X,y):
        X_model = X[train_index]
        y_model = y[train_index]
        X_val   = X[test_index]
        y_val   = y[test_index]
        
        #to hold test prediction per fold
        
        
        X_model_xgb = xgb.DMatrix(X_model, label=y_model, missing = -999)
        X_val_xgb =  xgb.DMatrix(X_val, label=y_val, missing = -999)
        
        x_test_xgb = xgb.DMatrix(X_test, missing = -999)
        
        watchlist = [ (X_model_xgb,'train'), (X_val_xgb,'eval')] #more than one will use last
        
        bst =  xgb.train(plst,X_model_xgb, num_rounds ,early_stopping_rounds=10, evals=watchlist, verbose_eval=10) #
        
        y_predict = bst.predict(X_val_xgb, ntree_limit=bst.best_ntree_limit) 
        y_test = bst.predict(x_test_xgb, ntree_limit=bst.best_ntree_limit)
        
        y_predict_train = bst.predict(X_model_xgb, ntree_limit=bst.best_ntree_limit)
        
        test_error = np.sqrt(mean_squared_error(y_val, y_predict))
        train_error = np.sqrt(mean_squared_error(y_model,  y_predict_train))
        
        
        print("train_rmse :{}\t test_score:{}".format(train_error, test_error))
        
        y_test_fold[k] = y_test
        
        model_fold[k] = bst
                
        fold_score[k] = test_error
        k+=1
    return model_fold, fold_score, y_test_fold

In [234]:
#xgtrain = xgb.DMatrix(train_X, label=train_y, missing = -999)
#xgtest = xgb.DMatrix(test_X,missing = -999)
#num_rounds = 5667
#model = xgb.train(plst, xgtrain, num_rounds)
#pred_test_y_xgb1 = model.predict(xgtest)

In [235]:
test_X = np.array( test_X.drop(['Purchase'],axis=1, errors='ignore') )

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [236]:
model_fold, fold_score, y_test_fold = manual_cv(train_X, train_y, test_X)

[0]	train-rmse:10068.7	eval-rmse:10047.4
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 10 rounds.
[10]	train-rmse:6465.44	eval-rmse:6450.04
[20]	train-rmse:4484.65	eval-rmse:4473.77
[30]	train-rmse:3500.05	eval-rmse:3493.43
[40]	train-rmse:3059.93	eval-rmse:3057.35
[50]	train-rmse:2876.09	eval-rmse:2876.65
[60]	train-rmse:2797.63	eval-rmse:2800.97
[70]	train-rmse:2760.43	eval-rmse:2765.71
[80]	train-rmse:2740.69	eval-rmse:2747.85
[90]	train-rmse:2727.99	eval-rmse:2736.61
[100]	train-rmse:2717.96	eval-rmse:2727.98
[110]	train-rmse:2710.98	eval-rmse:2722.29
[120]	train-rmse:2704.55	eval-rmse:2717.08
[130]	train-rmse:2699.13	eval-rmse:2712.85
[140]	train-rmse:2693.02	eval-rmse:2707.89
[150]	train-rmse:2687.64	eval-rmse:2703.55
[160]	train-rmse:2680.3	eval-rmse:2697.17
[170]	train-rmse:2674.88	eval-rmse:2692.67
[180]	train-rmse:2669.67	eval-rmse:2688.5
[190]	train-rmse:2665.35	eval-rmse:2685.02
[200]	trai

[1870]	train-rmse:2354.46	eval-rmse:2508.88
[1880]	train-rmse:2353.55	eval-rmse:2508.61
[1890]	train-rmse:2352.57	eval-rmse:2508.43
[1900]	train-rmse:2351.37	eval-rmse:2508.16
[1910]	train-rmse:2350.54	eval-rmse:2507.99
[1920]	train-rmse:2349.86	eval-rmse:2507.81
[1930]	train-rmse:2348.98	eval-rmse:2507.58
[1940]	train-rmse:2347.84	eval-rmse:2507.23
[1950]	train-rmse:2346.72	eval-rmse:2506.86
[1960]	train-rmse:2345.78	eval-rmse:2506.57
[1970]	train-rmse:2344.86	eval-rmse:2506.31
[1980]	train-rmse:2343.72	eval-rmse:2505.89
[1990]	train-rmse:2342.68	eval-rmse:2505.6
[2000]	train-rmse:2341.7	eval-rmse:2505.35
[2010]	train-rmse:2340.87	eval-rmse:2505.09
[2020]	train-rmse:2339.83	eval-rmse:2504.8
[2030]	train-rmse:2338.78	eval-rmse:2504.55
[2040]	train-rmse:2337.92	eval-rmse:2504.29
[2050]	train-rmse:2336.98	eval-rmse:2504.11
[2060]	train-rmse:2336.1	eval-rmse:2503.81
[2070]	train-rmse:2335.33	eval-rmse:2503.6
[2080]	train-rmse:2334.51	eval-rmse:2503.38
[2090]	train-rmse:2333.66	eval-rmse:2

[780]	train-rmse:2493.99	eval-rmse:2572.71
[790]	train-rmse:2492.55	eval-rmse:2572.11
[800]	train-rmse:2490.94	eval-rmse:2571.22
[810]	train-rmse:2488.96	eval-rmse:2570.06
[820]	train-rmse:2487.01	eval-rmse:2568.9
[830]	train-rmse:2485.51	eval-rmse:2568.25
[840]	train-rmse:2483.94	eval-rmse:2567.5
[850]	train-rmse:2482.35	eval-rmse:2566.69
[860]	train-rmse:2480.94	eval-rmse:2566
[870]	train-rmse:2479.14	eval-rmse:2565.03
[880]	train-rmse:2477.77	eval-rmse:2564.43
[890]	train-rmse:2475.95	eval-rmse:2563.56
[900]	train-rmse:2473.76	eval-rmse:2562.19
[910]	train-rmse:2472.23	eval-rmse:2561.43
[920]	train-rmse:2471.06	eval-rmse:2560.98
[930]	train-rmse:2469.61	eval-rmse:2560.3
[940]	train-rmse:2468.22	eval-rmse:2559.68
[950]	train-rmse:2466.96	eval-rmse:2559.33
[960]	train-rmse:2465.43	eval-rmse:2558.56
[970]	train-rmse:2464.37	eval-rmse:2558.23
[980]	train-rmse:2462.68	eval-rmse:2557.45
[990]	train-rmse:2460.82	eval-rmse:2556.51
[1000]	train-rmse:2459.1	eval-rmse:2555.58
[1010]	train-rmse

[2670]	train-rmse:2285.39	eval-rmse:2500.26
[2680]	train-rmse:2284.59	eval-rmse:2500.14
[2690]	train-rmse:2283.96	eval-rmse:2500.11
[2700]	train-rmse:2283.13	eval-rmse:2499.92
[2710]	train-rmse:2282.38	eval-rmse:2499.81
[2720]	train-rmse:2281.52	eval-rmse:2499.65
[2730]	train-rmse:2280.8	eval-rmse:2499.56
[2740]	train-rmse:2279.98	eval-rmse:2499.37
[2750]	train-rmse:2279.24	eval-rmse:2499.23
[2760]	train-rmse:2278.63	eval-rmse:2499.17
[2770]	train-rmse:2277.64	eval-rmse:2498.96
[2780]	train-rmse:2276.78	eval-rmse:2498.77
[2790]	train-rmse:2276.04	eval-rmse:2498.67
[2800]	train-rmse:2275.21	eval-rmse:2498.53
[2810]	train-rmse:2274.45	eval-rmse:2498.44
[2820]	train-rmse:2273.49	eval-rmse:2498.27
[2830]	train-rmse:2272.85	eval-rmse:2498.22
[2840]	train-rmse:2271.9	eval-rmse:2498.05
[2850]	train-rmse:2271.23	eval-rmse:2497.95
[2860]	train-rmse:2270.66	eval-rmse:2497.86
[2870]	train-rmse:2269.81	eval-rmse:2497.64
[2880]	train-rmse:2269.12	eval-rmse:2497.56
[2890]	train-rmse:2268.31	eval-rms

[1110]	train-rmse:2442.3	eval-rmse:2539.24
[1120]	train-rmse:2441.03	eval-rmse:2538.68
[1130]	train-rmse:2439.77	eval-rmse:2538.19
[1140]	train-rmse:2438.38	eval-rmse:2537.59
[1150]	train-rmse:2436.87	eval-rmse:2536.88
[1160]	train-rmse:2435.52	eval-rmse:2536.35
[1170]	train-rmse:2434.28	eval-rmse:2535.88
[1180]	train-rmse:2433.12	eval-rmse:2535.36
[1190]	train-rmse:2431.85	eval-rmse:2534.94
[1200]	train-rmse:2430.4	eval-rmse:2534.3
[1210]	train-rmse:2428.88	eval-rmse:2533.76
[1220]	train-rmse:2427.51	eval-rmse:2533.05
[1230]	train-rmse:2426.34	eval-rmse:2532.65
[1240]	train-rmse:2424.95	eval-rmse:2532.05
[1250]	train-rmse:2423.37	eval-rmse:2531.29
[1260]	train-rmse:2422.13	eval-rmse:2530.84
[1270]	train-rmse:2420.94	eval-rmse:2530.33
[1280]	train-rmse:2419.92	eval-rmse:2529.94
[1290]	train-rmse:2419.02	eval-rmse:2529.6
[1300]	train-rmse:2417.74	eval-rmse:2529.12
[1310]	train-rmse:2416.46	eval-rmse:2528.61
[1320]	train-rmse:2415.5	eval-rmse:2528.27
[1330]	train-rmse:2414.17	eval-rmse:2

[2990]	train-rmse:2259.08	eval-rmse:2489.77
[3000]	train-rmse:2258.27	eval-rmse:2489.67
[3010]	train-rmse:2257.53	eval-rmse:2489.56
[3020]	train-rmse:2256.72	eval-rmse:2489.47
[3030]	train-rmse:2255.96	eval-rmse:2489.42
[3040]	train-rmse:2255.04	eval-rmse:2489.26
[3050]	train-rmse:2254.21	eval-rmse:2489.09
[3060]	train-rmse:2253.53	eval-rmse:2489.02
[3070]	train-rmse:2252.71	eval-rmse:2488.96
[3080]	train-rmse:2251.8	eval-rmse:2488.76
[3090]	train-rmse:2251.03	eval-rmse:2488.65
[3100]	train-rmse:2250.2	eval-rmse:2488.5
[3110]	train-rmse:2249.49	eval-rmse:2488.46
Stopping. Best iteration:
[3106]	train-rmse:2249.79	eval-rmse:2488.46

train_rmse :2249.788232816847	 test_score:2488.456615406325


Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [237]:
model_fold

{0: <xgboost.core.Booster at 0x137b472d588>,
 1: <xgboost.core.Booster at 0x137b472d748>,
 2: <xgboost.core.Booster at 0x137b472d240>}

In [238]:
fold_score

{0: 2491.9030071362276, 1: 2492.8776744604706, 2: 2488.456615406325}

In [239]:
y_test_fold

{0: array([14580.013, 10946.438,  7110.686, ..., 11031.276, 17750.178,
         2613.072], dtype=float32),
 1: array([15634.911 , 11220.359 ,  6894.729 , ...,  8840.743 , 18836.645 ,
         2018.2335], dtype=float32),
 2: array([15650.751 , 10321.371 ,  7314.3193, ...,  9467.656 , 18294.773 ,
         2376.5637], dtype=float32)}

In [248]:
y_test_fold[0].shape

(233599,)

In [252]:
y_target = (y_test_fold[0] + y_test_fold[1] + y_test_fold[2])/3

In [254]:
test["Purchase"] = y_target

In [255]:
test[['User_ID','Product_ID','Purchase']].to_csv('Solution.csv',columns = ['User_ID','Product_ID','Purchase'],index = False)

In [256]:
pd.read_csv("data/Solution.csv").head()

Unnamed: 0,User_ID,Product_ID,Purchase
0,1000004,P00128942,15288.559
1,1000009,P00113442,10829.39
2,1000010,P00288442,7106.578
3,1000010,P00145342,2302.0996
4,1000011,P00053842,1675.6019
