In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Read the three competition files: the labelled training set, the unlabelled
# test set, and the sample-submission template that will receive predictions.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'Sample_Submission_Tm9Lura.csv')
)

# Copy the identifier columns from the test set into the submission frame
# now, while they still hold their raw (un-preprocessed) values.
for id_col in ('User_ID', 'Product_ID'):
    submission[id_col] = test[id_col]

### Inspect the data

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the training frame.
train.describe()

In [None]:
# Number of distinct User_ID values in the training set.
len(train['User_ID'].unique())

In [None]:
# Number of distinct Product_ID values in the training set.
len(train['Product_ID'].unique())

In [None]:
# Number of distinct Product_ID values in the test set (compare with train).
len(test['Product_ID'].unique())

In [None]:
# Distinct values of the Age column in the training set.
train['Age'].unique()

In [None]:
# Distinct values of the Occupation column in the training set.
train['Occupation'].unique()

In [None]:
# Distinct values of the City_Category column in the training set.
train['City_Category'].unique()

In [None]:
# Distinct values of the Stay_In_Current_City_Years column in the training set.
train['Stay_In_Current_City_Years'].unique()

In [None]:
# Distinct values of Product_Category_1 in the training set.
train['Product_Category_1'].unique()

In [None]:
# Distinct values of Product_Category_2 in the training set
# (unique() also reports NaN if the column has missing entries).
train['Product_Category_2'].unique()

In [None]:
# Distinct values of Product_Category_3 in the training set
# (unique() also reports NaN if the column has missing entries).
train['Product_Category_3'].unique()

### Preprocess Input Data

In [None]:
# User_ID preprocessing, step 1: remove the constant offset shared by all
# ids, e.g. 1000002 -> 2.
user_id_offset = 1000000
train['User_ID'] = train['User_ID'].sub(user_id_offset)
test['User_ID'] = test['User_ID'].sub(user_id_offset)

# Step 2: label-encode the ids. The encoder learns its classes from the
# training ids only and is then applied to both frames, so transform() will
# raise if the test set holds an id the encoder never saw.
enc = LabelEncoder()
enc.fit(train['User_ID'])
train['User_ID'] = enc.transform(train['User_ID'])
test['User_ID'] = enc.transform(test['User_ID'])

In [None]:
# Product_ID preprocess e.g. P00069042 -> 69042
# Step 1: strip the literal 'P00' marker; the column still holds strings
# (e.g. '069042') after this.
train['Product_ID'] = train['Product_ID'].str.replace('P00', '')
test['Product_ID'] = test['Product_ID'].str.replace('P00', '')

# Step 2: standardise the numeric id. The scaler is fitted on the training
# ids only and the same mean/std is then applied to the test ids.
# - astype(float) makes the string -> number conversion explicit instead of
#   relying on the scaler to coerce an object column.
# - Series.reshape no longer exists in pandas, so the 2-D input the scaler
#   requires is built from the underlying NumPy array (.values).
# - ravel() flattens the (n, 1) result back to 1-D before it is stored as a
#   single column.
scaler = StandardScaler()
train['Product_ID'] = scaler.fit_transform(
    train['Product_ID'].astype(float).values.reshape(-1, 1)).ravel()
test['Product_ID'] = scaler.transform(
    test['Product_ID'].astype(float).values.reshape(-1, 1)).ravel()

# Alternative kept for reference only: label-encoding fails here because the
# test set contains Product_ID values that are absent from the training set.
# enc = LabelEncoder()
# train['Product_ID'] = enc.fit_transform(train['Product_ID'])
# test['Product_ID'] = enc.transform(test['Product_ID'])

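Note: the test set contains Product_ID values that do not appear in the training set, so a LabelEncoder fitted on the training data cannot transform them directly. This is why Product_ID is scaled as a number instead.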

In [None]:
# Distinct Product_ID count in the training set at this point in the notebook.
len(train['Product_ID'].unique())

In [None]:
# Distinct Product_ID count in the test set at this point in the notebook.
len(test['Product_ID'].unique())

In [None]:
# Columns that get label-encoded further down.
cat_col = [
    'Gender',
    'City_Category',
]

# Columns that get standardised further down.
num_col = [
    'Age',
    'Occupation',
    'Stay_In_Current_City_Years',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]

In [None]:
# Impute missing values: every NaN in either frame is replaced by one
# constant placeholder.
missing_fill = 0

train = train.fillna(missing_fill)
test = test.fillna(missing_fill)

In [None]:
# Modify age column: swap each age-bracket label for one representative
# number so the column becomes numeric. One shared lookup table is applied to
# both frames; a label missing from the table would map to NaN.
age_to_number = {
    '0-17': 15,
    '18-25': 21,
    '26-35': 30,
    '36-45': 40,
    '46-50': 48,
    '51-55': 53,
    '55+': 60,
}

train['Age'] = train['Age'].map(age_to_number)
test['Age'] = test['Age'].map(age_to_number)

In [None]:
# Modify Stay_In_Current_City_Years: turn the string labels into integers,
# with the open-ended '4+' bucket stored as 4. One shared lookup table is
# applied to both frames; a label missing from the table would map to NaN.
stay_years_to_int = {
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4+': 4,
}

train['Stay_In_Current_City_Years'] = train['Stay_In_Current_City_Years'].map(stay_years_to_int)
test['Stay_In_Current_City_Years'] = test['Stay_In_Current_City_Years'].map(stay_years_to_int)

In [None]:
# Encode categorical columns: each column in cat_col is label-encoded.
# The single encoder object is re-fitted per column on the training values
# and then applied to both frames, so train and test share the same codes.
encoder = LabelEncoder()

for col in cat_col:
    encoder.fit(train[col])
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

In [None]:
# Scale numerical columns: standardise every column in num_col.
# The single scaler object is re-fitted per column on the training values and
# the same mean/std is then applied to the test values.
#
# Series.reshape no longer exists in pandas, so the 2-D input the scaler
# requires is built from the underlying NumPy array (.values); ravel()
# flattens the (n, 1) result back to 1-D before it is stored as a column.

scaler = StandardScaler()

for col in num_col:
    train[col] = scaler.fit_transform(train[col].values.reshape(-1, 1)).ravel()
    test[col] = scaler.transform(test[col].values.reshape(-1, 1)).ravel()

In [None]:
# Preview the first rows of the training frame after the preprocessing cells.
train.head()

### Train and evaluate models

In [None]:
# Separate the target from the features. y is deliberately kept as a
# one-column DataFrame (double brackets), not a Series.
target_col = 'Purchase'
y = train[[target_col]]
X = train.drop([target_col], axis=1)

# The competition test set is used as-is for the final predictions.
X_test = test

# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the shuffled split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    shuffle=True,
)

Models tried so far (the validation RMSE of each is noted in its trailing comment):

lin_reg = LinearRegression() # rmse = 4609.92

tree_reg = DecisionTreeRegressor(random_state=0) #rmse = 3786.33

forest_reg = RandomForestRegressor(n_estimators=25, random_state=0) # rmse = 2786.273

ada_reg = AdaBoostRegressor(n_estimators=25, random_state=0) # rmse = 3855.36

gradient_reg = GradientBoostingRegressor(n_estimators=40,  learning_rate=1.0, random_state=0) # rmse = 2829.88 (with n_estimators=40, learning_rate=1.0)

In [None]:
# Validating the model: fit XGBoost on the training split and report the
# root-mean-squared error on the held-out validation split.

xgb_reg = XGBRegressor(learning_rate=1.0, max_depth=6, min_child_weight=40, seed=0)

xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

# Arguments follow the documented (y_true, y_pred) order; MSE is symmetric,
# so the value is the same as before.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# The Python 2 print statement is a SyntaxError on Python 3. Formatting into
# one string first prints the same "<model> <rmse>" text on both versions.
print('{} {}'.format(xgb_reg, rmse))

In [None]:
# Training using entire data to improve accuracy: refit the same estimator
# on all labelled rows (training + validation splits together), then predict
# the competition test set.
xgb_reg.fit(X, y)
predict = xgb_reg.predict(X_test)

# Attach the predictions as the Purchase column and write the submission.
# Note: this is the same path the sample-submission template was read from,
# so that file is overwritten.
submission = submission.assign(Purchase=predict)
submission.to_csv('Sample_Submission_Tm9Lura.csv', index=False)

Public leaderboard (LB) score: 2574.95