### Load Libraries

In [1]:
import pandas as pd
import numpy as np
import gc
import random
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm_notebook

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import xgboost as xgb
import lightgbm as lgb

### Load Train and Test data

In [2]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')
traintest = pd.concat([train_df, test_df], axis = 0)

### Training data details

In [3]:
train_df.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [5]:
train_df.describe()

Unnamed: 0,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,...,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,5944923.0,14654.93,1390.895,26722.45,4530.164,26409.96,30708.11,16865.22,4669.208,2569407.0,...,467605.7,444623.9,805621.9,781296.6,143.529939,121380.9,35734.51,312374.1,92199.6,227910.0
std,8234312.0,389329.8,64283.02,569965.2,235912.4,1514730.0,577059.0,751275.6,187944.9,9610183.0,...,4068038.0,4428889.0,4513246.0,6839451.0,9584.318507,4720709.0,1614622.0,4318501.0,1635993.0,1811139.0
min,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2260000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40000000.0,20000000.0,4000000.0,20000000.0,14800000.0,100000000.0,20708000.0,40000000.0,10400000.0,319612000.0,...,76000000.0,123588000.0,130000000.0,144400000.0,640000.0,301312000.0,106420000.0,140000000.0,61768000.0,43200000.0


### Test Data Details

In [6]:
test_df.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB


In [8]:
test_df.describe()

Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
count,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,...,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0
mean,57737.87,62587.26,103675.2,62898.53,67133.54,80838.79,61810.14,55157.52,1406324.0,81286.68,...,119391.0,135595.5,324221.7,143785.6,93023.67,80471.45,60768.65,132321.0,167576.6,128248.7
std,1745182.0,2322787.0,2586951.0,2765941.0,3206124.0,2845031.0,2780137.0,1923517.0,6872366.0,2378938.0,...,3115190.0,2598454.0,3782996.0,3663374.0,5041000.0,2100210.0,2040655.0,3592018.0,3761816.0,2413798.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,150444700.0,228329500.0,275817100.0,397262100.0,466759100.0,285222300.0,486375100.0,204329000.0,343565800.0,231016700.0,...,535169200.0,123654700.0,379339800.0,402548000.0,965753000.0,168006500.0,249791300.0,320000000.0,318630000.0,218978200.0


### Check for missing values

In [9]:
train_df.columns[train_df.isnull().sum() != 0].size

0

In [10]:
test_df.columns[test_df.isnull().sum() != 0].size

0

### Remove Constant features

In [11]:
feats_counts = train_df.nunique(dropna = False)
const_cols = feats_counts.loc[feats_counts==1].index.tolist()

In [12]:
# remove constant features in the train and test set
train_df.drop(const_cols, axis=1, inplace=True)
test_df.drop(const_cols, axis=1, inplace=True) 

#print("Removed `{}` Constant Columns\n".format(len(const_cols)))
#print(const_cols)

### Remove Duplicated features

In [13]:
# Check and remove duplicate columns
colsToRemove = []
colsScaned = []
dupList = {}

columns = train_df.columns

for i in range(len(columns)-1):
    v = train_df[columns[i]].values
    dupCols = []
    for j in range(i+1,len(columns)):
        if np.array_equal(v, train_df[columns[j]].values):
            colsToRemove.append(columns[j])
            if columns[j] not in colsScaned:
                dupCols.append(columns[j]) 
                colsScaned.append(columns[j])
                dupList[columns[i]] = dupCols
                
# remove duplicate columns in the training set
train_df.drop(colsToRemove, axis=1, inplace=True) 

# remove duplicate columns in the testing set
test_df.drop(colsToRemove, axis=1, inplace=True)

print("Removed `{}` Duplicate Columns\n".format(len(dupList)))
print(dupList)

Removed `4` Duplicate Columns

{'34ceb0081': ['d60ddde1b'], '8d57e2749': ['acc5b709d', 'f333a5f60'], '168b3e5bc': ['f8d75792f'], 'a765da8bc': ['912836770']}


In [14]:
# saving them, as it takes long time to find them.

#import _pickle as pickle
#pickle.dump(dupList, open('dup_cols.p', 'wb'))

In [15]:
gc.collect()


7

### Preparing Data, Generating new features using aggregation and dimensionality reduction

In [16]:
X_train = train_df.drop(["ID", "target"], axis=1)
X_train_std = StandardScaler().fit_transform(X_train)
y_train = np.log1p(train_df["target"].values)

X_test = test_df.drop(["ID"], axis=1)
X_test_std = StandardScaler().fit_transform(X_test)

In [17]:
## Applying K-Means for clustering

for ncl in range(2,11):
    cls = KMeans(n_clusters=ncl)
    cls.fit_predict(X_train)
    X_train['kmeans_'+str(ncl)] = cls.predict(X_train)
    X_test['kmeans_'+str(ncl)] = cls.predict(X_test)

In [18]:
## New features using statistical Aggregation

X_train['Me_an']   = X_train.mean(axis=1)
X_train['Med_ian'] = X_train.median(axis=1)
X_train['Mo_de']   = X_train.mode(axis=1)
X_train['Ma_x']    = X_train.max(axis=1)
X_train['Va_r']    = X_train.var(axis=1)
X_train['St_d']    = X_train.std(axis=1)
        
X_test['Me_an']   = X_test.mean(axis=1)
X_test['Med_ian'] = X_test.median(axis=1)
X_test['Mo_de']   = X_test.mode(axis=1)
X_test['Ma_x']    = X_test.max(axis=1)
X_test['Va_r']    = X_test.var(axis=1)
X_test['St_d']    = X_test.std(axis=1)

In [19]:
#Get number of componenets for dimension reduction using pca
def _get_number_components(model, threshold):
    component_variance = model.explained_variance_ratio_
    explained_variance = 0.0
    components = 0

    for var in component_variance:
        explained_variance += var
        components += 1
        if(explained_variance >= threshold):
            break
    return components

### Get the optimal number of components
pca = PCA()
train_pca = pca.fit_transform(X_train_std)
components = _get_number_components(pca, threshold=0.90)

In [20]:
## Apply PCA

pca = PCA(n_components = components)
Xtrain_pca = pca.fit_transform(X_train_std)
Xtest_pca = pca.transform(X_test_std)

In [21]:
## Apply Truncated SVD 

tsvd = TruncatedSVD(n_components = components)
Xtrain_svd = tsvd.fit_transform(X_train_std)
Xtest_svd = tsvd.transform(X_test_std)

In [22]:
## Add the decomposed features (pca and svd) in the train & test dataset

def _add_decomposition(df, decomp, ncomp, flag):
    for i in range(1, ncomp+1):
        df[flag+"_"+str(i)] = decomp[:, i - 1]

_add_decomposition(X_train, Xtrain_pca, 1000, 'pca')
_add_decomposition(X_train, Xtrain_svd, 1000, 'svd')


_add_decomposition(X_test, Xtest_pca, 1000, 'pca')
_add_decomposition(X_test, Xtest_svd, 1000, 'svd')

In [23]:
## create the lists of decomposed and non decomposed features 

orig_features = [x for x in X_train.columns if "_" not in x]
decom_features = [x for x in X_train.columns if "_" in x]

## Split into train and validation data
dev_X, val_X, dev_y, val_y = train_test_split(X_train, y_train, test_size = 0.30, random_state = 42)

In [24]:
## Find important original features using Random Forests 

model1 = RandomForestRegressor(n_jobs=-1, random_state=42)
model1.fit(X_train[orig_features], y_train)
importances = model1.feature_importances_

## get list of important features 
orig_importances_df = pd.DataFrame({'importance': importances, 'feature': orig_features})
orig_importances_df = orig_importances_df.sort_values(by=['importance'], ascending=[False])
orig_important_features = orig_importances_df[:750]['feature'].values.tolist()

In [25]:
## Find important decomposed features using Random Forests 

model2 = RandomForestRegressor(n_jobs=-1, random_state=42)
model2.fit(X_train[decom_features], y_train)
importances = model2.feature_importances_

## get list of important features 
decom_importances_df = pd.DataFrame({'importance': importances, 'feature': decom_features})
decom_importances_df = decom_importances_df.sort_values(by=['importance'], ascending=[False])
decom_important_features = decom_importances_df[:750]['feature'].values.tolist()

In [26]:
## Group important features

important_features = orig_important_features + decom_important_features

## Training with LightGBM

In [27]:
## Create model with important features

def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    lgtrain = lgb.Dataset(train_X[important_features], label=train_y)
    lgval = lgb.Dataset(val_X[important_features], label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 5000, valid_sets=[lgtrain, lgval], early_stopping_rounds=100, verbose_eval=100, 
                      evals_result=evals_result)
    
    pred_test_y = np.expm1(model.predict(test_X[important_features], num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

In [28]:
# Training LGB
pred_test_lgb, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
print("LightGBM Training Completed...")

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.48623	valid_1's rmse: 1.57442
[200]	training's rmse: 1.29999	valid_1's rmse: 1.50909
[300]	training's rmse: 1.15956	valid_1's rmse: 1.47725
[400]	training's rmse: 1.04527	valid_1's rmse: 1.45853
[500]	training's rmse: 0.94783	valid_1's rmse: 1.44767
[600]	training's rmse: 0.86375	valid_1's rmse: 1.44139
[700]	training's rmse: 0.790464	valid_1's rmse: 1.43771
[800]	training's rmse: 0.72567	valid_1's rmse: 1.4359
[900]	training's rmse: 0.667261	valid_1's rmse: 1.43472
[1000]	training's rmse: 0.614677	valid_1's rmse: 1.43439
[1100]	training's rmse: 0.567213	valid_1's rmse: 1.43392
Early stopping, best iteration is:
[1099]	training's rmse: 0.567649	valid_1's rmse: 1.43391
LightGBM Training Completed...


## Training With XGBoost

In [29]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {'t',
          'eta': 0.005,
          'max_depth': 15, 
          'subsample': 0.7, 
          'colsample_bytree': 0.5,
          'alpha':0,
          'random_state': 42, 
          'silent': True}
    
    tr_data = xgb.DMatrix(train_X[important_features], train_y)
    va_data = xgb.DMatrix(val_X[important_features], val_y)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 30, verbose_eval=100)
    
    dtest = xgb.DMatrix(test_X[important_features])
    xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb

In [30]:
# Training XGB
pred_test_xgb, model_xgb = run_xgb(dev_X, dev_y, val_X, val_y, X_test)
print("XGB Training Completed...")

[0]	train-rmse:14.0349	valid-rmse:14.0181
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 30 rounds.
[100]	train-rmse:8.60185	valid-rmse:8.587
[200]	train-rmse:5.3413	valid-rmse:5.36201
[300]	train-rmse:3.37948	valid-rmse:3.49674
[400]	train-rmse:2.17954	valid-rmse:2.4659
[500]	train-rmse:1.43047	valid-rmse:1.9266
[600]	train-rmse:0.955689	valid-rmse:1.66603
[700]	train-rmse:0.650107	valid-rmse:1.54635
[800]	train-rmse:0.450218	valid-rmse:1.49291
[900]	train-rmse:0.318528	valid-rmse:1.46836
[1000]	train-rmse:0.229327	valid-rmse:1.45683
[1100]	train-rmse:0.168969	valid-rmse:1.45166
[1200]	train-rmse:0.127006	valid-rmse:1.4489
[1300]	train-rmse:0.096771	valid-rmse:1.44729
[1400]	train-rmse:0.075007	valid-rmse:1.4465
[1500]	train-rmse:0.058802	valid-rmse:1.44599
[1600]	train-rmse:0.046728	valid-rmse:1.44565
[1700]	train-rmse:0.037689	valid-rmse:1.44543
[1800]	train-rmse:0.030641	valid-rmse:1.44535
[1900]

In [31]:
#pred_test = pred_test_xgb
pred_test = (pred_test_lgb + pred_test_xgb)/2

In [32]:
submission = pd.DataFrame({ 'ID': test_df.ID,
                            'target': pred_test})

In [33]:
submission.to_csv("submission.csv", index=False)