# Transacton Value Prediction

## Steps

### Data Load and Preprocess

- 1) load_train_fs : Load Training data
- 2) load_test_fs  : Load Test Data
- 3) train_type    : Process Training data
- 4) test_type	 : Process Test data

### Feature Selection
- 5) getTopFeatures : Get Top 100 Features

### Feature Extraction
- 6) feature_minus_pair_list
- 7) feature_plus_pair_list
- 8) feature_mul_pair_list
- 9) feature_divide_pair_list
- 10) feature_pair_sub_mul_list

- 11) Generate Labels: toLabels()

### Modeling and Evaluation
- 12) Fit Model : GradientBoostingClassifier()
- 13) Predict : gbc_svr_predict2()
- 14) Evaluate: get_evaluation_matrices()

In [92]:
# Standard Imports
import warnings
warnings.filterwarnings('ignore')
#import os
import numpy as np
#import scipy.stats as stats
import pandas as pd

In [93]:
df_data = pd.read_csv('train.csv')
df_data.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [94]:
df_data.shape

(4459, 4993)

In [95]:
# Missing Value Check
df_data.columns[df_data.isnull().any()]

Index([], dtype='object')

In [96]:
df_dataY = df_data[['target']]
df_dataY.head()

Unnamed: 0,target
0,38000000.0
1,600000.0
2,10000000.0
3,2000000.0
4,14400000.0


In [97]:
df_dataX = df_data.iloc[:,2:]
df_dataX.shape

(4459, 4991)

In [98]:
df_dataX.head(10)

Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,0.0,0,0.0,0,0,0,0,0,2200000.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,0.0,0,0.0,0,0,0,0,0,2000000.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
5,0.0,0,0.0,0,0,0,0,0,17020000.0,0.0,...,12000.0,5600000.0,20000000.0,0,0,0,0,0,0,11000
6,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,40000,0,0,0
7,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
8,0.0,0,0.0,0,0,0,0,0,58000.0,0.0,...,0.0,0.0,4000000.0,0,0,0,0,0,0,0
9,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,400000


## Preprocessing
#### Log Transformation

In [99]:
# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# Instantiate PowerTransformer
pow_trans = PowerTransformer()

# Train the transform on the data
#pow_trans.fit(so_numeric_df[['ConvertedSalary']])
pow_trans.fit(df_dataX)

# Apply the power transform to the data
#so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(so_numeric_df[['ConvertedSalary']])
df_dataX_log = pow_trans.transform(df_dataX)

In [100]:
df_dataX_log.shape

(4459, 4991)

#### Standardization

In [108]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Instantiate StandardScaler
SS_scaler = StandardScaler()

# Fit SS_scaler to the data
SS_scaler.fit(df_dataX_log)

# Transform the data using the fitted scaler
arr_dataX_std = SS_scaler.transform(df_dataX_log)

In [109]:
# Convert Array to Dataframe
df_dataX_std = pd.DataFrame(arr_dataX_std, columns=df_dataX.columns)

# Compare the origional and transformed column
df_dataX_std.head()

Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,-0.093934,-0.029964,-0.086348,-0.021183,-0.033505,-0.103212,-0.033505,-0.033505,-0.708546,-0.168426,...,-0.26423,-0.20805,-0.428967,-0.196025,-0.014977,-0.088946,-0.063664,-0.087656,-0.135163,-0.194788
1,-0.093934,-0.029964,-0.086348,-0.021183,-0.033505,-0.103212,-0.033505,-0.033505,1.428267,-0.168426,...,-0.26423,-0.20805,-0.428967,-0.196025,-0.014977,-0.088946,-0.063664,-0.087656,-0.135163,-0.194788
2,-0.093934,-0.029964,-0.086348,-0.021183,-0.033505,-0.103212,-0.033505,-0.033505,-0.708546,-0.168426,...,-0.26423,-0.20805,-0.428967,-0.196025,-0.014977,-0.088946,-0.063664,-0.087656,-0.135163,-0.194788
3,-0.093934,-0.029964,-0.086348,-0.021183,-0.033505,-0.103212,-0.033505,-0.033505,-0.708546,-0.168426,...,-0.26423,-0.20805,-0.428967,-0.196025,-0.014977,-0.088946,-0.063664,-0.087656,-0.135163,-0.194788
4,-0.093934,-0.029964,-0.086348,-0.021183,-0.033505,-0.103212,-0.033505,-0.033505,1.424326,-0.168426,...,-0.26423,-0.20805,-0.428967,-0.196025,-0.014977,-0.088946,-0.063664,-0.087656,-0.135163,-0.194788


## Feature Selection

#### Feature with No variance

In [110]:
df_descr = df_dataX_std.describe().transpose()
df_descr.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
48df886f9,4459.0,-1.977873e-16,1.000112,-0.093934,-0.093934,-0.093934,-0.093934,10.645813
0deb4b6a8,4459.0,-8.870860000000001e-17,1.000112,-0.029964,-0.029964,-0.029964,-0.029964,33.372893
34b15f335,4459.0,7.21452e-16,1.000112,-0.086348,-0.086348,-0.086348,-0.086348,11.581071
a8cb14b00,4459.0,-1.810127e-15,1.000112,-0.021183,-0.021183,-0.021183,-0.021183,47.206991
2f0771a37,4459.0,-3.220618e-16,1.000112,-0.033505,-0.033505,-0.033505,-0.033505,29.846273


In [111]:
idxNoVar = df_descr[df_descr['std'] == 0].index
idxNoVar

Index(['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee',
       '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca',
       ...
       'd196ca1fd', 'a8e562e8e', 'eb6bb7ce1', '5beff147e', '52b347cdc',
       '4600aadcf', '6fa0b9dab', '43d70cc4d', '408021ef8', 'e29d22b59'],
      dtype='object', length=256)

In [113]:
df_dataX_std.drop(idxNoVar, axis=1, inplace=True)
df_dataX_std.shape

(4459, 4735)

#### Correlated Features

In [114]:
# Correlation Matrix (Positive correlation matrix)
df_corr = df_dataX_std.corr().abs()
df_corr

Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
48df886f9,1.000000,0.002815,0.104296,0.001990,0.003147,0.009695,0.003147,0.140787,0.031824,0.013591,...,0.053194,0.007465,0.025959,0.018413,0.159444,0.008355,0.005980,0.008234,0.023594,0.018297
0deb4b6a8,0.002815,1.000000,0.002587,0.000635,0.001004,0.003093,0.222829,0.001004,0.005410,0.005047,...,0.007918,0.006234,0.012854,0.005874,0.000449,0.002665,0.001908,0.083490,0.004050,0.005837
34b15f335,0.104296,0.002587,1.000000,0.001829,0.002893,0.008912,0.002893,0.002893,0.016941,0.017410,...,0.051343,0.017965,0.006322,0.016926,0.001293,0.007680,0.005497,0.022511,0.011671,0.002877
a8cb14b00,0.001990,0.000635,0.001829,1.000000,0.000710,0.002186,0.000710,0.000710,0.015009,0.003568,...,0.005597,0.004407,0.009087,0.004152,0.000317,0.001884,0.001349,0.001857,0.002863,0.004126
2f0771a37,0.003147,0.001004,0.002893,0.000710,1.000000,0.003458,0.001123,0.001123,0.023740,0.005643,...,0.008853,0.006971,0.014373,0.006568,0.000502,0.002980,0.002133,0.002937,0.004529,0.006526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71b203550,0.008355,0.002665,0.007680,0.001884,0.002980,0.040589,0.002980,0.002980,0.023810,0.016053,...,0.027944,0.045213,0.060022,0.144113,0.001332,1.000000,0.034417,0.021418,0.026269,0.172257
137efaa80,0.005980,0.001908,0.005497,0.001349,0.002133,0.028062,0.002133,0.002133,0.050227,0.032468,...,0.112061,0.093171,0.001989,0.006257,0.000954,0.034417,1.000000,0.035079,0.018041,0.025292
fb36b89d9,0.008234,0.083490,0.022511,0.001857,0.002937,0.009047,0.002937,0.002937,0.030728,0.142651,...,0.133414,0.020548,0.062134,0.003525,0.001313,0.021418,0.035079,1.000000,0.011848,0.017074
7e293fbaf,0.023594,0.004050,0.011671,0.002863,0.004529,0.013950,0.004529,0.004529,0.057755,0.173238,...,0.135302,0.192159,0.166116,0.331512,0.110808,0.026269,0.018041,0.011848,1.000000,0.198743


In [115]:
# Create a True/False mask and apply it (we will eliminate one half of the correlation matrix)
mask = np.triu(np.ones_like(df_corr, dtype=bool))
mask

array([[ True,  True,  True, ...,  True,  True,  True],
       [False,  True,  True, ...,  True,  True,  True],
       [False, False,  True, ...,  True,  True,  True],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ..., False,  True,  True],
       [False, False, False, ..., False, False,  True]])

In [116]:
# Get only True values
df_corr_mask = df_corr.mask(mask)
df_corr_mask

Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
48df886f9,,,,,,,,,,,...,,,,,,,,,,
0deb4b6a8,0.002815,,,,,,,,,,...,,,,,,,,,,
34b15f335,0.104296,0.002587,,,,,,,,,...,,,,,,,,,,
a8cb14b00,0.001990,0.000635,0.001829,,,,,,,,...,,,,,,,,,,
2f0771a37,0.003147,0.001004,0.002893,0.000710,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71b203550,0.008355,0.002665,0.007680,0.001884,0.002980,0.040589,0.002980,0.002980,0.023810,0.016053,...,0.027944,0.045213,0.060022,0.144113,0.001332,,,,,
137efaa80,0.005980,0.001908,0.005497,0.001349,0.002133,0.028062,0.002133,0.002133,0.050227,0.032468,...,0.112061,0.093171,0.001989,0.006257,0.000954,0.034417,,,,
fb36b89d9,0.008234,0.083490,0.022511,0.001857,0.002937,0.009047,0.002937,0.002937,0.030728,0.142651,...,0.133414,0.020548,0.062134,0.003525,0.001313,0.021418,0.035079,,,
7e293fbaf,0.023594,0.004050,0.011671,0.002863,0.004529,0.013950,0.004529,0.004529,0.057755,0.173238,...,0.135302,0.192159,0.166116,0.331512,0.110808,0.026269,0.018041,0.011848,,


In [117]:
# List column names of highly correlated features (r > 0.95)
lst_to_drop = [c for c in df_corr_mask.columns if any(df_corr_mask[c] > 0.95)]
#lst_to_drop

In [118]:
len(lst_to_drop)

47

In [119]:
# Drop the features in the to_drop list
df_x_reduced = df_dataX_std.drop(lst_to_drop, axis=1)

print("The reduced_df dataframe has {} columns".format(df_x_reduced.shape[1]))

The reduced_df dataframe has 4688 columns


#### Model Based Feature Selection
**Refer next version notebook for this implementation**

## Feature Extraction

## Modeling
#### Train Test Split

In [124]:
from sklearn.model_selection import train_test_split

In [125]:
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(df_x_reduced, df_dataY, test_size=.3, random_state=123)

#### Model-1 : Linear Regression

In [132]:
from sklearn.linear_model import LinearRegression

In [133]:
lr = LinearRegression()

In [134]:
lr.fit(df_x_train, df_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [135]:
lr_preds = lr.predict(df_x_test)

#### Model-2 : GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [126]:
gbr = GradientBoostingRegressor()

In [127]:
gbr.fit(df_x_train, df_y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [128]:
gbr_preds = gbr.predict(df_x_test)

## Evaluation

In [129]:
# Import performance measurement functions
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [136]:
# Calculate metrics for testing data
rmse_test = np.sqrt(mean_squared_error(df_y_test, lr_preds))
mae_test = mean_absolute_error(df_y_test, lr_preds)

# Print performance metrics
print('RMSE test: {:.3f}\n MAE test: {:.3f}'.format(
rmse_test, mae_test))

RMSE test: 3012650838933101.500
 MAE test: 1563746922923765.750


In [137]:
# Calculate metrics for testing data
rmse_test = np.sqrt(mean_squared_error(df_y_test, gbr_preds))
mae_test = mean_absolute_error(df_y_test, gbr_preds)

# Print performance metrics
print('RMSE test: {:.3f}\n MAE test: {:.3f}'.format(
rmse_test, mae_test))

RMSE test: 7023310.997
 MAE test: 4949476.256


# Misc

In [140]:
df_y_test.describe()

Unnamed: 0,target
count,1338.0
mean,5914495.0
std,8042653.0
min,30000.0
25%,600000.0
50%,2400000.0
75%,8000000.0
max,40000000.0


In [87]:
6000000
7023310
40000000

array([4, 3, 1, 0, 5, 6, 2])