In [361]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_curve,auc,roc_auc_score
from sklearn.metrics import average_precision_score

In [362]:
#set display option for rows & columns
pd.set_option( 'display.max_rows', None)
pd.set_option('display.max_columns', None )

In [363]:
train = pd.read_csv('data/train_afterEDA.csv')

test = pd.read_csv('data/test_afterEDA.csv')

## D Features

Negative D features should be removed

In [364]:
D_features = ['D1','D2','D3','D4','D5','D6','D7','D8','D9','D10','D11','D12','D13','D14','D15']
total_negative_count = 0

for feature in D_features:
    # fill NaNs with a large positive number to avoid false negatives
    negative_indices = train[train[feature].fillna(0) < 0.0].index
    count_neg = len(negative_indices)
    total_negative_count += count_neg

    if count_neg > 0:
        print(f"{feature} has {count_neg} negative values at indices:\n{negative_indices.tolist()}\n")

print(f"Total negative entries across all D features: {total_negative_count}")


D4 has 15 negative values at indices:
[2947, 4210, 5264, 5501, 6057, 9120, 110692, 268149, 444539, 444543, 445567, 446475, 455648, 456213, 473385]

D6 has 3 negative values at indices:
[5501, 268149, 473385]

D11 has 7 negative values at indices:
[2947, 3814, 4932, 358329, 359690, 442479, 456213]

D12 has 2 negative values at indices:
[5501, 473385]

D14 has 3 negative values at indices:
[4085, 4097, 5501]

D15 has 15 negative values at indices:
[2947, 3034, 4932, 5501, 6057, 7589, 13149, 268149, 359690, 442435, 442458, 442479, 455648, 456213, 473385]

Total negative entries across all D features: 45


In [365]:
D_features = ['D4', 'D6', 'D11', 'D12', 'D14', 'D15']

# Create a boolean mask for any negative values across the selected D features
negative_mask = (train[D_features] < 0.0).any(axis=1)

# Drop all rows with negative values in any of the specified D features
train = train[~negative_mask]

print("Data points after removing negative D features:", train.shape[0])



Data points after removing negative D features: 590494


## M Features

In [366]:
train.drop(["M1"], axis=1, inplace=True)
test.drop(["M1"], axis=1, inplace=True)

train.drop(["M2"], axis=1, inplace=True)
test.drop(["M2"], axis=1, inplace=True)

train.drop(["M7"], axis=1, inplace=True)
test.drop(["M7"], axis=1, inplace=True)



## V Features

Remove V features using LGBM 

In [367]:
vfeatures=[]
for features in train.columns:
  if features.startswith("V"):
    vfeatures.append(features)
train_v = train[vfeatures]    

In [368]:
for features in vfeatures:
  train_v[features].fillna(train_v[features].median(), inplace=True)

  

In [369]:
V_x = train_v[:int(train.shape[0]*0.80)]
V_cv = train_v[int(train.shape[0]*0.80):]

train_y = train["isFraud"].values[:int(train.shape[0]*0.80)]
cv_y = train["isFraud"].values[int(train.shape[0]*0.80):]

In [370]:
clf = LGBMClassifier()
clf.fit( V_x, train_y)
print("Train AUC :",roc_auc_score(train_y, clf.predict_proba(V_x)[:,1]))
print("CV AUC :",roc_auc_score(cv_y, clf.predict_proba(V_cv)[:,1]))


#clf = LGBMClassifier()
#clf.fit(V_x, train_y)

# Predict probabilities
#train_probs = clf.predict_proba(V_x)[:, 1]
#cv_probs = clf.predict_proba(V_cv)[:, 1]

# AUPRC scores
#print("Train AUPRC:", average_precision_score(train_y, train_probs))
#print("CV AUPRC:", average_precision_score(cv_y, cv_probs))

[LightGBM] [Info] Number of positive: 16598, number of negative: 455797
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23171
[LightGBM] [Info] Number of data points in the train set: 472395, number of used features: 338
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035136 -> initscore=-3.312765
[LightGBM] [Info] Start training from score -3.312765
Train AUC : 0.8791281346333822
CV AUC : 0.8475793700148698


In [371]:

Vremove=[]
for j,i in enumerate(clf.feature_importances_):
  print(i)
  if i<20:
    Vremove.append(vfeatures[j])  
len(Vremove)  

0
3
11
12
7
3
0
1
2
0
0
20
27
0
1
0
2
1
18
30
0
1
6
14
2
10
0
0
2
5
0
1
7
12
15
32
10
32
2
9
0
5
2
24
30
1
12
3
26
4
7
9
54
19
17
23
3
11
3
3
16
26
4
11
0
1
30
0
9
15
2
2
7
18
19
26
17
18
7
6
20
27
35
1
11
12
31
0
0
4
14
3
1
17
7
23
1
0
25
3
4
12
7
0
11
0
0
3
12
0
0
7
0
0
11
0
0
0
0
0
0
0
4
17
2
13
16
3
9
40
24
6
16
2
1
9
5
2
28
7
2
0
11
2
5
3
9
0
9
8
4
25
0
1
0
19
5
15
15
18
1
8
4
6
20
3
0
0
16
4
14
3
2
3
1
1
0
13
2
6
2
2
0
3
0
0
28
7
11
2
0
2
3
1
0
0
2
4
0
14
6
0
12
4
6
8
11
14
25
6
1
11
3
7
7
2
4
1
10
7
13
9
1
19
1
7
7
0
0
3
0
6
2
9
1
1
1
4
3
2
0
2
3
1
7
2
1
1
0
0
5
1
2
1
0
8
5
25
7
7
21
7
4
8
4
19
18
8
5
9
10
2
3
7
2
6
10
6
7
3
34
47
46
2
64
2
7
5
17
5
31
6
4
29
2
19
1
2
2
3
6
1
29
1
0
15
51
20
20
55
17
44
69
24
51
9
27
14
2
46
5
2
4
1
0
3
3
0
8
0
5
5
5
1
13
10
0
1
1


294

In [372]:
Vremove

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V37',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V46',
 'V47',
 'V48',
 'V50',
 'V51',
 'V52',
 'V54',
 'V55',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V63',
 'V64',
 'V65',
 'V66',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V77',
 'V78',
 'V79',
 'V80',
 'V84',
 'V85',
 'V86',
 'V88',
 'V89',
 'V90',
 'V91',
 'V92',
 'V93',
 'V94',
 'V95',
 'V97',
 'V98',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V126',
 'V127',
 'V128',
 'V129',
 'V132',
 'V133',
 'V134',
 'V135',
 'V136',
 'V137',
 'V138',
 'V140',
 'V141',
 'V142',
 'V143',


In [373]:
#dropping above listed V features
print(len(vfeatures))
for v in Vremove:
    vfeatures.remove(v)
print(len(vfeatures))    

339
45


In [374]:
train.drop(Vremove,axis=1,inplace=True)
test.drop(Vremove, axis=1,inplace=True)

In [375]:
train.shape

(590494, 137)

## Id Features

In [376]:
id_train_df = train[['id_01','id_02','id_03','id_04','id_05','id_06','id_07','id_08','id_09','id_10','id_11','id_12','id_13','id_14','id_15','id_16','id_17','id_18','id_19','id_20','id_21','id_22','id_23','id_24','id_25','id_26','id_27','id_28','id_29','id_30','id_31','id_32','id_33','id_34','id_35','id_36','id_37','id_38']]

In [377]:
percent_missing = id_train_df.isnull().sum() * 100 / len(id_train_df)
missing_value_df = pd.DataFrame({'column_name': id_train_df.columns,
                                 'percent_missing': percent_missing})

In [378]:
missing_value_df.sort_values('percent_missing', inplace=True)

In [379]:
missing_value_df

Unnamed: 0,column_name,percent_missing
id_01,id_01,75.576043
id_12,id_12,75.576043
id_36,id_36,76.126091
id_35,id_35,76.126091
id_37,id_37,76.126091
id_15,id_15,76.126091
id_38,id_38,76.126091
id_29,id_29,76.127276
id_11,id_11,76.127276
id_28,id_28,76.127276


Apart from the important features which contribute in classification, I will remove all the columns with more that 80% of missing values from train and similarly for test dataset.

In [380]:
train.drop(["id_03","id_04","id_07","id_08","id_09","id_10","id_14","id_18","id_21","id_22","id_23","id_24","id_25","id_26","id_27"], axis = 1, inplace=True)
#test.drop(["id_03","id_04","id_07","id_08","id_09","id_10","id_14","id_18","id_21","id_22","id_23","id_24","id_25","id_26","id_27"], axis = 1, inplace=True)

In [381]:
print(f"Shape of training data after feature engineering: {train.shape}")


Shape of training data after feature engineering: (590494, 122)


In [382]:
train.to_csv("data/train_afterFE.csv", index = False)
test.to_csv("data/test_afterFE.csv", index = False)