# PCA for aggregate features

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
df_trn = pd.read_csv('../data/train-agg.csv')
df_tst = pd.read_csv('../data/test-agg.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [3]:
df_trn['AmountPositive'] = df_trn['Amount'].apply(lambda x: x if x > 0 else 0)
df_trn['AmountNegative'] = df_trn['Amount'].apply(lambda x: x if x < 0 else 0)

df_tst['AmountPositive'] = df_tst['Amount'].apply(lambda x: x if x > 0 else 0)
df_tst['AmountNegative'] = df_tst['Amount'].apply(lambda x: x if x < 0 else 0)

In [4]:
columns4drop = [
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'CurrencyCode',
    'CountryCode',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'Amount',
    'TransactionStartTime',
    'PricingStrategy'
]

In [5]:
df_trn_cut = df_trn.drop(columns=columns4drop, axis=1)
df_tst_cut = df_tst.drop(columns=columns4drop, axis=1)

In [6]:
df_trn_cut.to_csv('../data/train-agg-cut.csv', encoding='utf-8', index=False)
df_tst_cut.to_csv('../data/test-agg-cut.csv',  encoding='utf-8', index=False)

In [11]:
X_trn = df_trn_cut.drop(columns=['FraudResult'], axis=1)
y_trn = df_trn_cut['FraudResult']

X_tst = df_tst_cut

In [25]:
from sklearn.ensemble import BaggingClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

In [26]:
for n_components in range(2, 32, 2):
    X_pca_trn = PCA(n_components=n_components).fit(X_trn).transform(X_trn)
    BaggingClassifier().fit(X_pca_trn, y_trn)
    cv_results = cross_validate(BaggingClassifier(), X_pca_trn, y_trn,
                                scoring=('precision', 'recall', 'f1'), cv=3, n_jobs=-1)
    print('PCA(%2i): f1-score=%6.4f' % (n_components, cv_results['test_f1'].mean()))

PCA( 2): f1-score=0.0025
PCA( 4): f1-score=0.0077
PCA( 6): f1-score=0.0110
PCA( 8): f1-score=0.0017
PCA(10): f1-score=0.0019
PCA(12): f1-score=0.0084
PCA(14): f1-score=0.0011
PCA(16): f1-score=0.0008
PCA(18): f1-score=0.0056
PCA(20): f1-score=0.0019
PCA(22): f1-score=0.0029
PCA(24): f1-score=0.0031
PCA(26): f1-score=0.0054
PCA(28): f1-score=0.0020
PCA(30): f1-score=0.0062


In [38]:
for n_components in range(32, 256, 2):
    X_pca_trn = PCA(n_components=n_components).fit(X_trn).transform(X_trn)
    BaggingClassifier().fit(X_pca_trn, y_trn)
    cv_results = cross_validate(BaggingClassifier(), X_pca_trn, y_trn,
                                scoring=('precision', 'recall', 'f1'), cv=3, n_jobs=-1)
    print('PCA(%2i): f1-score=%6.4f' % (n_components, cv_results['test_f1'].mean()))

PCA(32): f1-score=0.0013
PCA(34): f1-score=0.0060
PCA(36): f1-score=0.0031
PCA(38): f1-score=0.0020
PCA(40): f1-score=0.0046
PCA(42): f1-score=0.0109
PCA(44): f1-score=0.0034
PCA(46): f1-score=0.0115
PCA(48): f1-score=0.0253
PCA(50): f1-score=0.0009
PCA(52): f1-score=0.0018
PCA(54): f1-score=0.0046
PCA(56): f1-score=0.0008
PCA(58): f1-score=0.0009
PCA(60): f1-score=0.0005
PCA(62): f1-score=0.0019
PCA(64): f1-score=0.0062
PCA(66): f1-score=0.0015
PCA(68): f1-score=0.0008
PCA(70): f1-score=0.0018
PCA(72): f1-score=0.0007
PCA(74): f1-score=0.0007
PCA(76): f1-score=0.0005
PCA(78): f1-score=0.0018
PCA(80): f1-score=0.0081
PCA(82): f1-score=0.0021
PCA(84): f1-score=0.0062
PCA(86): f1-score=0.0189
PCA(88): f1-score=0.0120
PCA(90): f1-score=0.0114
PCA(92): f1-score=0.0016
PCA(94): f1-score=0.0028
PCA(96): f1-score=0.2327
PCA(98): f1-score=0.2518
PCA(100): f1-score=0.2742
PCA(102): f1-score=0.3273
PCA(104): f1-score=0.2516
PCA(106): f1-score=0.2182
PCA(108): f1-score=0.3010
PCA(110): f1-score=0

In [28]:
X_trn.shape

(95662, 498)

In [35]:
pca = PCA(n_components=32)
X_pca_trn = pca.fit(X_trn).transform(X_trn)

In [36]:
np.shape(X_pca_trn)

(95662, 32)

In [37]:
X_inv = pca.inverse_transform(X_pca_trn)
np.shape(X_inv)

(95662, 498)

In [None]:
pca = PCA(n_components=1)
pca.fit(X)
X_pca = pca.transform(X)
print("original shape:   ", X.shape)
print("transformed shape:", X_pca.shape)