# Plot Decision Tree

- Original Data
- Number Encoding
- Label Encoding
- One Hot Encoding
- Probability Encoding
- 5 'super features' from top20 $\chi^2$
- 18 first of the top20 $\chi^2$

In [1]:
import numpy  as np
import pandas as pd

In [2]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

In [3]:
def dtc_fit():
    clf = DecisionTreeClassifier(
        criterion='entropy', 
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None
    #     max_leaf_nodes=None  # Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.
    ).fit(X_trn, y_trn)
        
    return clf

In [4]:
def dtc_fit_and_plot():
    plt.figure(figsize=(72,36))
    clf = DecisionTreeClassifier(
        criterion='entropy', 
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None
    #     max_leaf_nodes=None  # Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.
    ).fit(X_trn, y_trn)
    plot_tree(clf, filled=True)
    plt.show()

In [5]:
def feature_importances_print(clf):
    # Calculate feature importances
    importances = clf.feature_importances_
    
    # Sort feature importances in descending order
    indices = np.argsort(importances)[::-1]

    # Rearrange feature names so they match the sorted feature importances
    names = [X_trn.columns[i] for i in indices]

    # Print the feature ranking
    print("Feature ranking:")

    for i in range(X_trn.shape[1]):
        if importances[indices[i]] > 0.0:
            print("%3d. %30s: %.10f" % (i+1, names[i], importances[indices[i]]))

In [6]:
def feature_importances_print_and_plot(clf):
    # Calculate feature importances
    importances = clf.feature_importances_
    
    # Sort feature importances in descending order
    indices = np.argsort(importances)[::-1]

    # Rearrange feature names so they match the sorted feature importances
    names = [X_trn.columns[i] for i in indices]

    # Print the feature ranking
    print("Feature ranking:")

    for i in range(X_trn.shape[1]):
        if importances[indices[i]] > 0.0:
            print("%3d. %30s: %.10f" % (i+1, names[i], importances[indices[i]]))
            
    # Create plot
    plt.figure(figsize=(12,8))

    # Create plot title
    plt.title("Feature Importance")

    # Add bars
    plt.bar(range(X_trn.shape[1]), importances[indices])

    # Add feature names as x-axis labels
    plt.xticks(range(X_trn.shape[1]), names, rotation=90)

    # Show plot
    plt.show()

In [7]:
import graphviz
from sklearn import tree

In [8]:
def dtc_to_pdf(clf, f_name):
    dot_data = tree.export_graphviz(
        clf, 
        out_file=None,
        feature_names=X_trn.columns,
        filled=True,
        rounded=True,
        special_characters=True
    )
    graph = graphviz.Source(dot_data)
    graph.render('../myNotebooks/DecisionTrees/' + f_name)

In [9]:
import os
from collections import Counter

In [10]:
# определяем был ли ранее точно такой же результат
def is_predict_exist(df):
    current_subm_set = set(df[df['FraudResult'] == 1].index.tolist())

    # просматриваем все файлы в папке submitted
    is_exist = False
    files = os.listdir('../submitted')
    files.sort()
    for f in files:
        f_csv = pd.read_csv('../submitted/' + f)
        if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
            print('It is the same as in: ' + f)
            is_exist = True
    return is_exist

## Original Data

In [11]:
df_trn = pd.read_csv('../data/training.csv')
df_tst = pd.read_csv('../data/test.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [12]:
X_trn = df_trn.drop(columns=['CurrencyCode', 'CountryCode', 'ProductCategory', 'TransactionStartTime', 'FraudResult'], axis=1)
X_tst = df_tst.drop(columns=['CurrencyCode', 'CountryCode', 'ProductCategory', 'TransactionStartTime'               ], axis=1)

y_trn = df_trn['FraudResult']

In [13]:
X_trn.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ChannelId,Amount,Value,PricingStrategy
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,ProviderId_6,ProductId_10,ChannelId_3,1000.0,1000,2
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,ProviderId_4,ProductId_6,ChannelId_2,-20.0,20,2
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,ProviderId_6,ProductId_1,ChannelId_3,500.0,500,2
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,ProviderId_1,ProductId_21,ChannelId_3,20000.0,21800,2
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,ProviderId_4,ProductId_6,ChannelId_2,-644.0,644,2


In [14]:
# ValueError: could not convert string to float: 'TransactionId_76871'
# dtc = dtc_fit()

## Num Encoding

In [15]:
df_trn = pd.read_csv('../data/training_num.csv')
df_tst = pd.read_csv('../data/test_num.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [16]:
X_trn = df_trn.drop(columns=['ProductCategory', 'TransactionStartTime', 'FraudResult'], axis=1)
X_tst = df_tst.drop(columns=['ProductCategory', 'TransactionStartTime'               ], axis=1)

y_trn = df_trn['FraudResult']

In [17]:
X_trn.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ChannelId,Amount,Value,PricingStrategy
0,36123,3957,887,4406,6,10,3,1000.0,1000,2
1,15642,4841,3829,4406,4,6,2,-20.0,20,2
2,53941,4229,222,4683,6,1,3,500.0,500,2
3,102363,648,2185,988,1,21,3,20000.0,21800,2
4,38780,4841,3829,988,4,6,2,-644.0,644,2


In [18]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-num-010-features')

Feature ranking:
  1.                          Value: 0.8049481074
  2.                         Amount: 0.0893740669
  3.                        BatchId: 0.0317878438
  4.                 SubscriptionId: 0.0265620861
  5.                      AccountId: 0.0185416698
  6.                     CustomerId: 0.0171805802
  7.                     ProviderId: 0.0090918652
  8.                      ProductId: 0.0013746073
  9.                PricingStrategy: 0.0011391732


In [19]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_num_010.csv', encoding='utf-8', index=False)

Counter({0: 44953, 1: 66})
New result!


**Result:** `65   0.703703703703704`

### Оставим только 'логичные' features

In [20]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProductCategory',
    'TransactionStartTime'
]
X_trn = df_trn.drop(columns=columns4drop, axis=1)
X_tst = df_tst.drop(columns=columns4drop, axis=1)

X_trn = X_trn.drop(columns=['FraudResult'], axis=1)

In [21]:
X_trn.head()

Unnamed: 0,ProviderId,ProductId,ChannelId,Amount,Value,PricingStrategy
0,6,10,3,1000.0,1000,2
1,4,6,2,-20.0,20,2
2,6,1,3,500.0,500,2
3,1,21,3,20000.0,21800,2
4,4,6,2,-644.0,644,2


In [22]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-num-006-features')

Feature ranking:
  1.                          Value: 0.8821218489
  2.                         Amount: 0.0818062812
  3.                     ProviderId: 0.0219282799
  4.                      ProductId: 0.0067519049
  5.                PricingStrategy: 0.0062353404
  6.                      ChannelId: 0.0011563446


In [23]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_num_006.csv', encoding='utf-8', index=False)

Counter({0: 44946, 1: 73})
New result!


**Result:** `74   0.688524590163934`

## Label Encoding

In [24]:
df_trn = pd.read_csv('../data/training_le.csv')
df_tst = pd.read_csv('../data/test_le.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [25]:
X_trn = df_trn.drop(columns=['TransactionStartTime', 'FraudResult'], axis=1)
X_tst = df_tst.drop(columns=['TransactionStartTime'               ], axis=1)

y_trn = df_trn['FraudResult']

In [26]:
X_trn.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy
0,36122,3956,886,4405,5,9,0,2,1000.0,1000,2
1,15641,4840,3828,4405,3,5,2,1,-20.0,20,2
2,53940,4228,221,4682,5,0,0,2,500.0,500,2
3,102362,647,2184,987,0,20,9,2,20000.0,21800,2
4,38779,4840,3828,987,3,5,2,1,-644.0,644,2


In [27]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-LE-011-features')

Feature ranking:
  1.                          Value: 0.8306119249
  2.                         Amount: 0.0613539279
  3.                        BatchId: 0.0289683819
  4.                 SubscriptionId: 0.0273961886
  5.                     CustomerId: 0.0210882269
  6.                      AccountId: 0.0124253133
  7.                     ProviderId: 0.0090918652
  8.                ProductCategory: 0.0051757834
  9.                PricingStrategy: 0.0025137805
 10.                      ProductId: 0.0013746073


In [28]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_LE_011.csv', encoding='utf-8', index=False)

Counter({0: 44954, 1: 65})
New result!


**Result:** `64   0.703703703703704`

### Оставим только 'логичные' features

In [29]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'TransactionStartTime'
]
X_trn = df_trn.drop(columns=columns4drop, axis=1)
X_tst = df_tst.drop(columns=columns4drop, axis=1)

X_trn = X_trn.drop(columns=['FraudResult'], axis=1)

In [30]:
X_trn.head()

Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy
0,5,9,0,2,1000.0,1000,2
1,3,5,2,1,-20.0,20,2
2,5,0,0,2,500.0,500,2
3,0,20,9,2,20000.0,21800,2
4,3,5,2,1,-644.0,644,2


In [31]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-LE-007-features')

Feature ranking:
  1.                          Value: 0.8871534479
  2.                         Amount: 0.0764916382
  3.                     ProviderId: 0.0219282799
  4.                PricingStrategy: 0.0054752804
  5.                ProductCategory: 0.0047327550
  6.                      ProductId: 0.0031042323
  7.                      ChannelId: 0.0011143663


In [32]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_LE_007.csv', encoding='utf-8', index=False)

Counter({0: 44945, 1: 74})
New result!


**Result:** `74   0.690909090909091`

## One Hot Encoding

In [33]:
df_trn = pd.read_csv('../data/training_ohe_short.csv')
df_tst = pd.read_csv('../data/test_ohe_short.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [34]:
X_trn = df_trn.drop(columns=['TransactionStartTime', 'FraudResult'], axis=1)
X_tst = df_tst.drop(columns=['TransactionStartTime'               ], axis=1)

y_trn = df_trn['FraudResult']

In [35]:
X_trn.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,Amount,Value,ProviderId_0,ProviderId_1,ProviderId_2,ProviderId_3,...,ProductCategory_9,ChannelId_0,ChannelId_1,ChannelId_2,ChannelId_3,ChannelId_4,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_3
0,36122,3956,886,4405,1000.0,1000,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,15641,4840,3828,4405,-20.0,20,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
2,53940,4228,221,4682,500.0,500,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,102362,647,2184,987,20000.0,21800,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
4,38779,4840,3828,987,-644.0,644,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0


In [36]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-OHE-058-features')

Feature ranking:
  1.                          Value: 0.8293089165
  2.                         Amount: 0.0658393629
  3.                     CustomerId: 0.0275072198
  4.                        BatchId: 0.0270248791
  5.                 SubscriptionId: 0.0207476142
  6.                   ProviderId_2: 0.0153381585
  7.                      AccountId: 0.0041823298
  8.                    ProductId_9: 0.0035200448
  9.                   ProviderId_4: 0.0026430866
 10.                    ChannelId_0: 0.0013746073
 11.              ProductCategory_0: 0.0013746073
 12.              PricingStrategy_0: 0.0011391732


In [37]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_OHE_058.csv', encoding='utf-8', index=False)

Counter({0: 44954, 1: 65})
It is the same as in: AlBo0816_DT_LE_011.csv


**Result:** `66   0.678571428571428`

### Оставим только 'логичные' features

In [38]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'TransactionStartTime'
]
X_trn = df_trn.drop(columns=columns4drop, axis=1)
X_tst = df_tst.drop(columns=columns4drop, axis=1)

X_trn = X_trn.drop(columns=['FraudResult'], axis=1)

In [39]:
X_trn.head()

Unnamed: 0,Amount,Value,ProviderId_0,ProviderId_1,ProviderId_2,ProviderId_3,ProviderId_4,ProviderId_5,ProductId_0,ProductId_1,...,ProductCategory_9,ChannelId_0,ChannelId_1,ChannelId_2,ChannelId_3,ChannelId_4,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_3
0,1000.0,1000,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
1,-20.0,20,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,500.0,500,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,1,0
3,20000.0,21800,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
4,-644.0,644,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [40]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-OHE-054-features')

Feature ranking:
  1.                          Value: 0.8858869306
  2.                         Amount: 0.0770119378
  3.                   ProviderId_2: 0.0169442015
  4.              PricingStrategy_2: 0.0046853078
  5.                    ProductId_9: 0.0033010440
  6.                   ProviderId_4: 0.0030404036
  7.                   ProviderId_0: 0.0028780395
  8.                    ProductId_2: 0.0016104595
  9.                    ChannelId_2: 0.0014634143
 10.              PricingStrategy_0: 0.0012127699
 11.              ProductCategory_2: 0.0011154623
 12.                    ChannelId_0: 0.0008496473
 13.                   ProviderId_1: 0.0000003818


In [41]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_OHE_054.csv', encoding='utf-8', index=False)

Counter({0: 44946, 1: 73})
It is the same as in: AlBo0816_DT_num_006.csv


## Probability Encoding

In [42]:
df_trn = pd.read_csv('../data/training_pe.csv')
df_tst = pd.read_csv('../data/test_pe.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [43]:
X_trn = df_trn.drop(columns=['BatchId', 'TransactionStartTime', 'FraudResult'], axis=1)
X_tst = df_tst.drop(columns=['BatchId', 'TransactionStartTime'               ], axis=1)

y_trn = df_trn['FraudResult']

In [44]:
X_trn.head()

Unnamed: 0,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy
0,0.0,0.0,0.0,8.8e-05,0.00039,0.0004,0.003232,1000.0,1000,0.001741
1,3.2e-05,3.1e-05,0.0,0.000131,3.1e-05,0.003546,0.000135,-20.0,20,0.001741
2,0.0,0.0,0.0,8.8e-05,0.0,0.0004,0.003232,500.0,500,0.001741
3,0.0,0.0,0.0,0.010101,0.002646,0.00625,0.003232,20000.0,21800,0.001741
4,3.2e-05,3.1e-05,0.0,0.000131,3.1e-05,0.003546,0.000135,-644.0,644,0.001741


In [45]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-PE-010-features')

Feature ranking:
  1.                          Value: 0.8221848669
  2.                      AccountId: 0.0909695996
  3.                         Amount: 0.0345014076
  4.                     CustomerId: 0.0187692620
  5.                 SubscriptionId: 0.0155127882
  6.                     ProviderId: 0.0133547267
  7.                      ProductId: 0.0024385665
  8.                PricingStrategy: 0.0022687826


In [46]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_PE_010.csv', encoding='utf-8', index=False)

Counter({0: 44958, 1: 61})
New result!


**Result:** `61   0.666666666666667`

### Оставим только 'логичные' features

In [47]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'TransactionStartTime'
]
X_trn = df_trn.drop(columns=columns4drop, axis=1)
X_tst = df_tst.drop(columns=columns4drop, axis=1)

X_trn = X_trn.drop(columns=['FraudResult'], axis=1)

In [48]:
X_trn.head()

Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy
0,8.8e-05,0.00039,0.0004,0.003232,1000.0,1000,0.001741
1,0.000131,3.1e-05,0.003546,0.000135,-20.0,20,0.001741
2,8.8e-05,0.0,0.0004,0.003232,500.0,500,0.001741
3,0.010101,0.002646,0.00625,0.003232,20000.0,21800,0.001741
4,0.000131,3.1e-05,0.003546,0.000135,-644.0,644,0.001741


In [49]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-PE-007-features')

Feature ranking:
  1.                          Value: 0.8643917556
  2.                         Amount: 0.0993409293
  3.                     ProviderId: 0.0215410726
  4.                      ProductId: 0.0070489018
  5.                PricingStrategy: 0.0047392899
  6.                ProductCategory: 0.0017817084
  7.                      ChannelId: 0.0011563425


In [50]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_OHE_007.csv', encoding='utf-8', index=False)

Counter({0: 44945, 1: 74})
It is the same as in: AlBo0816_DT_LE_007.csv


### 5 'super features' from top20 $\chi^2$

In [51]:
df_trn = pd.read_csv('../data/train-agg-cut.csv')
df_tst = pd.read_csv('../data/test-agg-cut.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [52]:
columns5 = [
    'AmountPositive',                                                           #01
#     'Value',                                                                    #02
    'account_product_transactions__AmountPositive_global_sum',                  #03
    'account_provider_transactions__AmountPositive_global_avg',                 #04
#     'account_product_category_transactions__AmountPositive_global_sum',         #05
#     'account_provider_transactions__Value_global_avg',                          #06
#     'account_provider_transactions__AmountPositive_global_sum',                 #07
#     'account_channel_transactions__AmountPositive_global_sum',                  #08
#     'account_product_transactions__AmountPositive_global_avg',                  #09
#     'account_transactions__AmountPositive_global_sum',                          #10
#     'account_pricing_strategy_transactions__AmountPositive_global_sum',         #11
#     'account_product_category_transactions__AmountPositive_global_avg',         #12
    'account_product_transactions__AmountPositive_week_sum',                    #13
#     'account_provider_transactions__AmountPositive_week_avg',                   #14
#     'account_pricing_strategy_transactions__AmountPositive_global_avg',         #15
#     'account_product_transactions__Value_global_avg',                           #16
#     'account_product_category_transactions__Value_global_avg',                  #17
    'account_channel_transactions__AmountPositive_global_avg',                  #18
#     'account_transactions__AmountPositive_global_avg',                          #19
#     'account_provider_transactions__AmountPositive_week_sum'                    #20
]

In [53]:
X_trn = df_trn[columns5]
y_trn = df_trn['FraudResult']

X_tst = df_tst[columns5]

In [54]:
X_trn.head()

Unnamed: 0,AmountPositive,account_product_transactions__AmountPositive_global_sum,account_provider_transactions__AmountPositive_global_avg,account_product_transactions__AmountPositive_week_sum,account_channel_transactions__AmountPositive_global_avg
0,1000.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,500.0,0.0,0.0,0.0,0.0
3,20000.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


In [55]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-top20chi2-005-features')

Feature ranking:
  1.                 AmountPositive: 0.9076541431
  2. account_product_transactions__AmountPositive_global_sum: 0.0434348625
  3. account_channel_transactions__AmountPositive_global_avg: 0.0389385613
  4. account_provider_transactions__AmountPositive_global_avg: 0.0092533725
  5. account_product_transactions__AmountPositive_week_sum: 0.0007190605


In [56]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_top20chi2_005.csv', encoding='utf-8', index=False)

Counter({0: 44934, 1: 85})
New result!


**Result:** `85   0.71875`

### Применим Random Forest

In [57]:
from sklearn.ensemble import RandomForestClassifier

In [58]:
rfc = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=24)

In [59]:
rfc.fit(X_trn, y_trn)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=24, verbose=0,
                       warm_start=False)

In [61]:
predict = rfc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_RF_top20_005.csv', encoding='utf-8', index=False)

Counter({0: 44952, 1: 67})
New result!


**Result:** `67   0.727272727272727`

In [62]:
rfc.score(X_trn, y_trn)

0.9996550354372687

In [63]:
rfc.estimators_

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                        max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=1975759266, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                        max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=1264583555, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                        max_features='auto', max_leaf_nodes=None,
                        mi

In [64]:
rfc_00 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                        max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=1975759266, splitter='best')

In [66]:
rfc_00_fit = rfc_00.fit(X_trn, y_trn)
dtc_to_pdf(rfc_00_fit, 'DecisionTree-rfc-00')

### другие настройки Random Forest

In [75]:
rfc = RandomForestClassifier(
    n_estimators=200,                  # default=10
    criterion='gini', 
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,                 # default=”auto”
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
#     min_impurity_split=1e-7,
    bootstrap=True,
    oob_score=True,                    # default=False
    n_jobs=-1,                         # default=None    
    random_state=24,                   # default=None
    verbose=1,                         # default=0
    warm_start=False,
    class_weight='balanced_subsample'  # default=None
)

In [76]:
rfc.fit(X_trn, y_trn)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    3.2s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
                       criterion='gini', max_depth=None, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=200, n_jobs=-1, oob_score=True,
                       random_state=24, verbose=1, warm_start=False)

In [81]:
rfc.feature_importances_

array([0.96119242, 0.00651088, 0.01263442, 0.00096702, 0.01869525])

In [82]:
rfc.oob_score_

0.9994459660053103

In [83]:
predict = rfc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0819_RF_top20_005_v2.csv', encoding='utf-8', index=False)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 200 out of 200 | elapsed:    0.1s finished


Counter({0: 44944, 1: 75})
New result!


**Result:** `75  0.754098360655738`

### Grid Search Parameters for Random Forest

In [86]:
from sklearn.model_selection import GridSearchCV

In [87]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'n_estimators': [10, 20, 40, 80, 100, 200, 400, 1000],
    'criterion'   : ['gini', 'entropy'],
    'max_depth'   : [4, 6, 8, 10, 12, None],
    'max_features': ['auto', None],   
    'bootstrap'   : [True, False],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

In [88]:
# Create a based model
rfc = RandomForestClassifier(n_jobs=-1)

In [90]:
# Instantiate the grid search model
grid_search = GridSearchCV(
    estimator = rfc,
    param_grid = param_grid,
    cv = 3,
    n_jobs = -1,
    verbose = 2
)

In [92]:
# Fit the grid search to the data
grid_search.fit(X_trn, y_trn)

Fitting 3 folds for each of 1152 candidates, totalling 3456 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 353 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 636 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 1001 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 1446 tasks      | elapsed: 26.4min
[Parallel(n_jobs=-1)]: Done 1973 tasks      | elapsed: 36.7min
[Parallel(n_jobs=-1)]: Done 2580 tasks      | elapsed: 49.7min
[Parallel(n_jobs=-1)]: Done 3269 tasks      | elapsed: 66.4min
[Parallel(n_jobs=-1)]: Done 3456 out of 3456 | elapsed: 71.7min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=Fals...
                                              warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'bootstrap': [True, False],
    

In [93]:
grid_search.best_params_

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 4,
 'max_features': None,
 'n_estimators': 10}

**Fine Tuning Grid Search**

In [94]:
# Create the parameter grid based on the results of random search 
param_grid_fine = {
    'n_estimators': [6, 8, 10, 12, 14],
    'criterion'   : ['gini', 'entropy'],
    'max_depth'   : [2, 3, 4, 5],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

In [96]:
# Create a based model
rfc_fine = RandomForestClassifier(bootstrap=False, max_features=None, n_jobs=-1)

In [98]:
# Instantiate the grid search model
grid_search_fine = GridSearchCV(
    estimator = rfc_fine,
    param_grid = param_grid_fine,
    cv = 3,
    n_jobs = -1,
    verbose = 2
)

In [99]:
# Fit the grid search to the data
grid_search_fine.fit(X_trn, y_trn)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   27.1s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=False,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                      

In [101]:
grid_search_fine.best_params_

{'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 3,
 'n_estimators': 6}

In [104]:
rfc_final = RandomForestClassifier(
    n_estimators=6,
    criterion='entropy',
    max_depth=3,
    max_features=None,
    bootstrap=False,
#     oob_score=True,
    n_jobs=-1,
    random_state=24,
    verbose=2,
    class_weight=None
)

In [105]:
rfc_final.fit(X_trn, y_trn)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


building tree 1 of 6
building tree 2 of 6
building tree 3 of 6building tree 4 of 6
building tree 5 of 6building tree 6 of 6




RandomForestClassifier(bootstrap=False, class_weight=None, criterion='entropy',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=6, n_jobs=-1,
                       oob_score=False, random_state=24, verbose=2,
                       warm_start=False)

In [106]:
rfc_final.feature_importances_

array([9.84534785e-01, 4.18999518e-03, 8.91985767e-04, 0.00000000e+00,
       1.03832342e-02])

In [109]:
rfc_final.oob_score

False

In [111]:
predict = rfc_final.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0819_RF_top20_005_v3.csv', encoding='utf-8', index=False)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    0.0s finished


Counter({0: 44966, 1: 53})
New result!


**Result:** `53  0.612244897959184` 

## 18 first of the top20 $\chi^2$

In [112]:
columns18 = [
    'AmountPositive',                                                           #01
    'Value',                                                                    #02
    'account_product_transactions__AmountPositive_global_sum',                  #03
    'account_provider_transactions__AmountPositive_global_avg',                 #04
    'account_product_category_transactions__AmountPositive_global_sum',         #05
    'account_provider_transactions__Value_global_avg',                          #06
    'account_provider_transactions__AmountPositive_global_sum',                 #07
    'account_channel_transactions__AmountPositive_global_sum',                  #08
    'account_product_transactions__AmountPositive_global_avg',                  #09
    'account_transactions__AmountPositive_global_sum',                          #10
    'account_pricing_strategy_transactions__AmountPositive_global_sum',         #11
    'account_product_category_transactions__AmountPositive_global_avg',         #12
    'account_product_transactions__AmountPositive_week_sum',                    #13
    'account_provider_transactions__AmountPositive_week_avg',                   #14
    'account_pricing_strategy_transactions__AmountPositive_global_avg',         #15
    'account_product_transactions__Value_global_avg',                           #16
    'account_product_category_transactions__Value_global_avg',                  #17
    'account_channel_transactions__AmountPositive_global_avg',                  #18
#     'account_transactions__AmountPositive_global_avg',                          #19
#     'account_provider_transactions__AmountPositive_week_sum'                    #20
]

In [113]:
X_trn = df_trn[columns18]
y_trn = df_trn['FraudResult']

X_tst = df_tst[columns18]

In [114]:
X_trn.head()

Unnamed: 0,AmountPositive,Value,account_product_transactions__AmountPositive_global_sum,account_provider_transactions__AmountPositive_global_avg,account_product_category_transactions__AmountPositive_global_sum,account_provider_transactions__Value_global_avg,account_provider_transactions__AmountPositive_global_sum,account_channel_transactions__AmountPositive_global_sum,account_product_transactions__AmountPositive_global_avg,account_transactions__AmountPositive_global_sum,account_pricing_strategy_transactions__AmountPositive_global_sum,account_product_category_transactions__AmountPositive_global_avg,account_product_transactions__AmountPositive_week_sum,account_provider_transactions__AmountPositive_week_avg,account_pricing_strategy_transactions__AmountPositive_global_avg,account_product_transactions__Value_global_avg,account_product_category_transactions__Value_global_avg,account_channel_transactions__AmountPositive_global_avg
0,1000.0,1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,500.0,500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20000.0,21800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,644,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,0.0


In [115]:
dtc = dtc_fit()
feature_importances_print(dtc)
dtc_to_pdf(dtc, 'DecisionTree-top20chi2-018-features')

Feature ranking:
  1.                          Value: 0.8150912646
  2.                 AmountPositive: 0.0871761807
  3. account_product_transactions__AmountPositive_global_sum: 0.0194949620
  4. account_provider_transactions__AmountPositive_global_sum: 0.0149319136
  5. account_pricing_strategy_transactions__AmountPositive_global_avg: 0.0145743829
  6. account_channel_transactions__AmountPositive_global_sum: 0.0113076112
  7. account_pricing_strategy_transactions__AmountPositive_global_sum: 0.0090283276
  8. account_provider_transactions__Value_global_avg: 0.0088210358
  9. account_transactions__AmountPositive_global_sum: 0.0067833735
 10. account_product_category_transactions__AmountPositive_global_avg: 0.0039824583
 11. account_product_category_transactions__AmountPositive_global_sum: 0.0037278453
 12. account_product_transactions__Value_global_avg: 0.0017027737
 13. account_product_transactions__AmountPositive_global_avg: 0.0013848387
 14. account_provider_transactions__AmountPosi

In [116]:
predict = dtc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_DT_top20chi2_018.csv', encoding='utf-8', index=False)

Counter({0: 44942, 1: 77})
New result!


**Result:** `77  0.786885245901639`

### Применим Random Forest

In [117]:
rfc = RandomForestClassifier(n_estimators=10, max_depth=5, n_jobs=-1, random_state=24)

In [118]:
rfc.fit(X_trn, y_trn)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=24, verbose=0,
                       warm_start=False)

In [119]:
predict = rfc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0816_RF_top20_018_v2.csv', encoding='utf-8', index=False)

Counter({0: 44947, 1: 72})
New result!


**Result:** `72   0.793103448275862`

### другие настройки Random Forest

In [120]:
rfc = RandomForestClassifier(
    n_estimators=200,                  # default=10
    criterion='entropy',               # default='gini'
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,                 # default=”auto”
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
#     min_impurity_split=1e-7,
    bootstrap=True,
    oob_score=True,                    # default=False
    n_jobs=-1,                         # default=None    
    random_state=24,                   # default=None
    verbose=0,
    warm_start=False,
    class_weight=None                  # 'balanced_subsample'
)

In [121]:
rfc.fit(X_trn, y_trn)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=True, random_state=24, verbose=0,
                       warm_start=False)

In [122]:
rfc.feature_importances_

array([0.07850147, 0.80475369, 0.00586228, 0.01236296, 0.00483298,
       0.01636808, 0.00724046, 0.00457732, 0.00257436, 0.00484795,
       0.00875807, 0.00238348, 0.00087423, 0.0013658 , 0.0306087 ,
       0.0034711 , 0.00228491, 0.00833216])

In [123]:
rfc.oob_score_

0.9996445819656709

In [124]:
predict = rfc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0819_RF_top20_018_v3.csv', encoding='utf-8', index=False)

Counter({0: 44944, 1: 75})
It is the same as in: AlBo0726_top13chi2_BaggingClassifier.csv


**Result:** `75  0.813559322033898`

**Пробуем `class_weight`**

In [126]:
rfc = RandomForestClassifier(
    n_jobs=-1,
    random_state=24,
    class_weight={0: 1, 1: 50}
)

In [127]:
rfc.fit(X_trn, y_trn)



RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 50},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=-1, oob_score=False,
                       random_state=24, verbose=0, warm_start=False)

In [130]:
predict = rfc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0819_RF_top20_018_v4.csv', encoding='utf-8', index=False)

Counter({0: 44951, 1: 68})
New result!


**Result:** `68  0.766666666666667`

In [131]:
rfc = RandomForestClassifier(
    n_estimators=200,                  # default=10
    criterion='entropy',               # default='gini'
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,                 # default=”auto”
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
#     min_impurity_split=1e-7,
    bootstrap=True,
    oob_score=True,                    # default=False
    n_jobs=-1,                         # default=None    
    random_state=24,                   # default=None
    verbose=0,
    warm_start=False,
    class_weight={0: 1, 1: 50}         # default=None
)

In [132]:
rfc.fit(X_trn, y_trn)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 50},
                       criterion='entropy', max_depth=None, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=200, n_jobs=-1, oob_score=True,
                       random_state=24, verbose=0, warm_start=False)

In [133]:
rfc.oob_score_

0.9994982333632999

In [134]:
predict = rfc.predict(X_tst)
df_sbm['FraudResult'] = predict
print(Counter(df_sbm['FraudResult']))

if not is_predict_exist(df_sbm):
    print('New result!')
    df_sbm.to_csv('../submitted/AlBo0819_RF_top20_018_v5.csv', encoding='utf-8', index=False)

Counter({0: 44942, 1: 77})
New result!
