# Getting the lay of the land

In [1]:
import pandas as pd
from pandas_profiling import ProfileReport
import os
import random

In [2]:
"""
To run this notebook, you need to install a python package called `pandas-profiling`

You can do this using conda:
`conda install -c conda-forge pandas-profiling`
"""

'\nTo run this notebook, you need to install a python package called `pandas-profiling`\n\nYou can do this using conda:\n`conda install -c conda-forge pandas-profiling`\n'

In [3]:
transactions_dataset_filepath = "ieee-fraud-detection/train_transaction.csv"
identity_dataset_filepath     = "ieee-fraud-detection/train_identity.csv"

In [4]:
print(f"Size of transactions dataset file ({transactions_dataset_filepath}) is {os.stat(transactions_dataset_filepath).st_size/1000} Kbytes")
print(f"Size of identity dataset file ({identity_dataset_filepath}) is {os.stat(identity_dataset_filepath).st_size/1000} Kbytes")

Size of transactions dataset file (ieee-fraud-detection/train_transaction.csv) is 683351.067 Kbytes
Size of identity dataset file (ieee-fraud-detection/train_identity.csv) is 26529.68 Kbytes


To start, we would like to know things about our datasets. Let's start to understand how many rows there are.

In [5]:
def count_lines_in_file(file_path: str) -> int:
    with open(file_path) as file_pointer:
        for i, l in enumerate(file_pointer):
            pass
    return i + 1

In [6]:
# We know that a csv file should always start with a header. Therefore, after counting the lines of a file, the number of lines of the csv is that value minus one
transactions_dataset_lines = count_lines_in_file(transactions_dataset_filepath)
identity_dataset_lines     = count_lines_in_file(identity_dataset_filepath)

In [7]:
print(f"The transactions dataset has {transactions_dataset_lines} lines.")
print(f"The identity dataset has {identity_dataset_lines} lines.")

The transations dataset has 590541 lines.
The identity dataset has 144234 lines.


With this insights, we already know that:
- There are a total of 144234 identities (entities) in the dataset
- There are 590541 of total transactions
- We can average the following: each entity did, on average, 590541/144234 = 4 transactions (well, this value really represents nothing. it is just an average)

Now that we know the lines, let's understand the fields for each of them.

In [8]:
def read_csv_with_sample(csv_path: str, sample_percentage: float, random_seed: int = None) -> 'pandas.DataFrame':
    assert sample_percentage >= 0 and sample_percentage <=1
    
    random.seed(random_seed)
    csv_rows = count_lines_in_file(csv_path) - 1
    to_read_rows = int(round(csv_rows*sample_percentage,0))

    rows_to_be_skipped = sorted(random.sample(range(1,csv_rows),csv_rows-to_read_rows-1))
    return pd.read_csv(csv_path, skiprows=rows_to_be_skipped)
    

In [9]:
transaction_p001 = read_csv_with_sample(transactions_dataset_filepath, 0.01)

In [10]:
transaction_p001.shape

(5906, 394)

In [11]:
transaction_p001.columns

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338',
       'V339'],
      dtype='object', length=394)

Well, there are a lot of variables!

Let's do a process to make us select just 25 features.
To do this, we are going to:
1. Choose a small percentage of the dataset to sample, like 5%
2. Train a Random Forest with this data
3. Get the top 25 important features
4. Go back to number one, and increase the percentage. The goal is to compare the stability of the top 25 features between models. We stop when the top 25 features start being stable

## Reducing the space

In [12]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from functools import reduce
from typing import List

In [13]:
def get_columns_index_containing_nan_values(df: 'pandas.DataFrame') -> List[str]:
    columns = df.columns.values.tolist()
    isna_columns = df.isna().any().reset_index()
    na_columns_index = np.argwhere(isna_columns[0].values).tolist()
    if len(na_columns_index) == 0:
        return []
    else:
        return reduce(
            lambda x,y: x+y, 
            na_columns_index
        )


def deal_with_missing_values(df: 'pandas.DataFrame', columns_index_containing_na: List[str]) -> 'pandas.DataFrame':
    for index_column_contains_nan in columns_index_containing_na:
        column_name = df.columns[index_column_contains_nan]
        if df[column_name].dtypes == "float64":
            df.loc[:,column_name] = df[column_name].apply(lambda value: -50000 if pd.isna(value) else value)
        elif df[column_name].dtypes == "object":
            df.loc[:,column_name] = df[column_name].apply(lambda value: "NaN" if pd.isna(value) else value)
        else:
            print(columns[index_column_contains_nan])
            print(df[columns[index_column_contains_nan]].dtypes)
            raise Exception("There is one case of missing values that we are not dealing with")
    return df


def get_categoric_columns(df: 'pandas.DataFrame') -> List[str]:
    categoric_columns = []
    for column_name in df.columns:
        if df[column_name].dtypes == "object":
            categoric_columns.append(column_name)
    return categoric_columns


def create_onehotencoded_dataframe(df, categoric_columns):
    encoder_onehot = OneHotEncoder(handle_unknown='ignore')
    encoder_onehot.fit(df[categoric_columns])
    one_hot_encoded_current_column = encoder_onehot.transform(df[categoric_columns])
    one_hot_encoded_df = pd.DataFrame.sparse.from_spmatrix(one_hot_encoded_current_column, columns=encoder_onehot.get_feature_names(categoric_columns))
    return (pd.concat(
                [
                    df.drop(categoric_columns, axis=1), 
                    one_hot_encoded_df
                ], 
                axis=1, 
                sort=False
            ),
            encoder_onehot.get_feature_names(categoric_columns)
            )


def create_dataframe_with_onehotencoded_categories(df: 'pandas.DataFrame') -> 'pandas.Dataframe':
    isna_columns_list = get_columns_index_containing_nan_values(df)
    if len(isna_columns_list) > 0:
        df = deal_with_missing_values(df, isna_columns_list)
    categoric_variables = get_categoric_columns(df)
    return create_onehotencoded_dataframe(df, categoric_variables)
    
    

In [21]:
transaction_p001_onencoded, encoded_columns = create_dataframe_with_onehotencoded_categories(transaction_p001)
transaction_p001_onencoded.shape

(5906, 510)

In [22]:
Y = transaction_p001_onencoded.isFraud
X = transaction_p001_onencoded.loc[:,transaction_p001_onencoded.columns != "isFraud"]

In [23]:
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)
forest.fit(X,Y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=250,
                     n_jobs=None, oob_score=False, random_state=0, verbose=0,
                     warm_start=False)

In [24]:
importances = forest.feature_importances_

In [25]:
X.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,M6_T,M7_F,M7_NaN,M7_T,M8_F,M8_NaN,M8_T,M9_F,M9_NaN,M9_T
0,2987072,87752,6.767,13832,375.0,185.0,224.0,-50000.0,-50000.0,-50000.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,2987092,88054,117.0,6481,111.0,150.0,226.0,337.0,87.0,327.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,2987230,90084,107.95,6170,174.0,150.0,226.0,315.0,87.0,14.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,2987538,95817,209.95,7482,490.0,150.0,226.0,325.0,87.0,6.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,2987887,107595,50.149,4461,375.0,185.0,224.0,-50000.0,-50000.0,-50000.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [26]:
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
acc_importance = 0
for f in range(X.shape[1]):
    acc_importance += importances[indices[f]]
    print(f"{f+1}. Feature ({indices[f]},{X.columns[indices[f]]}) ({importances[indices[f]]}) - accumulated_importance is {round(acc_importance,3)}")

Feature ranking:
1. Feature (4,card2) (0.022650739787605536) - accumulated_importance is 0.023
2. Feature (2,TransactionAmt) (0.020804143913682654) - accumulated_importance is 0.043
3. Feature (1,TransactionDT) (0.019664040050964577) - accumulated_importance is 0.063
4. Feature (0,TransactionID) (0.01963973076013291) - accumulated_importance is 0.083
5. Feature (3,card1) (0.018460803216801526) - accumulated_importance is 0.101
6. Feature (7,addr1) (0.014903882721325286) - accumulated_importance is 0.116
7. Feature (6,card5) (0.012593470511317581) - accumulated_importance is 0.129
8. Feature (24,C14) (0.010886801875898418) - accumulated_importance is 0.14
9. Feature (23,C13) (0.010808917380364543) - accumulated_importance is 0.15
10. Feature (11,C1) (0.010723804536919453) - accumulated_importance is 0.161
11. Feature (21,C11) (0.008900419790240111) - accumulated_importance is 0.17
12. Feature (12,C2) (0.008849116123172147) - accumulated_importance is 0.179
13. Feature (390,card6_credit)

We can see that with around 200 features, we have an accumulated importance of 80%. This means that with less than half of the features, we get around 80% of the information between features and label.

Given we have one hot encoded categories, we need to understand what source columns correspond to these 200 features (some with one hot enconded values)

Let's create a map between the Source Column to the One Hot Encoded Column

In [27]:
X.shape[1]

509

And let's see the feature importance with source variables (some of them will appear repeated)

In [46]:
print("Feature ranking:")
acc_importance = 0
for f in range(X.shape[1]):
    acc_importance += importances[indices[f]]
    column_to_show = map_one_hot_encoded_column_to_source_column(X.columns[indices], X) if X.columns[indices[f]] in encoded_columns else X.columns[indices[f]]
    print(f"{f+1}. Feature ({indices[f]},{column_to_show}) ({importances[indices[f]]}) - accumulated_importance is {round(acc_importance,3)}")

Feature ranking:
1. Feature (4,card2) (0.022650739787605536) - accumulated_importance is 0.023
2. Feature (2,TransactionAmt) (0.020804143913682654) - accumulated_importance is 0.043
3. Feature (1,TransactionDT) (0.019664040050964577) - accumulated_importance is 0.063
4. Feature (0,TransactionID) (0.01963973076013291) - accumulated_importance is 0.083
5. Feature (3,card1) (0.018460803216801526) - accumulated_importance is 0.101
6. Feature (7,addr1) (0.014903882721325286) - accumulated_importance is 0.116
7. Feature (6,card5) (0.012593470511317581) - accumulated_importance is 0.129
8. Feature (24,C14) (0.010886801875898418) - accumulated_importance is 0.14
9. Feature (23,C13) (0.010808917380364543) - accumulated_importance is 0.15
10. Feature (11,C1) (0.010723804536919453) - accumulated_importance is 0.161
11. Feature (21,C11) (0.008900419790240111) - accumulated_importance is 0.17
12. Feature (12,C2) (0.008849116123172147) - accumulated_importance is 0.179
c_a


AssertionError: 

Now let's map the importance for each source columns

In [34]:
map_columns_accumulated_importance = {}

for f in range(X.shape[1]):
    print(X.columns[indices[f]])
    acc_importance += importances[indices[f]]
    source_column = map_one_hot_encoded_column_to_source_column(X.columns[indices[f]], X) if X.columns[indices[f]] in encoded_columns else X.columns[indices[f]]
    if source_column in map_columns_accumulated_importance:
        map_columns_accumulated_importance[source_column] += importances[indices[f]]
    else:
        map_columns_accumulated_importance[source_column] = importances[indices[f]]

# sort by importance
map_columns_accumulated_importance_sorted = {k: v for k, v in sorted(map_columns_accumulated_importance.items(), key=lambda item: -item[1])}


card2
TransactionAmt
TransactionDT
TransactionID
card1
addr1
card5
C14
C13
C1
C11
C2
card6_credit


AssertionError: 

In [36]:
# sort by importance
map_columns_accumulated_importance_sorted = {k: v for k, v in sorted(map_columns_accumulated_importance.items(), key=lambda item: -item[1])}


In [37]:
map_columns_accumulated_importance_sorted

{'card2': 0.022650739787605536,
 'TransactionAmt': 0.020804143913682654,
 'TransactionDT': 0.019664040050964577,
 'TransactionID': 0.01963973076013291,
 'card1': 0.018460803216801526,
 'addr1': 0.014903882721325286,
 'card5': 0.012593470511317581,
 'C14': 0.010886801875898418,
 'C13': 0.010808917380364543,
 'C1': 0.010723804536919453,
 'C11': 0.008900419790240111,
 'C2': 0.008849116123172147}

Now let's create a function that does this for us

In [45]:
def map_one_hot_encoded_column_to_source_column(encoded_columns: List[str], df: 'pandas.DataFrame'):
    one_hot_encoded_column_to_source_column = {}

    for column in encoded_columns:
        splitter = column.split("_")
        prefix = splitter[0]
        if not prefix in df.columns:
            splitter = column.split("_")
            for i in range(0, len(splitter)):
                if (i == 0):
                    prefix = encoded_columns[i]
                else:
                    prefix += encoded_columns[i]
            prefix = prefix[0] + "_" + prefix[1]
            print(prefix)
            assert prefix in df.columns
            one_hot_encoded_column_to_source_column[column] = prefix
    return one_hot_encoded_column_to_source_column


def compute_cumulative_importance_of_source_columns(train_df, model_importances, importance_indices, one_hot_encoded_column_to_source_column):
    map_columns_accumulated_importance = {}
    for f in range(X.shape[1]):
        source_column = one_hot_encoded_column_to_source_column[X.columns[importance_indices[f]]] if X.columns[importance_indices[f]] in encoded_columns else X.columns[importance_indices[f]]
        if source_column in map_columns_accumulated_importance:
            map_columns_accumulated_importance[source_column] += model_importances[importance_indices[f]]
        else:
            map_columns_accumulated_importance[source_column] = model_importances[importance_indices[f]]

    # sort by importance
    return {k: v for k, v in sorted(map_columns_accumulated_importance.items(), key=lambda item: -item[1])}

def get_source_columns_importance_from_model(random_forest: 'ExtraTreesClassifier', df: 'pandas.DataFrame', train_df: 'pandas.DataFrame', encoded_columns: List[str]):
    importances = random_forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    one_hot_encoded_column_to_source_column = map_one_hot_encoded_column_to_source_column(encoded_columns, df)
    return compute_cumulative_importance_of_source_columns(train_df,
                                                           importances,
                                                           indices,
                                                           one_hot_encoded_column_to_source_column)
    


In [38]:
feature_importances = get_source_columns_importance_from_model(forest, transaction_p001, X,  encoded_columns)

AssertionError: 

In [39]:
feature_importances

NameError: name 'feature_importances' is not defined

In [40]:
#sample 5%

transaction_sample = read_csv_with_sample(transactions_dataset_filepath, 0.05)

In [41]:
transaction_sample.shape

(29528, 394)

In [42]:
transaction_sample_onencoded, sample_encoded_columns = create_dataframe_with_onehotencoded_categories(transaction_sample)

In [43]:
transaction_sample_onencoded.shape

(29528, 534)

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
sample_Y = transaction_sample_onencoded.isFraud
sample_X = transaction_sample_onencoded.loc[:,transaction_sample_onencoded.columns != "isFraud"]

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,M6_T,M7_F,M7_NaN,M7_T,M8_F,M8_NaN,M8_T,M9_F,M9_NaN,M9_T
0,2987015,86618,57.95,7055,555.0,150.0,226.0,315.0,87.0,3.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,2987030,86994,35.0,13276,555.0,150.0,226.0,126.0,87.0,-50000.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,2987048,87317,42.294,15885,545.0,185.0,138.0,-50000.0,-50000.0,-50000.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,2987057,87445,50.0,11839,490.0,150.0,226.0,204.0,87.0,-50000.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,2987079,87839,28.699,4504,500.0,185.0,219.0,-50000.0,-50000.0,-50000.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [50]:
randomf = RandomForestClassifier(n_estimators=250)
randomf.fit(sample_X, sample_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [51]:
randomf_importances = randomf.feature_importances_

In [56]:
randomf_indices = np.argsort(randomf_importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
accum_importance = 0
for f in range(sample_X.shape[1]):
    accum_importance += randomf_importances[randomf_indices[f]]
    printf()
    print(f"{f+1}. Feature ({randomf_indices[f]},{sample_X.columns[randomf_indices[f]]}) ({randomf_importances[randomf_indices[f]]}) - accumulated_importance is {round(accum_importance,3)}")

Feature ranking:
1. Feature (0,TransactionID) (0.025458871058729866) - accumulated_importance is 0.025
2. Feature (1,TransactionDT) (0.024482412093890665) - accumulated_importance is 0.05
3. Feature (3,card1) (0.021586621450711125) - accumulated_importance is 0.072
4. Feature (2,TransactionAmt) (0.021540775003962374) - accumulated_importance is 0.093
5. Feature (4,card2) (0.019857074380905664) - accumulated_importance is 0.113
6. Feature (7,addr1) (0.014879017939064992) - accumulated_importance is 0.128
7. Feature (23,C13) (0.014385409522313696) - accumulated_importance is 0.142
8. Feature (84,V45) (0.013511184428489404) - accumulated_importance is 0.156
9. Feature (297,V258) (0.01307707873181319) - accumulated_importance is 0.169
10. Feature (11,C1) (0.012109177875363572) - accumulated_importance is 0.181
11. Feature (6,card5) (0.011238746112086048) - accumulated_importance is 0.192
12. Feature (24,C14) (0.010638243150728319) - accumulated_importance is 0.203
13. Feature (22,C12) (0.0

In [63]:
get_source_columns_importance_from_model(randomf, transaction_sample, sample_X, sample_encoded_columns)

IndexError: index 521 is out of bounds for axis 0 with size 518