In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sb

import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
import os

In [19]:
# Read the two aggregated extracts (non-sensitive and sensitive) into DataFrames.
data_non_sensitive, data_sensitive = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'C:\NICE Documents\Bank of Indonesia\agg_data_nonsensitive.csv',
        r'C:\NICE Documents\Bank of Indonesia\agg_data_sensitive.csv',
    )
)

In [20]:
# Replace every missing value with 0 in both extracts.
data_non_sensitive, data_sensitive = (
    frame.fillna(0) for frame in (data_non_sensitive, data_sensitive)
)

In [21]:
# The raw '#RIC' column is referred to as 'Currency_Pair' by the rest of the notebook.
ric_to_currency_pair = {'#RIC': 'Currency_Pair'}
data_non_sensitive = data_non_sensitive.rename(columns=ric_to_currency_pair)
data_sensitive = data_sensitive.rename(columns=ric_to_currency_pair)

In [22]:
def train(x_train, contamination_ratio):
    """Score every row with a per-currency-pair Isolation Forest.

    One ``IsolationForest`` (200 trees, ``random_state=22``) is fitted per
    distinct ``Currency_Pair`` on all columns except the key columns
    (``Currency_Pair``, ``Year``, ``Month``, ``Day``, ``Hour``). Every row of
    the pair is kept and ordered by the model's decision value (ascending).

    Parameters
    ----------
    x_train : pandas.DataFrame
        Must contain the five key columns; every remaining column is used as
        a model feature.
    contamination_ratio : float
        Forwarded to ``IsolationForest(contamination=...)``. Previously this
        argument was ignored and 1.0 was hard-coded; the notebook passes 1.0,
        so its results are unchanged.
        NOTE(review): recent scikit-learn documentation restricts
        ``contamination`` to 'auto' or a float in (0, 0.5] -- confirm that the
        installed version accepts the value being passed.

    Returns
    -------
    (df_anomalies, df_feature_anomalies)
        ``df_anomalies``: the input rows grouped by pair, plus an
        ``Anomaly_Score`` column holding the negated decision value, with a
        fresh 0..n-1 index.
        ``df_feature_anomalies``: one ``Score_<feature>`` column per feature,
        row-aligned with ``df_anomalies``; each value is the absolute
        distance of the per-pair min-max scaled feature from its per-pair mean.
    """
    id_columns = ['Currency_Pair', 'Year', 'Month', 'Day', 'Hour']

    # Collect per-pair results and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame inside a loop is quadratic.
    scored_frames = []
    for currency_pair in x_train.Currency_Pair.unique():
        pair_rows = x_train[x_train.Currency_Pair == currency_pair]
        pair_features = pair_rows.drop(id_columns, axis=1)
        model = IsolationForest(n_estimators=200, contamination=contamination_ratio, random_state=22)
        model.fit(pair_features)
        # Score once and reuse the values for both the new column and the ordering.
        pair_scored = pair_rows.assign(Anomaly_Score=model.decision_function(pair_features))
        scored_frames.append(pair_scored.sort_values(by='Anomaly_Score'))

    if not scored_frames:
        # No rows to score: return empty frames with the expected columns.
        feature_columns = x_train.drop(id_columns, axis=1).columns
        empty_scores = x_train.assign(Anomaly_Score=pd.Series(dtype=np.float64))
        empty_features = pd.DataFrame(columns=['Score_' + col for col in feature_columns])
        return empty_scores, empty_features

    df_anomalies = pd.concat(scored_frames, sort=False, ignore_index=True)
    # Isolation forest returns negative score for anomalous records so converting them to positive.
    df_anomalies['Anomaly_Score'] = -df_anomalies['Anomaly_Score']

    # Per-feature deviation scores, computed independently for each pair.
    non_feature_columns = id_columns + ['Anomaly_Score']
    scaler = MinMaxScaler()
    deviation_frames = []
    for currency_pair in df_anomalies.Currency_Pair.unique():
        pair_features = (
            df_anomalies[df_anomalies.Currency_Pair == currency_pair]
            .drop(non_feature_columns, axis=1)
            .astype(np.float64)
        )
        # The scaler is refitted for every pair, so scaling never crosses pairs.
        scaled = pd.DataFrame(scaler.fit_transform(pair_features), columns=pair_features.columns)
        deviation_frames.append((scaled - scaled.mean()).abs())

    df_feature_anomalies = pd.concat(deviation_frames, axis=0, ignore_index=True, sort=False)
    df_feature_anomalies.columns = ['Score_' + col for col in df_feature_anomalies.columns]
    return df_anomalies, df_feature_anomalies


In [23]:
def pivot_and_format_data(df_anomalies, df_feature_anomalies, trade_timing, top_n=5):
    """Reshape the scored rows and feature scores into two long, export-ready tables.

    Parameters
    ----------
    df_anomalies : pandas.DataFrame
        Key columns (``Currency_Pair``, ``Year``, ``Month``, ``Day``,
        ``Hour``), the feature columns and ``Anomaly_Score``.
    df_feature_anomalies : pandas.DataFrame
        Per-feature score columns, index-aligned with ``df_anomalies``. It is
        no longer modified in place; the key columns are attached to a copy.
    trade_timing : str
        Label written to the ``Trade_Timing`` column of both outputs.
    top_n : int, default 5
        Number of highest ``Anomaly_Score`` values summed per currency pair
        into ``Anomaly_Score_Agg``. Pairs with fewer rows sum what they have.

    Returns
    -------
    (anomaly_score, anomaly_stats_feature_score)
        ``anomaly_score``: one row per input row with the keys,
        ``Anomaly_Score``, ``Anomaly_Score_Agg`` and ``Trade_Timing``.
        ``anomaly_stats_feature_score``: one row per (input row, feature) with
        ``Features_for_Anomaly``, ``Feature_Stats``, ``Feature_Score``, the
        per-pair feature median ``Features_Stats_Meadian`` (column name kept
        as-is because it is part of the exported schema) and ``Trade_Timing``.
        Numeric values in both frames are rounded to 3 decimals.
    """
    key_columns = ['Currency_Pair', 'Year', 'Month', 'Day', 'Hour']

    # Attach the keys to a new frame instead of adding columns to the caller's object.
    feature_scores = df_feature_anomalies.assign(
        Year=df_anomalies.Year,
        Month=df_anomalies.Month,
        Day=df_anomalies.Day,
        Hour=df_anomalies.Hour,
        Currency_Pair=df_anomalies.Currency_Pair,
    )

    # Sum of the top-N scores per pair. Aggregating inside each group avoids
    # Series.sum(level=...), which was removed in pandas 2.0, and makes the
    # result shape independent of how many pairs are present, so the former
    # "fall back to the single largest score" branch is no longer needed.
    top_n_score = (
        df_anomalies.groupby('Currency_Pair')['Anomaly_Score']
        .agg(lambda scores: scores.nlargest(top_n).sum())
        .reset_index()
    )
    top_n_score.columns = ['Currency_Pair', 'Anomaly_Score_Agg']

    anomaly_score = df_anomalies.merge(top_n_score, how='left', on='Currency_Pair')[
        key_columns + ['Anomaly_Score', 'Anomaly_Score_Agg']
    ]

    # Long format: one row per (key, feature) for the raw values and for the scores.
    stats_long = df_anomalies.drop('Anomaly_Score', axis=1).melt(
        id_vars=list(key_columns), var_name="Features_for_Anomaly", value_name="Feature_Stats")
    scores_long = feature_scores.melt(
        id_vars=list(key_columns), var_name="Reasons_for_Anomaly", value_name="Feature_Score")

    # A running counter per key makes the index unique so that the n-th feature
    # value lines up with the n-th feature score when concatenating column-wise.
    stats_long = stats_long.set_index(key_columns + [stats_long.groupby(key_columns).cumcount()])
    scores_long = scores_long.set_index(key_columns + [scores_long.groupby(key_columns).cumcount()])
    merged_long = (pd.concat([stats_long, scores_long], axis=1)
                   .sort_index(level=2)
                   .reset_index(level=5, drop=True)  # drop the helper counter level
                   .reset_index())

    feature_stats_median = (
        merged_long.groupby(['Currency_Pair', 'Features_for_Anomaly'])
        .Feature_Stats.median()
        .reset_index()
    )
    feature_stats_median.columns = ['Currency_Pair', 'Features_for_Anomaly', 'Features_Stats_Meadian']

    anomaly_stats_feature_score = (
        merged_long
        .merge(feature_stats_median, on=['Currency_Pair', 'Features_for_Anomaly'], how='left')
        .drop('Reasons_for_Anomaly', axis=1)
        .round(3)
    )
    anomaly_score = anomaly_score.round(3)
    anomaly_stats_feature_score['Trade_Timing'] = trade_timing
    anomaly_score['Trade_Timing'] = trade_timing
    return anomaly_score, anomaly_stats_feature_score

In [24]:
# Score the non-sensitive extract, then reshape the results and label them
# with the normal-trading-period tag.
df_anomalies, df_feature_anomalies = train(
    x_train=data_non_sensitive,
    contamination_ratio=1.0,
)
anomaly_score_1, anomaly_stats_feature_score_1 = pivot_and_format_data(
    df_anomalies=df_anomalies,
    df_feature_anomalies=df_feature_anomalies,
    trade_timing='Normal Trading Time Periods',
)

  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))


In [25]:
# Score the sensitive extract, then reshape the results and label them with
# the price-sensitive-period tag.
df_anomalies, df_feature_anomalies = train(
    x_train=data_sensitive,
    contamination_ratio=1.0,
)
anomaly_score_2, anomaly_stats_feature_score_2 = pivot_and_format_data(
    df_anomalies=df_anomalies,
    df_feature_anomalies=df_feature_anomalies,
    trade_timing='Price Sensitive Trading Time Periods',
)

  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))
  * _average_path_length([self.max_samples_]))


In [26]:
# Stack the per-feature results of both runs row-wise (original row labels are kept).
anomaly_stats_feature_score = pd.concat(
    objs=(anomaly_stats_feature_score_1, anomaly_stats_feature_score_2),
    axis='index',
)

In [27]:
# Stack the per-row anomaly scores of both runs row-wise (original row labels are kept).
anomaly_score = pd.concat(
    objs=(anomaly_score_1, anomaly_score_2),
    axis='index',
)

In [28]:
# Label each input extract with the same Trade_Timing tag used for its results.
data_non_sensitive = data_non_sensitive.assign(Trade_Timing='Normal Trading Time Periods')
data_sensitive = data_sensitive.assign(Trade_Timing='Price Sensitive Trading Time Periods')

In [29]:
# Stack both labelled input extracts row-wise into a single frame.
data = pd.concat(
    objs=(data_non_sensitive, data_sensitive),
    axis='index',
)

In [30]:
# Write the combined per-feature results to CSV, without the row index.
anomaly_stats_feature_score.to_csv(
    path_or_buf=r'C:\NICE Documents\Bank of Indonesia\anomaly_stats_feature_score.csv',
    index=False,
)

In [31]:
# Write the combined per-row anomaly scores to CSV, without the row index.
anomaly_score.to_csv(
    path_or_buf=r'C:\NICE Documents\Bank of Indonesia\anomaly_score.csv',
    index=False,
)

In [32]:
# Write the combined input data, rounded to 3 decimals, to CSV without the row index.
data_rounded = data.round(decimals=3)
data_rounded.to_csv(
    path_or_buf=r'C:\NICE Documents\Bank of Indonesia\Feature_Stats.csv',
    index=False,
)

In [38]:
# Display the column dtypes of the combined per-feature results frame.
anomaly_stats_feature_score.dtypes

Currency_Pair              object
Year                        int64
Month                       int64
Day                         int64
Hour                        int64
Features_for_Anomaly       object
Feature_Stats             float64
Feature_Score             float64
Features_Stats_Meadian    float64
Trade_Timing               object
dtype: object

In [None]:
# `df_anomaly` is never defined in this notebook and would raise NameError;
# the scored frame returned by train() is named `df_anomalies`.
# Show only the first rows rather than dumping the whole frame.
df_anomalies.head()