# Library

In [2]:
import os
import json
import warnings
warnings.filterwarnings('ignore')
import datetime
import pandas as pd

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import numpy as np
import pickle
from sklearn.ensemble import IsolationForest
from numpy import array

# Configs

In [37]:
# Input files, relative to the notebook's working directory.
# path_main is loaded through the CSV branch of read_csv_excel.
path_main = "data/ml3yil.csv"
# path_franchise is loaded through the Excel branch of read_csv_excel, which
# renames its 'ilkverigonderimtarihi' column to 'd_r_start_date'.
path_franchise = "data/BayiIlkVeriGoderimTarihi.xlsx"

# Date bounds (YYYY-MM-DD strings) that split_dataset uses to slice the
# 'Tarih' index into the training window and the test window.
train_start = "2016-06-01"
train_end = "2019-12-31"
test_start = "2020-01-01"
test_end = "2020-03-19"

# Functions

In [38]:
######################################################################################################################
######################################################################################################################
#READ DATA
def read_csv_excel(path):
    """Load a data file into a DataFrame.

    Paths ending in ``.csv`` are read as ``;``-separated, ISO-8859-1 encoded
    text using the Python parser engine, skipping malformed lines. If the
    result is non-empty, its ``Tarih`` column is converted with
    ``convert_str_to_date``; if it is empty, a message is printed and the
    empty frame is returned unchanged.

    Any other path is read with ``pd.read_excel`` and the column
    ``ilkverigonderimtarihi`` is renamed to ``d_r_start_date``.

    Parameters
    ----------
    path : str
        Location of the file to read.

    Returns
    -------
    pandas.DataFrame
    """
    if path.endswith('.csv'):
        csv_options = dict(encoding="ISO-8859-1", sep=';', engine='python')
        try:
            # `on_bad_lines` replaces `error_bad_lines`, which newer pandas
            # releases no longer accept.
            data = pd.read_csv(path, on_bad_lines='skip', **csv_options)
        except TypeError:
            # Older pandas does not know `on_bad_lines`; use the legacy flag.
            data = pd.read_csv(path, error_bad_lines=False, **csv_options)
        if data.empty:
            print('CSV file is empty')
        else:
            data['Tarih'] = data['Tarih'].apply(convert_str_to_date)
    else:
        data = pd.read_excel(path).rename(columns={'ilkverigonderimtarihi': 'd_r_start_date'})
    return data

def delete_nan_franchise(data):
    """Remove dealers whose ``d_r_start_date`` equals the string ``'nan'``.

    Every row whose ``BayiKodu`` belongs to such a dealer is dropped, not only
    the row that carries the ``'nan'`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``BayiKodu`` and ``d_r_start_date`` columns.

    Returns
    -------
    tuple
        ``(filtered_frame, del_bayi)`` where ``del_bayi`` is the list of
        removed dealer codes (one entry per matching row).
    """
    missing_start = data.query("d_r_start_date == 'nan'")
    del_bayi = missing_start["BayiKodu"].to_list()
    # One vectorised membership test instead of re-filtering the whole frame
    # once per dealer code.
    data = data[~data["BayiKodu"].isin(del_bayi)]
    return data, del_bayi
        
def delete_nan_main(data,del_bayi):
    """Drop every row whose ``BayiKodu`` appears in ``del_bayi``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``BayiKodu`` column.
    del_bayi : list
        Dealer codes to remove; an empty list removes nothing.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that belong to none of the listed dealers. The
        input frame is not modified.
    """
    # One vectorised membership test instead of one full-frame filter per code.
    return data[~data["BayiKodu"].isin(del_bayi)]

def merge_bayi_franchise(data,franchise_Iot_starts):
    """Attach each dealer's start date and derive calendar features.

    Steps, in order:
      1. left-join ``d_r_start_date`` from ``franchise_Iot_starts`` on ``BayiKodu``;
      2. keep only rows with ``d_r_start_date <= Tarih``;
      3. keep a fixed set of columns and sort by ``BayiKodu`` then ``Tarih``;
      4. add ``isoweekday`` (1-7), ``week_parts`` (1 for isoweekday 1-5,
         otherwise 0), ``year``, ``month`` and ``day`` derived from ``Tarih``.

    Returns the resulting frame with a fresh 0..n-1 index.
    """
    kept_columns = ['BayiKodu','Tarih','KisiSayimi','BayiTipi','BAyiTuru','City','District','gpslocation','d_r_start_date','KS']
    start_dates = franchise_Iot_starts[['BayiKodu', 'd_r_start_date']]

    def to_isoweekday(stamp):
        # Monday is 1, Sunday is 7.
        return stamp.isoweekday()

    def to_week_part(iso_day):
        if iso_day in [1, 2, 3, 4, 5]:
            return 1
        return 0

    merged = pd.merge(data, start_dates, on='BayiKodu', how='left')
    merged = merged.query("d_r_start_date <= Tarih")
    merged = merged[kept_columns].reset_index(drop=True)
    merged = merged.sort_values(['BayiKodu', 'Tarih'], ascending=[True, True])
    merged = merged.reset_index(drop=True)

    merged['isoweekday'] = merged['Tarih'].apply(to_isoweekday)
    merged['week_parts'] = merged['isoweekday'].apply(to_week_part)

    # Calendar components of the observation date.
    tarih = merged['Tarih'].dt
    merged['year'] = tarih.year
    merged['month'] = tarih.month
    merged['day'] = tarih.day
    return merged

def convert_str_to_date(row):
    """Parse a date string into a ``datetime.datetime`` at midnight.

    Two layouts are recognised:

    * strings containing ``-``: the first ten characters are parsed as
      ``YYYY-MM-DD`` (anything after them, such as a time part, is ignored);
    * everything else is read as day, month, four-digit year separated by any
      non-digit characters, e.g. ``15.03.2019``, ``5.03.2019``, ``15.3.2019``,
      ``5.3.2019`` or ``15/03/2019``. Day and month may have one or two digits.

    Parameters
    ----------
    row : str
        The raw date text.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    ValueError
        If the text does not match either layout or is not a valid date.
    """
    if '-' in row:
        return datetime.datetime.strptime(str(row)[0:10], '%Y-%m-%d')

    # Split on every non-digit so the parse does not depend on fixed character
    # positions (which broke on single-digit months).
    digit_groups = "".join(ch if ch.isdigit() else " " for ch in row).split()
    if len(digit_groups) != 3 or len(digit_groups[2]) != 4:
        raise ValueError("Unsupported date string: %r" % (row,))
    day, month, year = (int(group) for group in digit_groups)
    return datetime.datetime(year, month, day)
    
def _rows_between(frame, start, end):
    """Return the rows of ``frame`` whose index lies in ``[start, end]``.

    ``start`` and ``end`` are date strings; each is expanded to the full
    period it names (``"2019-12-31"`` covers that whole day), matching pandas'
    partial-string slicing. A boolean mask is used instead of label slicing so
    the original row order is kept and an unsorted index is accepted.
    Assumes a timezone-naive DatetimeIndex.
    """
    lower = pd.Period(start).start_time
    upper = pd.Period(end).end_time
    in_window = (frame.index >= lower) & (frame.index <= upper)
    return frame[in_window]


def split_dataset(data):
    """Split ``data`` into training and test frames by date.

    The windows come from the module-level ``train_start``/``train_end`` and
    ``test_start``/``test_end`` settings. Training rows are additionally
    restricted to ``50 < KisiSayimi < 2500``; test rows are not filtered.

    Side effect: ``data`` is re-indexed by ``Tarih`` in place (kept because
    callers may rely on it). If it is already indexed by ``Tarih`` the step is
    skipped, so the function can be called again on the same frame.

    Returns
    -------
    tuple
        ``(train_df, test_df)``, both indexed by ``Tarih``.
    """
    if data.index.name != 'Tarih':
        data.set_index('Tarih', inplace=True)
    train_df = _rows_between(data, train_start, train_end)
    test_df = _rows_between(data, test_start, test_end)
    train_df = train_df.query("KisiSayimi >50 and KisiSayimi <2500")
    return train_df, test_df

######################################################################################################################
######################################################################################################################    
#REMOVE EMPTY VENDORS
def pivot_table_for_missing(data, row_start=250, row_stop=1288):
    """Pivot counts into a date x dealer table and keep a window of rows.

    The index of ``data`` is first turned back into a column, then the frame
    is pivoted so that rows are ``Tarih`` values, columns are ``BayiKodu``
    values and cells hold ``KisiSayimi``. Combinations with no observation
    become NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index (or columns) provide ``Tarih``, plus ``BayiKodu``
        and ``KisiSayimi`` columns. Each (Tarih, BayiKodu) pair must be unique.
    row_start, row_stop : int or None
        Positional row window kept from the pivoted table (``row_stop``
        exclusive). The defaults reproduce the previously hard-coded window
        ``250:1288``; pass ``0`` and ``None`` to keep every row.

    Returns
    -------
    pandas.DataFrame
    """
    wide = data.reset_index().pivot(index='Tarih', columns='BayiKodu', values='KisiSayimi')
    return wide.iloc[row_start:row_stop]


def missing_values_table(df):
    """Summarise missing values per column.

    Returns a frame indexed by the column names of ``df`` with two columns:
    ``Missing_Values`` (count of nulls) and ``Percent_Missing`` (share of
    nulls in percent, rounded to one decimal). Columns without any missing
    value are left out, and rows are ordered from most to least missing.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * df.isnull().sum() / len(df)

    summary = pd.DataFrame({
        'Missing_Values': null_counts,
        'Percent_Missing': null_share,
    })

    # Keep only the columns that actually have gaps.
    has_gaps = summary['Percent_Missing'] != 0
    summary = summary[has_gaps]

    summary = summary.sort_values('Percent_Missing', ascending=False)
    return summary.round(1)



def handling_missing_values(missing_values,train_df, max_missing=694):
    """Drop dealers that have too many missing observations.

    Parameters
    ----------
    missing_values : pandas.DataFrame
        Table with a ``Missing_Values`` column whose index is named
        ``BayiKodu`` (as produced by ``missing_values_table`` on the pivoted
        frame).
    train_df : pandas.DataFrame
        Training rows with a ``BayiKodu`` column.
    max_missing : int, optional
        Dealers with strictly more missing values than this are removed.
        Defaults to 694, the previously hard-coded threshold.

    Returns
    -------
    tuple
        ``(filtered_train_df, del_bayi)`` where ``del_bayi`` lists the removed
        dealer codes.
    """
    over_limit = missing_values[missing_values["Missing_Values"] > max_missing].reset_index()
    del_bayi = over_limit["BayiKodu"].to_list()
    # Single membership mask instead of one full-frame filter per dealer.
    train_df = train_df[~train_df["BayiKodu"].isin(del_bayi)]
    return train_df, del_bayi

######################################################################################################################
######################################################################################################################
#TRAIN DATASETS
def get_bayi_type_1(row):
    """Map a row to its dealer-segment label.

    For dealer kinds ``TTM`` and ``TTM_Sube`` the label is
    ``'ttt_and_ttm_Cadde'`` when ``BayiTipi`` is exactly ``'Cadde'`` and
    ``'ttt_and_ttm_AVM'`` for any other ``BayiTipi``. For every other dealer
    kind the kind itself is returned.

    The dealer kind is read from ``row['BayiTuru']``; if that key is absent,
    the spelling used by the frames in this notebook, ``'BAyiTuru'``, is used
    instead.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the dealer kind and, for TTM kinds, ``BayiTipi``.
    """
    bayi_turu = row['BayiTuru'] if 'BayiTuru' in row else row['BAyiTuru']
    if bayi_turu in ['TTM', 'TTM_Sube']:
        bayi_tipi = row['BayiTipi'] if row['BayiTipi'] == 'Cadde' else 'AVM'
        return 'ttt_and_ttm_' + bayi_tipi
    return bayi_turu



# Values of the 'BAyiTuru' and 'BayiTipi' columns that define the segments.
_TTM_KINDS = ['TTM', 'TTM_Sube']
_HQ_KINDS = ['TT_IL_HQ']
_CADDE = ['Cadde']
_AVM = ['AVM', 'Avm']  # both spellings are matched


def _select_segment(data, bayi_turu, week_part, bayi_tipi=None):
    """Return the rows of one dealer segment with a fresh 0..n-1 index.

    The index of ``data`` is first moved back into a column. Rows are kept
    when ``BAyiTuru`` is one of ``bayi_turu``, ``week_parts`` equals
    ``week_part`` (1 = weekday, 0 = weekend) and, if ``bayi_tipi`` is given,
    ``BayiTipi`` is one of ``bayi_tipi``.
    """
    frame = data.reset_index()
    keep = frame['BAyiTuru'].isin(bayi_turu) & (frame['week_parts'] == week_part)
    if bayi_tipi is not None:
        keep &= frame['BayiTipi'].isin(bayi_tipi)
    return frame[keep].reset_index(drop=True)


def ttt_and_ttm_sube_cadde_weekday_train(data):
    """TTM / TTM_Sube dealers of type Cadde, weekday rows."""
    return _select_segment(data, _TTM_KINDS, 1, _CADDE)

def ttt_and_ttm_sube_cadde_weekend_train(data):
    """TTM / TTM_Sube dealers of type Cadde, weekend rows."""
    return _select_segment(data, _TTM_KINDS, 0, _CADDE)

def avm_weekday_train(data):
    """TTM / TTM_Sube dealers of type AVM (either spelling), weekday rows."""
    return _select_segment(data, _TTM_KINDS, 1, _AVM)

def avm_weekend_train(data):
    """TTM / TTM_Sube dealers of type AVM (either spelling), weekend rows."""
    return _select_segment(data, _TTM_KINDS, 0, _AVM)

def tt_il_hq_weekday_train(data):
    """TT_IL_HQ dealers, weekday rows."""
    return _select_segment(data, _HQ_KINDS, 1)

def tt_il_hq_weekend_train(data):
    """TT_IL_HQ dealers, weekend rows."""
    return _select_segment(data, _HQ_KINDS, 0)

######################################################################################################################
######################################################################################################################
#TEST DATASETS
def ttt_and_ttm_sube_cadde_weekday_test(data):
    """TTM / TTM_Sube dealers of type Cadde, weekday rows."""
    return _select_segment(data, _TTM_KINDS, 1, _CADDE)

def ttt_and_ttm_sube_cadde_weekend_test(data):
    """TTM / TTM_Sube dealers of type Cadde, weekend rows."""
    return _select_segment(data, _TTM_KINDS, 0, _CADDE)

def avm_weekday_test(data):
    """TTM / TTM_Sube dealers of type AVM (either spelling), weekday rows."""
    return _select_segment(data, _TTM_KINDS, 1, _AVM)

def avm_weekend_test(data):
    """TTM / TTM_Sube dealers of type AVM (either spelling), weekend rows."""
    return _select_segment(data, _TTM_KINDS, 0, _AVM)

def tt_il_hq_weekday_test(data):
    """TT_IL_HQ dealers, weekday rows."""
    return _select_segment(data, _HQ_KINDS, 1)

def tt_il_hq_weekend_test(data):
    """TT_IL_HQ dealers, weekend rows."""
    return _select_segment(data, _HQ_KINDS, 0)

######################################################################################################################
######################################################################################################################
#LINEAR IMPUTATION FOR TRAIN DATA
def fill_linear_imputation_train(data):
    """Fill gaps by linear interpolation down the rows, then forward- and
    backward-fill whatever is still missing. Returns a new frame."""
    interpolated = data.interpolate(method='linear', axis=0)
    forward_filled = interpolated.ffill()
    return forward_filled.bfill()

######################################################################################################################
######################################################################################################################
#FILL NAN FOR TEST DATA
def fill_nan_test(data):
    """Return a copy of ``data`` in which every missing value is replaced by 0."""
    return data.fillna(value=0)


# Data preparation

In [39]:
######################################################################################################################
######################################################################################################################

# Load the main CSV and the franchise Excel file.
data_1 = read_csv_excel(path_main)

data_2 = read_csv_excel(path_franchise)

# Remove dealers whose d_r_start_date is the string 'nan' from the franchise
# frame, then remove the same dealer codes from the main frame.
data_franch,del_bayi = delete_nan_franchise(data_2)

data_main = delete_nan_main(data_1,del_bayi)

# Attach each dealer's start date, keep rows on or after it, and add the
# calendar feature columns.
df_new = merge_bayi_franchise(data_main,data_franch)

######################################################################################################################
######################################################################################################################

# Split by the configured date windows. Note that split_dataset re-indexes
# df_new by 'Tarih' in place.
train_df, test_df = split_dataset(df_new)

######################################################################################################################
######################################################################################################################

# Build a date x dealer table from the training rows and count the missing
# cells per dealer.
df = pivot_table_for_missing(train_df)

missing_values = missing_values_table(df)

# Drop dealers with too many missing values from the training rows.
# del_bayi is rebound here to the dealers removed by this step.
train_df,del_bayi = handling_missing_values(missing_values,train_df)

######################################################################################################################
######################################################################################################################

# One training frame per dealer segment and week part.
ttm_df_cadde_train_weekday = ttt_and_ttm_sube_cadde_weekday_train(train_df)

# NOTE: this name does not follow the pattern of its siblings; it holds the
# Cadde weekend training frame.
ttt_and_ttm_sube_cadde_weekend = ttt_and_ttm_sube_cadde_weekend_train(train_df)

ttm_df_avm_train_weekday = avm_weekday_train(train_df)

ttm_df_avm_train_weekend = avm_weekend_train(train_df)

tt_il_hq_df_train_weekday = tt_il_hq_weekday_train(train_df)

tt_il_hq_df_train_weekend = tt_il_hq_weekend_train(train_df)

######################################################################################################################
######################################################################################################################

# The matching test frame for each segment and week part.
ttm_df_cadde_test_weekday = ttt_and_ttm_sube_cadde_weekday_test(test_df)

ttm_df_cadde_test_weekend = ttt_and_ttm_sube_cadde_weekend_test(test_df)

ttm_df_avm_test_weekday = avm_weekday_test(test_df)

ttm_df_avm_test_weekend = avm_weekend_test(test_df)

tt_il_hq_df_test_weekday = tt_il_hq_weekday_test(test_df)

tt_il_hq_df_test_weekend = tt_il_hq_weekend_test(test_df)

######################################################################################################################
######################################################################################################################

# Training frames: fill gaps by linear interpolation, then forward/backward fill.
ttm_df_cadde_train_weekday = fill_linear_imputation_train(ttm_df_cadde_train_weekday)

ttt_and_ttm_sube_cadde_weekend = fill_linear_imputation_train(ttt_and_ttm_sube_cadde_weekend)

ttm_df_avm_train_weekday = fill_linear_imputation_train(ttm_df_avm_train_weekday)

ttm_df_avm_train_weekend = fill_linear_imputation_train(ttm_df_avm_train_weekend)

tt_il_hq_df_train_weekday = fill_linear_imputation_train(tt_il_hq_df_train_weekday)

tt_il_hq_df_train_weekend = fill_linear_imputation_train(tt_il_hq_df_train_weekend)

######################################################################################################################
######################################################################################################################

# Test frames: missing values are replaced with 0 instead of being interpolated.
ttm_df_cadde_test_weekday = fill_nan_test(ttm_df_cadde_test_weekday)

ttm_df_cadde_test_weekend = fill_nan_test(ttm_df_cadde_test_weekend)

ttm_df_avm_test_weekday = fill_nan_test(ttm_df_avm_test_weekday)

ttm_df_avm_test_weekend = fill_nan_test(ttm_df_avm_test_weekend)

tt_il_hq_df_test_weekday = fill_nan_test(tt_il_hq_df_test_weekday)

tt_il_hq_df_test_weekend = fill_nan_test(tt_il_hq_df_test_weekend)

# train.py

In [40]:
def iso_forest_train(df, num_of_trees,contamination,args):
    """Fit an IsolationForest on ``KisiSayimi`` and pickle it to disk.

    The model is trained on the single feature ``KisiSayimi`` and written to
    ``models_week/<args>_iso_forest.sav``; the directory is created when it
    does not exist yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows with a ``KisiSayimi`` column.
    num_of_trees : int
        Passed to IsolationForest as ``n_estimators``.
    contamination : float or str
        Passed to IsolationForest as ``contamination``.
    args : str
        Prefix of the model file name.

    Side effect: when ``df`` still has a ``Tarih`` column it is re-indexed by
    that column in place, as before. If the column is absent (for example
    because this function already ran on the same frame) the step is skipped
    instead of raising ``KeyError``.

    Returns
    -------
    None
    """
    if 'Tarih' in df.columns:
        df.set_index('Tarih',inplace=True)
    train = df[['KisiSayimi']].values

    it_model = IsolationForest(
        n_estimators=num_of_trees,
        max_samples='auto',
        contamination=contamination,
        bootstrap=False,
        n_jobs=-1,
        random_state=42,
        verbose=1,
    )
    it_model.fit(train)

    filename = 'models_week/'+args+'_iso_forest.sav'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # Context manager so the file is flushed and closed even if dump() fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(it_model, model_file)

In [41]:
# Each training frame paired with the prefix of the model file it is saved under.
training_jobs = [
    (ttm_df_cadde_train_weekday, "ttm_df_cadde_train_weekday"),
    (ttt_and_ttm_sube_cadde_weekend, "ttm_df_cadde_train_weekend"),
    (ttm_df_avm_train_weekday, "ttm_df_avm_train_weekday"),
    (ttm_df_avm_train_weekend, "ttm_df_avm_train_weekend"),
    (tt_il_hq_df_train_weekday, "TT_IL_HQ_df_train_weekday"),
    (tt_il_hq_df_train_weekend, "TT_IL_HQ_df_train_weekend"),
]

# Same hyper-parameters for every segment: 100 trees, contamination 0.1.
for segment_frame, model_name in training_jobs:
    iso_forest_train(segment_frame, num_of_trees=100, contamination=0.1, args=model_name)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


# predict.py

In [42]:
def iso_predict(test_df,args):
    """Score ``test_df`` with a pickled model from ``models_week/``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Rows to score; only the ``KisiSayimi`` column is passed to the model.
    args : str
        File name of the pickled model inside ``models_week/``.

    Returns
    -------
    tuple
        ``(test_df, scores, decision_scores, original_paper_score)`` where

        * ``scores`` is the model's ``score_samples`` output,
        * ``decision_scores`` is its ``decision_function`` output,
        * ``original_paper_score`` is ``-scores``.

    Side effects: the columns ``anomaly`` (the model's ``predict`` output),
    ``scores``, ``dscores`` and ``ops`` are added to ``test_df`` in place,
    and the file name and the ``anomaly`` value counts are printed.
    """
    filename = "models_week/"+args
    # NOTE(security): unpickling runs arbitrary code from the file; only load
    # model files produced by this project.
    with open(filename, 'rb') as model_file:
        clf_load = pickle.load(model_file)

    test = test_df[['KisiSayimi']].values

    pred = clf_load.predict(test)
    scores = clf_load.score_samples(test)
    decision_scores = clf_load.decision_function(test)

    # scikit-learn documents score_samples as the opposite of the anomaly
    # score of the original Isolation Forest paper, so the paper score is
    # -scores. The previous formula, 0.5 - decision_function, only matches it
    # when the model's offset_ is -0.5 (contamination='auto'); with a numeric
    # contamination the offset is data dependent and the result was shifted.
    original_paper_score = -np.asarray(scores)

    test_df['anomaly'] = pred
    test_df['scores'] = scores
    test_df['dscores'] = decision_scores
    test_df['ops'] = original_paper_score

    # The trailing 1 keeps the log line identical to earlier runs; the counter
    # it came from was reset on every call and therefore always printed 1.
    print("args",args,1)

    print(test_df['anomaly'].value_counts())
    print("*" * 100)

    return test_df, scores, decision_scores, original_paper_score

In [43]:
# Score every test segment with the model file written for the matching
# training segment ("<prefix>_iso_forest.sav" inside models_week/).
# NOTE: scores, decision_scores and original_paper_score are rebound on every
# call, so after this cell they only hold the values of the last segment;
# the per-segment values live in the columns of each returned frame.
result_df_ttm_cadde_weekday,scores,decision_scores,original_paper_score = iso_predict(ttm_df_cadde_test_weekday,args="ttm_df_cadde_train_weekday_iso_forest.sav")

result_df_ttm_cadde_weekend,scores,decision_scores,original_paper_score = iso_predict(ttm_df_cadde_test_weekend,args="ttm_df_cadde_train_weekend_iso_forest.sav")

# The two AVM results are stored under names without a "result_df_" prefix.
ttm_df_avm_weekday,scores,decision_scores,original_paper_score = iso_predict(ttm_df_avm_test_weekday,args="ttm_df_avm_train_weekday_iso_forest.sav")

ttm_df_avm_weekend,scores,decision_scores,original_paper_score = iso_predict(ttm_df_avm_test_weekend,args="ttm_df_avm_train_weekend_iso_forest.sav")

# The two TT_IL_HQ results are assigned back to the names of their input frames.
tt_il_hq_df_test_weekday,scores,decision_scores,original_paper_score = iso_predict(tt_il_hq_df_test_weekday,args="TT_IL_HQ_df_train_weekday_iso_forest.sav")

tt_il_hq_df_test_weekend,scores,decision_scores,original_paper_score = iso_predict(tt_il_hq_df_test_weekend,args="TT_IL_HQ_df_train_weekend_iso_forest.sav")


args ttm_df_cadde_train_weekday_iso_forest.sav 1
 1    23338
-1     6721
Name: anomaly, dtype: int64
****************************************************************************************************
args ttm_df_cadde_train_weekend_iso_forest.sav 1
 1    9052
-1    2546
Name: anomaly, dtype: int64
****************************************************************************************************
args ttm_df_avm_train_weekday_iso_forest.sav 1
 1    4543
-1    1232
Name: anomaly, dtype: int64
****************************************************************************************************
args ttm_df_avm_train_weekend_iso_forest.sav 1
 1    1766
-1     462
Name: anomaly, dtype: int64
****************************************************************************************************
args TT_IL_HQ_df_train_weekday_iso_forest.sav 1
 1    6296
-1    2824
Name: anomaly, dtype: int64
****************************************************************************************************
arg