In [1]:
#imports
import pandas as pd
from helper import pickle_store, pickle_restore
import dateutil
from adem import *
from collections import Counter 

# One-off conversion: parse the raw file-activity CSV and cache it with
# pickle_store so later cells can reload it quickly instead of re-parsing.
# Author's note on the CSV: 2629979 rows, 11 columns.
raw_csv_path = "r4.2/file.csv"
raw_cache_path = "pickle/file_rawdf_file"

file_raw_df = pd.read_csv(raw_csv_path)
# "raw" means not indexed by Timeindex; no date filtering is applied in this cell.
pickle_store(raw_cache_path, file_raw_df)

In [2]:
# Reload the cached raw file-activity table written by the first cell.
file_raw_df = pickle_restore("pickle/file_rawdf_file")

# Take an explicit copy of the needed columns so the assignments below write to
# an independent frame rather than to a slice of file_raw_df (this is what
# triggers pandas' SettingWithCopyWarning).
dev_act_feature_df = file_raw_df[['user', 'date', 'pc', 'filename']].copy()
dev_act_feature_df['date'] = pd.to_datetime(dev_act_feature_df['date'])

# Keep only the events inside [start_d, end_d], both bounds inclusive.
# NOTE(review): start_d / end_d are not defined in any visible cell -- presumably
# they come from `from adem import *` or from a removed cell; confirm.
mask = (dev_act_feature_df['date'] <= end_d) & (dev_act_feature_df['date'] >= start_d)
dev_act_feature_df = dev_act_feature_df.loc[mask].copy()

# Separate copies of the filtered frame so the user (UBP), peer (PBP) and
# community (CBP) variants can be processed independently.
dev_act_feature_cmnty_df = dev_act_feature_df.copy()
dev_act_feature_peer_df = dev_act_feature_df.copy()

# Tag every event with the community id of its user.
# An element-wise lookup on the 'user' column replaces the row-wise
# DataFrame.apply; a user missing from the map still raises KeyError.
usr_cmnty_map = pickle_restore("pickle/community_louvian_file")
dev_act_feature_cmnty_df['cmnty'] = dev_act_feature_cmnty_df['user'].map(
    lambda user: usr_cmnty_map[user]
)

# Tag every event with the peer-group id of its user.
usr_peer_map = pickle_restore("pickle/eid_role_map_file")
dev_act_feature_peer_df['peer'] = dev_act_feature_peer_df['user'].map(
    lambda user: usr_peer_map[user]
)


In [4]:
# The 'date' columns were converted to datetime earlier (unlike the email
# data, per the original note). Cast them back to plain strings on all three
# frames.
for frame in (dev_act_feature_df, dev_act_feature_cmnty_df, dev_act_feature_peer_df):
    frame['date'] = frame['date'].astype(str)

In [5]:
# The data is now ready to build the per-group feature dictionaries with groupby (UBP, CBP, PBP). Feature column = num_filecopy
def prep_ubp_dic(df):
    """Build the per-user (UBP) daily file-copy feature tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user``, ``date`` and ``filename``.
        ``date`` may hold date/time strings or datetime values.
        The frame passed in is not modified.

    Returns
    -------
    dict
        Maps each user id to a DataFrame with the columns ``user``, ``date``
        (``datetime.date`` values) and ``num_filecopy`` (the number of
        non-null ``filename`` entries of that user on that day).
    """
    # Truncate every timestamp to its calendar day (drops hh:mm:ss). assign()
    # returns a new frame, so the caller's data stays untouched.
    daily = df.assign(date=pd.to_datetime(df['date']).dt.date)

    # One row per (user, day) holding the number of file events.
    counts = (
        daily.groupby(['user', 'date'])['filename']
        .count()
        .to_frame('num_filecopy')
        .reset_index()
    )

    # Group by a scalar key, not a one-element list: with a list key pandas >= 2
    # yields 1-tuples such as ('AAM0658',) as group names, which would break
    # later lookups by plain user id.
    return {user: user_rows for user, user_rows in counts.groupby('user')}


def prep_cbp_dic(df, cmnty_map=None):
    """Build the per-community (CBP) daily file-copy feature tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``cmnty``, ``date`` and ``filename``.
        ``date`` may hold date/time strings or datetime values.
        The frame passed in is not modified.
    cmnty_map : mapping, optional
        user id -> community id. It is used only to derive the number of
        members of each community. Defaults to the notebook-level
        ``usr_cmnty_map``.

    Returns
    -------
    dict
        Maps each community id to a DataFrame with the columns ``cmnty``,
        ``date`` (``datetime.date`` values) and ``num_filecopy`` (that day's
        file-event count of the community divided by its member count).

    Raises
    ------
    KeyError
        If ``df`` contains a community id that no user in ``cmnty_map``
        belongs to (its member count would be zero).
    """
    if cmnty_map is None:
        cmnty_map = usr_cmnty_map

    # Truncate every timestamp to its calendar day (drops hh:mm:ss). assign()
    # returns a new frame, so the caller's data stays untouched.
    daily = df.assign(date=pd.to_datetime(df['date']).dt.date)

    # One row per (community, day) holding the number of file events.
    counts = (
        daily.groupby(['cmnty', 'date'])['filename']
        .count()
        .to_frame('num_filecopy')
        .reset_index()
    )

    # Number of users in each community.
    member_counts = Counter(cmnty_map.values())
    unknown = set(counts['cmnty']) - set(member_counts)
    if unknown:
        raise KeyError(
            "communities missing from cmnty_map: {}".format(sorted(unknown, key=str))
        )

    # Turn the community total into a per-member average (vectorized).
    counts['num_filecopy'] = counts['num_filecopy'] / counts['cmnty'].map(member_counts)

    # Group by a scalar key, not a one-element list: with a list key pandas >= 2
    # yields 1-tuples as group names, which would break lookups by community id.
    return {cmnty: cmnty_rows for cmnty, cmnty_rows in counts.groupby('cmnty')}

def prep_pbp_dic(df, peer_map=None):
    """Build the per-peer-group (PBP) daily file-copy feature tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``peer``, ``date`` and ``filename``.
        ``date`` may hold date/time strings or datetime values.
        The frame passed in is not modified.
    peer_map : mapping, optional
        user id -> peer-group id. It is used only to derive the number of
        members of each peer group. Defaults to the notebook-level
        ``usr_peer_map``.

    Returns
    -------
    dict
        Maps each peer-group id to a DataFrame with the columns ``peer``,
        ``date`` (``datetime.date`` values) and ``num_filecopy`` (that day's
        file-event count of the peer group divided by its member count).

    Raises
    ------
    KeyError
        If ``df`` contains a peer-group id that no user in ``peer_map``
        belongs to (its member count would be zero).
    """
    if peer_map is None:
        peer_map = usr_peer_map

    # Truncate every timestamp to its calendar day (drops hh:mm:ss). assign()
    # returns a new frame, so the caller's data stays untouched.
    daily = df.assign(date=pd.to_datetime(df['date']).dt.date)

    # One row per (peer group, day) holding the number of file events.
    counts = (
        daily.groupby(['peer', 'date'])['filename']
        .count()
        .to_frame('num_filecopy')
        .reset_index()
    )

    # Number of users in each peer group.
    peer_sizes = Counter(peer_map.values())
    unknown = set(counts['peer']) - set(peer_sizes)
    if unknown:
        raise KeyError(
            "peer groups missing from peer_map: {}".format(sorted(unknown, key=str))
        )

    # Turn the peer-group total into a per-member average (vectorized).
    counts['num_filecopy'] = counts['num_filecopy'] / counts['peer'].map(peer_sizes)

    # Group by a scalar key, not a one-element list: with a list key pandas >= 2
    # yields 1-tuples as group names, which would break lookups by peer-group id.
    return {peer: peer_rows for peer, peer_rows in counts.groupby('peer')}



In [6]:
# Build the three feature dictionaries. Every prep_* call gets its own copy of
# the corresponding frame, so the shared frames are left as they are.
ubp_dev_act_feature_df = dev_act_feature_df.copy()
ubp_dev_act_feature_dic = prep_ubp_dic(ubp_dev_act_feature_df)

cbp_dev_act_feature_df = dev_act_feature_cmnty_df.copy()
cbp_dev_act_feature_dic = prep_cbp_dic(cbp_dev_act_feature_df)

pbp_dev_act_feature_df = dev_act_feature_peer_df.copy()
pbp_dev_act_feature_dic = prep_pbp_dic(pbp_dev_act_feature_df)

# Persist the dictionaries for feature f10 (same order as before: UBP, PBP, CBP).
for store_path, feature_dic in (
    ("pickle/f10/f10_ubp_file", ubp_dev_act_feature_dic),
    ("pickle/f10/f10_pbp_file", pbp_dev_act_feature_dic),
    ("pickle/f10/f10_cbp_file", cbp_dev_act_feature_dic),
):
    pickle_store(store_path, feature_dic)



In [7]:
# Run the three anomaly calculations (PBP, CBP, UBP) for every user that has a
# UBP feature table. Feature data format: user -> DataFrame [user, date, feature column].
count = 0  # only referenced by early-exit debugging that is currently disabled
feature_name = 'num_filecopy'

print('PBP')
for usr, usr_feature_df in ubp_dev_act_feature_dic.items():
    peer_feature_df = pbp_dev_act_feature_dic[usr_peer_map[usr]]

    # pass deep copies rather than the stored feature frames
    df = usr_feature_df.copy()
    df_peer = peer_feature_df.copy()

    print(usr)
    anom_calc_pbp(df, df_peer, feature_name, usr, ws=10, sig=3)

print('CBP')
for usr, usr_feature_df in ubp_dev_act_feature_dic.items():
    cmnty_feature_df = cbp_dev_act_feature_dic[usr_cmnty_map[usr]]

    # pass deep copies rather than the stored feature frames
    df = usr_feature_df.copy()
    df_cmnty = cmnty_feature_df.copy()

    print(usr)
    anom_calc_cbp(df, df_cmnty, feature_name, usr, ws=10, sig=3)

print('UBP')
for usr, usr_feature_df in ubp_dev_act_feature_dic.items():
    # pass a deep copy rather than the stored feature frame
    df = usr_feature_df.copy()

    print(usr)
    anom_calc_ubp(df, feature_name, usr, ws=10, sig=3)

PBP
AAF0535
AAM0658
ABC0174
AHD0848
AHM0410
AIB0948
AIP0982
AJD0074
AJF0370
AJH0175
AJN0607
AJR0932
AKR0057
AMR0318
AOK0844
ATE0869
ATP0662
AVM0947
BAJ0654
BAL0044
BBG0325
BBS0039
BBS0422
BDI0533
BDV0168
BEH0615
BIH0745
BIS0247
BJM0111
BJP0134
BLS0678
BMG0917
BMT0528
BRB0355
BRM0995
BSS0369
BTL0226
BTR0403
BVC0790
BWP0202
CAA0612
CAB0614
CAH0936
CAM0681
CAS0507
CCA0046
CCL0068
CCM0136
CEJ0109
CFM0767
CGB0637
CHM0821
CIM0271
CLB0774
CPG0322
CQH0701
CQW0652
CSC0217
CTA0020
CWR0502
DAL0673
DBB0384
DFH0188
DIB0081
DIB0285
DIW0118
DLM0051
DMK0257
DPM0423
DRR0162
EDB0714
EGD0132
EHB0824
EHD0584
EIS0041
EMW0772
ESH0283
ESJ0670
ESR0693
FEB0306
FFC0891
FKK0055
FMG0527
FOB0756
FSC0601
FTM0406
GCL0016
GCS0571
GHH0288
GHL0460
GKO0078
GNS0178
GTD0219
GZC0735
HAD0246
HAH0760
HBO0413
HCL0651
HCM0267
HCS0003
HDH0928
HDS0367
HFB0347
HJB0742
HPH0075
HRB0351
HRL0540
HSB0196
HVF0067
HWW0436
HXL0968
IBB0359
IBB0696
IBM0671
IBS0836
ICH0294
IIW0249
IJM0444
IJM0776
IKP0472
IKR0401
ILH0958
ILJ0526
IRM0931
IUB0