In [1]:
## From the raw email data, prepare the grouped email-frequency feature data.
# Prepare the user, community, and peer data dictionaries.
# For each of them, calculate anomalies and plot.

#imports
import pandas as pd
from helper import pickle_store, pickle_restore
import dateutil
from adem import *
from collections import Counter 

def fetch_raw_data(csv_path="r4.2/email.csv", pickle_path="pickle/email_rawdf_file"):
    """Read the raw email CSV and cache it through ``pickle_store`` for faster reloads.

    Parameters
    ----------
    csv_path : str, optional
        File handed to ``pd.read_csv``. Defaults to the path this notebook
        originally hard-coded.
    pickle_path : str, optional
        Destination handed to ``pickle_store``. Defaults to the path this
        notebook originally hard-coded.

    Returns
    -------
    None
        The frame is only persisted; nothing is returned.
    """
    # Original author's note for the default CSV: 2629979 rows, 11 columns.
    email_raw_df = pd.read_csv(csv_path)

    # To work on a smaller sample (data till 1 April, per the original note),
    # slice the frame here:
    # email_raw_df = email_raw_df[:515658]

    # "raw" means the frame is stored as read, not indexed by a time index.
    pickle_store(pickle_path, email_raw_df)

# fetch_raw_data()

In [2]:
# Feature 1: email frequency. The function will ingest the raw data and return a dict ready to be fed to the anomaly detection module.
# The returned dictionary will be of the form d = {user: df}. User activity would only be taken for weekdays.
# Special case: what if an employee is on leave on some days? Then the activity for those days will be 0.
# Steps: 1. take the raw data and reduce the columns to the activity of interest; 2. group by user -- each user will have its own df now;
# 3. group by activity to calculate the frequency.

# Load the cached raw email frame; its 'date' column has not been converted to datetimes yet.
email_raw_df = pickle_restore("pickle/email_rawdf_file")

# Take an explicit copy of the three columns of interest so the column assignment
# below writes to an independent frame rather than to a selection of email_raw_df
# (assigning into such a selection is what triggers pandas' SettingWithCopyWarning).
email_freq_feature_df = email_raw_df[['user', 'date', 'from']].copy()

email_freq_feature_df['date'] = pd.to_datetime(email_freq_feature_df['date'])
# NOTE(review): start_d / end_d are not defined in this notebook -- presumably they
# come from `from adem import *`; confirm. Both bounds are inclusive.
mask = (email_freq_feature_df['date'] <= end_d) & (email_freq_feature_df['date'] >= start_d)
# .copy() so later cells can assign columns on the filtered frame without warnings.
email_freq_feature_df = email_freq_feature_df.loc[mask].copy()

print("done heavy part...")

# Independent copies of the filtered frame for the community and peer baselines.
email_freq_feature_cmnty_df = email_freq_feature_df.copy()
email_freq_feature_peer_df = email_freq_feature_df.copy()

# Tag every email row with the sender's community id and peer (role) id.
# Series.map over the single 'user' column replaces the row-wise DataFrame.apply;
# the lambda keeps the KeyError for a user that is missing from the mapping.
usr_cmnty_map = pickle_restore("pickle/community_louvian_file")
email_freq_feature_cmnty_df['cmnty'] = email_freq_feature_cmnty_df['user'].map(lambda user: usr_cmnty_map[user])

usr_peer_map = pickle_restore("pickle/eid_role_map_file")
email_freq_feature_peer_df['peer'] = email_freq_feature_peer_df['user'].map(lambda user: usr_peer_map[user])


done heavy part...


In [6]:
# Render the 'date' column of all three feature frames as strings again;
# the prep_*_dic functions parse this column themselves.
for _feature_frame in (
    email_freq_feature_df,
    email_freq_feature_cmnty_df,
    email_freq_feature_peer_df,
):
    _feature_frame['date'] = _feature_frame['date'].astype(str)


In [7]:

def prep_cbp_dic(email_freq_feature_cmnty_df, member_map=None):
    """Build the community baseline: average daily email count per community member.

    Parameters
    ----------
    email_freq_feature_cmnty_df : pandas.DataFrame
        One row per email with at least the columns 'cmnty' and 'date'.
        'date' may hold date strings or datetimes. The frame is not modified.
    member_map : dict, optional
        Mapping user -> community id, used only to count the members of each
        community. Defaults to the notebook-level ``usr_cmnty_map``.

    Returns
    -------
    dict
        ``{community id: DataFrame}`` with columns ['cmnty', 'date', 'email_freq'].
        'date' holds ``datetime.date`` values and 'email_freq' is the number of
        rows of that community on that day divided by the community's member count.

    Raises
    ------
    ZeroDivisionError
        If a community present in the data has no member in the mapping.
    """
    if member_map is None:
        # Fall back to the mapping loaded at notebook level.
        member_map = usr_cmnty_map

    # Work on a private two-column copy so the caller's frame stays untouched.
    emails = email_freq_feature_cmnty_df[['cmnty', 'date']].copy()
    # Parse and truncate to the calendar day. pd.to_datetime reads ISO
    # 'YYYY-MM-DD hh:mm:ss' strings as year-month-day; the former
    # dateutil.parser.parse(..., dayfirst=True) read them as year-day-month
    # whenever the day was <= 12, swapping month and day.
    emails['date'] = pd.to_datetime(emails['date']).dt.date

    # Number of email rows per (community, day).
    daily = emails.groupby(['cmnty', 'date']).size().reset_index(name='email_freq')

    # Turn the community total into a per-member average.
    cmnty_len = Counter(member_map.values())
    group_size = daily['cmnty'].map(lambda cmnty: cmnty_len[cmnty])
    if (group_size == 0).any():
        # Keep the failure mode of the former row-wise division.
        raise ZeroDivisionError("a community present in the data has no members in the mapping")
    daily['email_freq'] = daily['email_freq'] / group_size

    # Group by the scalar key (not a one-element list): pandas >= 2 yields
    # 1-tuple keys for a list grouper, which would break lookups by community id.
    return {cmnty: frame for cmnty, frame in daily.groupby('cmnty')}

def prep_pbp_dic(email_freq_feature_peer_df, member_map=None):
    """Build the peer baseline: average daily email count per peer-group member.

    Parameters
    ----------
    email_freq_feature_peer_df : pandas.DataFrame
        One row per email with at least the columns 'peer' and 'date'.
        'date' may hold date strings or datetimes. The frame is not modified.
    member_map : dict, optional
        Mapping user -> peer-group id, used only to count the members of each
        peer group. Defaults to the notebook-level ``usr_peer_map``.

    Returns
    -------
    dict
        ``{peer-group id: DataFrame}`` with columns ['peer', 'date', 'email_freq'].
        'date' holds ``datetime.date`` values and 'email_freq' is the number of
        rows of that peer group on that day divided by the group's member count.

    Raises
    ------
    ZeroDivisionError
        If a peer group present in the data has no member in the mapping.
    """
    if member_map is None:
        # Fall back to the mapping loaded at notebook level.
        member_map = usr_peer_map

    # Work on a private two-column copy so the caller's frame stays untouched.
    emails = email_freq_feature_peer_df[['peer', 'date']].copy()
    # Parse and truncate to the calendar day. pd.to_datetime reads ISO
    # 'YYYY-MM-DD hh:mm:ss' strings as year-month-day; the former
    # dateutil.parser.parse(..., dayfirst=True) read them as year-day-month
    # whenever the day was <= 12, swapping month and day.
    emails['date'] = pd.to_datetime(emails['date']).dt.date

    # Number of email rows per (peer group, day).
    daily = emails.groupby(['peer', 'date']).size().reset_index(name='email_freq')

    # Turn the peer-group total into a per-member average.
    peer_len = Counter(member_map.values())
    group_size = daily['peer'].map(lambda peer: peer_len[peer])
    if (group_size == 0).any():
        # Keep the failure mode of the former row-wise division.
        raise ZeroDivisionError("a peer group present in the data has no members in the mapping")
    daily['email_freq'] = daily['email_freq'] / group_size

    # Group by the scalar key (not a one-element list): pandas >= 2 yields
    # 1-tuple keys for a list grouper, which would break lookups by peer id.
    return {peer: frame for peer, frame in daily.groupby('peer')}

def prep_ubp_dic(email_freq_feature_df):
    """Build the per-user baseline: number of emails per user and calendar day.

    Parameters
    ----------
    email_freq_feature_df : pandas.DataFrame
        One row per email with at least the columns 'user' and 'date'.
        'date' may hold date strings or datetimes. The frame is not modified.

    Returns
    -------
    dict
        ``{user: DataFrame}`` with columns ['user', 'date', 'email_freq'].
        'date' holds ``datetime.date`` values and 'email_freq' is the number of
        rows of that user on that day. Days without any row are not present.
    """
    # Work on a private two-column copy so the caller's frame stays untouched.
    emails = email_freq_feature_df[['user', 'date']].copy()
    # Parse and truncate to the calendar day. pd.to_datetime reads ISO
    # 'YYYY-MM-DD hh:mm:ss' strings as year-month-day; the former
    # dateutil.parser.parse(..., dayfirst=True) read them as year-day-month
    # whenever the day was <= 12, swapping month and day.
    emails['date'] = pd.to_datetime(emails['date']).dt.date

    # Number of email rows per (user, day).
    daily = emails.groupby(['user', 'date']).size().reset_index(name='email_freq')

    # Group by the scalar key (not a one-element list): pandas >= 2 yields
    # 1-tuple keys for a list grouper, which would break lookups by user name.
    return {user: frame for user, frame in daily.groupby('user')}

In [None]:

# #UBP anom calculation and plot generation:
# # 1. Take prepared feature dataframe and 
# # 2. index by date
# # 3. populate missing dates
# # 4. get X and Y and call ADEM.

# def anom_calc_ubp(df, feature_name, uname, ws=10, sig=3):
#     # index email data by date
#     df['date'] = pd.to_datetime(df['date'])
#     df.set_index('date', inplace=True)

#     #Fill the missing dates and corresponding feature values as 0.
#     df = df.resample('D').sum().fillna(0)


#     X = df.index.to_series() #debug_df['date'] # data_as_frame['Months']
#     Y = df[feature_name] #data_as_frame['SunSpots']

#     # # plot the results
#     # plot_results(X, y=Y, window_size=20, text_xlabel="Months", sigma_value=3, text_ylabel="No. of Sun spots")

#     events = explain_anomalies(X, Y, window_size=ws, sigma=sig)
#     # Display the anomaly dict
#     print("Information about the anomalies model:{}".format(events))
#     print()

#     events_rolling = explain_anomalies_rolling_std(X, Y, window_size=ws, sigma=sig)
#     # Display the anomaly dict
#     print("Information about the anomalies model rolling:{}".format(events_rolling))
#     print()

#     #####################################################################################################

#     plot_res (events, Y, baseline_type='UBP', feature_type=feature_name, user_name=uname)
#     # plot_res (events_rolling, Y, baseline_type='UBP-rolling', feature_type=feature_name)

# # email_freq_feature_dic.keys()
# # Feature data format:  user: df [user, date, featurecolmn]
# def anom_calc_cbp(df, df_cmnty, feature_name, uname, ws=10, sig=3):
#     df_cmnty['date'] = pd.to_datetime(df_cmnty['date'])
#     df_cmnty.set_index('date', inplace=True)
#     #Fill the missing dates and corresponding feature values as 0.
#     df_cmnty = df_cmnty.resample('D').sum().fillna(0)

#     baseline_df = df_cmnty
    
#     # def anom_calc_cbp(df, baseline_df, feature_name, uname):
#     # index email data by date
#     df['date'] = pd.to_datetime(df['date'])
#     df.set_index('date', inplace=True)

#     #Fill the missing dates and corresponding feature values as 0.
#     df = df.resample('D').sum().fillna(0)

#     #assuming baseline_x is date indexed and missed dates already filled
#     X = baseline_df[feature_name] #debug_df['date'] # data_as_frame['Months']
#     Y = df[feature_name] #data_as_frame['SunSpots']

#     # # plot the results
#     # plot_results(X, y=Y, window_size=20, text_xlabel="Months", sigma_value=3, text_ylabel="No. of Sun spots")

#     events = explain_anomalies_cmnty(X, Y, window_size=ws, sigma=sig)
#     # Display the anomaly dict
#     print("Information about the anomalies model:{}".format(events))
#     print()

#     events_rolling = explain_anomalies_rolling_std_cmnty(X, Y, window_size=ws, sigma=sig)
#     # Display the anomaly dict
#     print("Information about the anomalies model rolling:{}".format(events_rolling))
#     print()

#     #####################################################################################################

#     plot_res_cmnty (events, X, Y, baseline_type='CBP', feature_type=feature_name, user_name=uname)
#     # plot_res (events_rolling, Y, baseline_type='UBP-rolling', feature_type=feature_name)

# # email_freq_feature_dic.keys()
# # Feature data format:  user: df [user, date, featurecolmn]
# def anom_calc_pbp(df, df_peer, feature_name, uname, ws=10, sig=3):
#     df_peer['date'] = pd.to_datetime(df_peer['date'])
#     df_peer.set_index('date', inplace=True)
#     #Fill the missing dates and corresponding feature values as 0.
#     df_peer = df_peer.resample('D').sum().fillna(0)

#     # anom_calc_cbp(df, df_cmnty, 'email_freq', 'AAM0658')

#     baseline_df = df_peer

#     # def anom_calc_cbp(df, baseline_df, feature_name, uname):
#     # index email data by date
#     df['date'] = pd.to_datetime(df['date'])
#     df.set_index('date', inplace=True)

#     #Fill the missing dates and corresponding feature values as 0.
#     df = df.resample('D').sum().fillna(0)

#     #assuming baseline_x is date indexed and missed dates already filled
#     X = baseline_df[feature_name] #debug_df['date'] # data_as_frame['Months']
#     Y = df[feature_name] #data_as_frame['SunSpots']

#     # # plot the results
#     # plot_results(X, y=Y, window_size=20, text_xlabel="Months", sigma_value=3, text_ylabel="No. of Sun spots")

#     events = explain_anomalies_cmnty(X, Y, window_size=ws, sigma=sig)
#     # Display the anomaly dict
#     print("Information about the anomalies model:{}".format(events))
#     print()

#     events_rolling = explain_anomalies_rolling_std_cmnty(X, Y, window_size=ws, sigma=sig)
#     # Display the anomaly dict
#     print("Information about the anomalies model rolling:{}".format(events_rolling))
#     print()

#     #####################################################################################################

#     plot_res_cmnty (events, X, Y, baseline_type='PBP', feature_type=feature_name, user_name=uname)
#     # plot_res (events_rolling, Y, baseline_type='UBP-rolling', feature_type=feature_name)


In [8]:

# The three frames are ready to be grouped by [user, date], [peer, date] and [cmnty, date].
email_freq_feature_dic = prep_ubp_dic(email_freq_feature_df)
email_freq_feature_peer_dic = prep_pbp_dic(email_freq_feature_peer_df)
email_freq_feature_cmnty_dic = prep_cbp_dic(email_freq_feature_cmnty_df)

# Persist the three feature dictionaries (user, peer, community -- in that order).
for _pickle_path, _feature_dic in (
    ("pickle/f1/f1_ubp_file", email_freq_feature_dic),
    ("pickle/f1/f1_pbp_file", email_freq_feature_peer_dic),
    ("pickle/f1/f1_cbp_file", email_freq_feature_cmnty_dic),
):
    pickle_store(_pickle_path, _feature_dic)



In [None]:
# Reload the three feature dictionaries (user, peer, community) stored by the previous cell.
(
    email_freq_feature_dic,
    email_freq_feature_peer_dic,
    email_freq_feature_cmnty_dic,
) = [
    pickle_restore(_pickle_path)
    for _pickle_path in (
        "pickle/f1/f1_ubp_file",
        "pickle/f1/f1_pbp_file",
        "pickle/f1/f1_cbp_file",
    )
]


In [None]:
# Only needed by the optional early-exit limiter used while debugging
# (count += 1; break every N users); not read by the loops below.
count = 0

# NOTE(review): anom_calc_pbp / anom_calc_cbp / anom_calc_ubp are not defined in this
# notebook -- presumably they come from `from adem import *`; confirm.
# Every call receives copies of the prepared frames, so the cached feature
# dictionaries keep their original content whatever the callee does with them.

# Peer baseline: each user's daily frequency against the user's peer group.
print('PBP')
pbp_events_dic = {}
for usr, usr_df in email_freq_feature_dic.items():
    peer_df = email_freq_feature_peer_dic[usr_peer_map[usr]]
    anom_ev = anom_calc_pbp(usr_df.copy(), peer_df.copy(), 'email_freq', usr, ws=10, sig=3)
    # Keep only users for which at least one anomaly was reported.
    if len(anom_ev['anomalies_dict']) > 0:
        pbp_events_dic[usr] = anom_ev

# Community baseline: each user's daily frequency against the user's community.
print('CBP')
cbp_events_dic = {}
for usr, usr_df in email_freq_feature_dic.items():
    cmnty_df = email_freq_feature_cmnty_dic[usr_cmnty_map[usr]]
    anom_ev = anom_calc_cbp(usr_df.copy(), cmnty_df.copy(), 'email_freq', usr, ws=10, sig=3)
    if len(anom_ev['anomalies_dict']) > 0:
        cbp_events_dic[usr] = anom_ev

# User baseline: each user's daily frequency against the user's own history.
# Feature data format: user: df [user, date, feature column]
print('UBP')
ubp_events_dic = {}
for usr, usr_df in email_freq_feature_dic.items():
    anom_ev = anom_calc_ubp(usr_df.copy(), 'email_freq', usr, ws=10, sig=3)
    if len(anom_ev['anomalies_dict']) > 0:
        ubp_events_dic[usr] = anom_ev

In [14]:
# Post-process the anomalies: users whose anomalies all have an email frequency of 0
# are meant to be trimmed, as those are false positives.
# NOTE(review): get_anom_list is not defined in this notebook; presumably it performs
# that filtering (a user is kept as soon as one anomaly value is non-zero) -- confirm.
# Concatenate the users reported by the user, peer and community baselines, in that order.
f1_anom_user_lst = [
    *get_anom_list(ubp_events_dic),
    *get_anom_list(pbp_events_dic),
    *get_anom_list(cbp_events_dic),
]


# for anom_usr in pbp_events_dic:
#     a_ev = pbp_events_dic[anom_usr]
#     print('dict a_ev=', a_ev['anomalies_dict'])
#     for k, v in a_ev['anomalies_dict'].items():
#         if v != 0:
#             f1_anom_user_lst.append(anom_usr)
#             break

# for anom_usr in cbp_events_dic:
#     a_ev = cbp_events_dic[anom_usr]
#     print('dict a_ev=', a_ev['anomalies_dict'])
#     for k, v in a_ev['anomalies_dict'].items():
#         if v != 0:
#             f1_anom_user_lst.append(anom_usr)
#             break


In [19]:
# De-duplicate once and reuse the set for both the count and the pickle.
f1_anom_user_set = set(f1_anom_user_lst)

# A bare expression is only echoed when it is the last statement of a cell,
# so the number of distinct anomalous users is printed explicitly.
print(len(f1_anom_user_set))

# Save the set of anomalous users.
pickle_store("pickle/f1/f1_anom_set_file", f1_anom_user_set)
