In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import OrderedDict, defaultdict
import os
import pyfpgrowth;
from efficient_apriori import apriori

datapath = os.getcwd()
log_data_path = datapath + "/data" #all log data should kept in this folder
result_save_path = datapath + "/result" # all generated txt result save here
if not os.path.exists(result_save_path):
    os.makedirs(result_save_path)

In [2]:
chrome_log = log_data_path + "/Log.txt.strace"
data = pd.read_csv(chrome_log , sep=" ",header=None,names=["Name","Junk1","Junk2","DateTime","Permission","FilePath","Received","BytesAccessed"])

In [3]:
data=data.drop(['Junk1','Junk2'], axis=1)

In [4]:
data['DateTimePermission'] = data['DateTime'].str.cat(data['Permission'])

In [5]:
data=data.drop(['DateTime','Permission'], axis=1)

In [6]:
data['FilePath']=data['FilePath'].str.replace('GET ','')

In [7]:
data['DateTimePermission']=data['DateTimePermission'].str.replace('-0700','')

In [8]:
data=data.rename(index=str,columns={"DateTimePermission":"DateTime"})

In [9]:
data['DateTime'] =  pd.to_datetime(data['DateTime'], format='[%d/%b/%Y:%H:%M:%S]')

In [10]:
"""
Pass the DF before any preprocessing to generate reply log and uncluster baseline txt
"""
def createReplyLog_And_BaseLine(dataFrame):
    replay_Log_Name = os.path.join(result_save_path, "replay_log_user.txt")
    freplay = open(replay_Log_Name,"w+")  
    for file in dataFrame.FilePath:
        freplay.write(file+"\n")
    freplay.close()

    #Write the cluster.index_base :: all unique filePath as unclustered: As baseline to compare our results
    baseCluster_File_Name = os.path.join(result_save_path, "cluster.index_baseline_user.txt")    
    fbaseCluster = open(baseCluster_File_Name,"w+")
    fbaseCluster.write("Unclustered Files:\n")
    for file in data.FilePath.unique():
        if file=="":
            continue
        fbaseCluster.write(file+"\n")

    fbaseCluster.close()

In [11]:
#reply Log and baseline cluster need to be created only Once, If u need uncomment below and run
#createReplyLog_And_BaseLine(data)

In [12]:
data=data.drop(['Name','Received'],axis=1)

In [13]:
print (data.shape)
data=data.drop_duplicates(keep='last') 
print (data.shape)

(687405, 3)
(560449, 3)


In [14]:
#calculating file length based on '/'
datatemp=pd.Series()
datatemp=data.assign(FileLength=data.FilePath.str.count('/'))
datatemp=datatemp.assign(FileLength=datatemp.FileLength-1)

In [15]:
#pre-processed files without a path;example there were many files which had paths like ''
datatemp=datatemp[datatemp.FileLength != -1]
datatemp=datatemp[datatemp.FileLength!=0]

print ("Number of rows in DF:",datatemp.FilePath.size)

Number of rows in DF: 560312


In [16]:
FrequencyDict=datatemp.FilePath.value_counts()

In [17]:
FrequencyDict=FrequencyDict.to_dict()

In [18]:
#calculating the frwquency of each file
def convertFrequencyDict(columns,FrequencyDict):
    frequency=str(columns[0])
    try:
        Frequency=FrequencyDict[frequency]
    except Exception as e:
        Frequency=0
    return Frequency

In [19]:
datatemp['Frequency']=datatemp[['FilePath']].apply(convertFrequencyDict,args=(FrequencyDict,),axis=1)

In [20]:
#grouping files based on inner most subfolder
def groupbysubfolder(columns):
    string=str(columns[0])
    itemlist=string.split('/')
    return itemlist[-2]

In [21]:
datatemp['Sub_Folder']=datatemp[['FilePath']].apply(groupbysubfolder,axis=1)

In [22]:
#Grouping files based on both same time frame and same inner most folder
dataTempGroupBy=datatemp.groupby(['DateTime','Sub_Folder'])

In [23]:
dataTempGroupByList=[]
for x in dataTempGroupBy.groups:
    dataTempGroupByList.append(dataTempGroupBy.get_group(x)['FilePath'].tolist())
    
print ("Data Size:", len(dataTempGroupByList))

Data Size: 141035


In [24]:
"""
It generate 2 txt file: 
    1. Cluster List in txt with your given name of file
       cluster.index fileName Convention: cluster.index_algorithm_supportThreshold_timeWindow
       
    2. All filePath list from dataFrame with yoru given name of file
    
Args:
        patterns_keyList: List of list (each inner list presents a cluster)
        DataFrame_filepath: Data Frame file path list
        Algo_Name: Algorithm to generate this frequent pattern        
        supportThreshold :: used in the algorithm
        TimeWindow: which has been used in our transaction preprocessing
Optional Arguments: 
        Feature :: features consider to create transaction, ex : time only, time+directory
"""
def createClusterFile(patterns_keyList, DataFrame_filepath, algoName, supportThreshold, TimeWindow, feature =""):

    #---------------Results file path-----------------#    
    #append the result directory
    #DataPath_Name = os.path.join(result_save_path, DataPath_Name+".txt")

    cluster_File_Name = "cluster.index_" + algoName + "_Support=" + str(supportThreshold) + "_timeWindow="+ str(TimeWindow) + "_Features="+feature
    #append the result directory
    cluster_File_Name = os.path.join(result_save_path, cluster_File_Name+".txt")              
    print ("Cluster File Name: ", cluster_File_Name)
    #--------------------------------------------------#
    fCluster = open(cluster_File_Name,"w+")    
    count = 1;
    hash_Dict = {}
    for items in patterns_keyList:
        inner_keyList = list(items)
        if(len(inner_keyList)==1): ## remove single file cluster
            continue
        fCluster.write("Cluster %d\n" % (count))
        count = count + 1 
        for i in inner_keyList:
            #print(i)
            fCluster.write("%s\n" %i)
            if i not in hash_Dict:
                hash_Dict[i] = True    

    #create the unclustered filepath
    fCluster.write("Unclustered Files:\n")
    #filePath = open(DataPath_Name, "w+")

    #------------Uncluster Files in cluster.index and all FilePath in DataPath txt-------------#
    for i in DataFrame_filepath:
      #  filePath.write("%s\n"%(i.strip(' '))) #all data frame files
        if i not in hash_Dict:
            fCluster.write("%s\n"%i) #uncluster files
            hash_Dict[i] = True #Removing duplicates in uncluster list

    fCluster.close()  
    #filePath.close()

In [25]:
#---------------closed Itemset calculation from frequent itemset------------
def cal_closed_itemset(itemsetList):

    maximal_itemset = []
    for index_i in range (0,len(itemsetList)): 
        cluster_i_set = set(itemsetList[index_i])    
        flag = True
        for index_j in range (index_i+1, len(itemsetList)):
            cluster_j_set = set(itemsetList[index_j])    
            if(cluster_i_set.issubset(cluster_j_set)):
                #print (cluster_i_set , "<=>", cluster_j_set)
                flag = False
                break
        if(flag):
            maximal_itemset.append(itemsetList[index_i])
    #print ("Maximal Itemset:", maximal_itemset)
    print ("size of maximal Itemsets", len(maximal_itemset))
    return maximal_itemset

In [26]:
#Try FP_Growth algorithm on this transaction set
#print (time_transactions)

#Use find_frequent_patterns to find patterns in baskets that occur over the support threshold:
minSupport = 100
patterns = pyfpgrowth.find_frequent_patterns(dataTempGroupByList, minSupport)  # need to generate data using this param
print ("pattern:")
keyList = []
for key,val in patterns.items():
    keyList.append(key)

print ("\n\n\n")
#print ("All frequent items: ", keyList)
print ("Size of All frequest Itemset: ", len(keyList))
closed_itemset = cal_closed_itemset(keyList)


timed_window = 1 #since it groupby with time and directory
feature = " time+Directory"
createClusterFile(closed_itemset, datatemp.FilePath, "FPGrowth",minSupport,timed_window,feature)



#Use generate_association_rules to find patterns that are associated with another with a certain minimum probability:
#rules = pyfpgrowth.generate_association_rules(patterns, 0.5)  # generate data using this prob 

#print("\nAssociation Rules :")
#for key,val in rules.items():
#    print (key , "=>" , val)

pattern:




Size of All frequest Itemset:  2671
size of maximal Itemsets 814
Cluster File Name:  /Users/Mehedi/BitBucket/AdvancedOS Project/result/cluster.index_FPGrowth_Support=100_timeWindow=1_Features= time+Directory.txt


In [27]:
##Try Apriori Algorithm with features minSup, minCon,
print ("Transactions counts: ",len(dataTempGroupByList))

minSupport = 0.10
itemsets, rules = apriori(dataTempGroupByList, minSupport)
print ("Apriori itemset size", len(itemsets))

#Formating the itemsets 
itemsetList = []
for key,val in itemsets.items():
    temp = ()
    for i in val:
        temp+=(i) #tuple concatenation
    itemsetList.append(tuple(set(temp)))  # remove duplicate from each cluster
print ("\n\nApriori List: ", len(itemsetList))
#--------------------------------//  

#print ("All frequent items: ", itemsetList)
print ("Size of All frequest Itemset: ", len(itemsetList))
closed_itemset = cal_closed_itemset(itemsetList)

timed_window=1 #since it groupby with time and directory
feature = "time+Directory"
createClusterFile(closed_itemset, datatemp.FilePath,"Apriori_cluster",minSupport, timed_window,feature)

    


Transactions counts:  141035
Apriori itemset size 0


Apriori List:  0
Size of All frequest Itemset:  0
size of maximal Itemsets 0
Cluster File Name:  /Users/Mehedi/BitBucket/AdvancedOS Project/result/cluster.index_Apriori_cluster_Support=0.1_timeWindow=1_Features=time+Directory.txt
