# Fetch Intervals from log file

In [6]:
import pandas as pd
import time
import datetime

def getIntervalTemplate(outputFile):
    """Build the table of consecutive milestone-to-milestone intervals.

    Scans ``../log/keyStroke.log`` for lines containing the ``---5min``
    marker, extracts every ``Y-m-d-H:M:S`` timestamp found on those lines and
    pairs each timestamp with the following one as ``"<from>--<to>"``.

    Side effects:
        * ``interval_intermediate.csv`` (current directory) receives the raw
          timestamps, one per line, under a ``time`` header.
        * ``outputFile`` receives the intervals under a ``time_intervals``
          header.

    Args:
        outputFile: Path of the file the interval table is written to.

    Returns:
        A DataFrame with a single ``time_intervals`` column. It is empty when
        the log contains fewer than two milestone timestamps.

    Raises:
        ValueError: If an extracted timestamp does not parse as
            ``%Y-%m-%d-%H:%M:%S``.
    """
    import re

    stamp_format = "%Y-%m-%d-%H:%M:%S"
    stamp_pattern = re.compile(r"[0-9]*-[0-9]*-[0-9]*-[0-9]*:[0-9]*:[0-9]*")

    # Scan the log in Python rather than through a cat/grep shell pipeline:
    # no shell is involved, and the intermediate file is written to exactly
    # the path it is later described as living at.
    stamps = []
    with open("../log/keyStroke.log", 'r') as log_file:
        for line in log_file:
            if "---5min" in line:
                stamps.extend(stamp_pattern.findall(line))

    # Keep producing the intermediate file so anything that inspects it
    # still finds it.
    with open("interval_intermediate.csv", 'w') as intermediate:
        intermediate.write("time\n")
        intermediate.writelines(stamp + "\n" for stamp in stamps)

    # Parse and re-format each stamp: malformed values fail loudly and the
    # output is zero-padded, without going through local-timezone epoch time.
    normalized = [
        datetime.datetime.strptime(stamp, stamp_format).strftime(stamp_format)
        for stamp in stamps
    ]

    # Each interval runs from one milestone to the next one.
    all_time_stamp = [
        begin + "--" + finish
        for begin, finish in zip(normalized, normalized[1:])
    ]

    df = pd.DataFrame(all_time_stamp, columns=['time_intervals'])
    df.to_csv(outputFile, index=False, sep=' ')
    return df
    
# Build the interval template and show the first five rows of the result.
getIntervalTemplate("interval_template.csv")[:5]

Unnamed: 0,time_intervals
0,2018-11-29-16:44:55--2018-11-29-17:36:25
1,2018-11-29-17:36:25--2018-11-29-17:36:44
2,2018-11-29-17:36:44--2018-11-29-17:37:26
3,2018-11-29-17:37:26--2018-11-29-17:45:47
4,2018-11-29-17:45:47--2018-11-29-17:46:34


# Fetch number of windows being touched in 5 mins

In [7]:
def windowTouchesCount():
    """Count window-log entries between consecutive 5-minute milestones.

    Reads ``./../log/windowAudit.log``. Blank lines are ignored. Every other
    line that is not a ``---5min Window Audit milestone`` marker counts as one
    touched window. A count is emitted each time a milestone after the first
    one is reached, then the counter restarts.

    Note that the counter is not reset at the first milestone, so entries
    logged before it are included in the first emitted count, and entries
    after the last milestone are never emitted.

    The counts are also written to ``windows_touched_feature.csv``.

    Returns:
        A DataFrame with a single ``touched_windows_count`` column.
    """
    windowCount_feature = []
    count = 0
    seen_milestone = False

    # Stream the log inside a context manager so the handle is always closed.
    with open("./../log/windowAudit.log", 'r') as log_file:
        for raw_line in log_file:
            line = raw_line.strip()
            if not line:
                continue
            if "---5min Window Audit milestone" not in line:
                count += 1
            elif seen_milestone:
                # A later milestone closes the interval that was open.
                windowCount_feature.append(count)
                count = 0
            else:
                # The first milestone only opens the first interval.
                seen_milestone = True

    df = pd.DataFrame(windowCount_feature, columns=['touched_windows_count'])
    df.to_csv("windows_touched_feature.csv", index=False, sep=' ')
    return df

# Compute the touched-window counts and show the first five rows.
windowTouchesCount()[:5]

Unnamed: 0,touched_windows_count
0,0
1,0
2,2
3,26
4,0


# Fetch Chrome, WeChat, Terminal lasting time in 5 mins (write in class)

In [102]:
class Interval(object):
    """A time span with a running total of time attributed to it."""

    def __init__(self, start, end):
        # Bounds are stored exactly as given; the running total starts at 0.
        self.start, self.end = start, end
        self.lasting = 0

def unixTime_intervals(element):
    """Convert a ``"<from>--<to>"`` interval string into two Unix times.

    Both halves must be formatted as ``%Y-%m-%d-%H:%M:%S``; they are
    converted with ``time.mktime``, i.e. interpreted as local time.

    Returns:
        A ``(fromTime, toTime)`` tuple of floats.
    """
    def to_epoch(stamp):
        parsed = datetime.datetime.strptime(stamp, "%Y-%m-%d-%H:%M:%S")
        return time.mktime(parsed.timetuple())

    # Unpacking enforces that the separator splits the text into two parts.
    begin_text, finish_text = element.split("--")
    return to_epoch(begin_text), to_epoch(finish_text)

def unixTime_windows(element):
    """Extract the start and end Unix times from a window-log line.

    The line is expected to begin with two timestamps formatted as
    ``%I:%M:%S%p on %B %d, %Y`` (for example
    ``05:36:25PM on November 29, 2018``).

    The legacy fixed-width layout (characters 0-30 and 32-62) is tried first
    so lines that parsed before give the same result. That layout only fits
    timestamps that are exactly 31 characters long, which depends on the
    month-name and day width. When it does not parse, the first two
    timestamps found anywhere in the line are used instead.

    Args:
        element: One stripped line of the window log.

    Returns:
        A ``(currentStart, currentEnd)`` tuple of floats produced by
        ``time.mktime`` (local time).

    Raises:
        ValueError: If neither the fixed-width layout nor the pattern search
            yields two parseable timestamps.
    """
    import re

    stamp_format = "%I:%M:%S%p on %B %d, %Y"

    def to_epoch(stamp):
        parsed = datetime.datetime.strptime(stamp, stamp_format)
        return time.mktime(parsed.timetuple())

    try:
        # Fast path: the original fixed-width layout.
        return to_epoch(element[:31]), to_epoch(element[32:63])
    except ValueError:
        # Width-independent fallback for other month names / day widths.
        stamps = re.findall(
            r"\d{1,2}:\d{1,2}:\d{1,2}[AaPp][Mm]\s+on\s+[^\W\d_]+\s+\d{1,2},\s+\d{4}",
            element,
        )
        if len(stamps) < 2:
            # Nothing better to offer: surface the original parse error.
            raise
        return to_epoch(stamps[0]), to_epoch(stamps[1])

def buildIntervalObject(dataList):
    """Wrap each ``"<from>--<to>"`` string of ``dataList`` in an Interval.

    Each entry is converted with ``unixTime_intervals`` and the resulting
    pair becomes the start and end of a new ``Interval``.

    Returns:
        A list of Interval objects, in input order.
    """
    return [Interval(*unixTime_intervals(entry)) for entry in dataList]

def buildWindowObject(dataList):
    """Turn window-log lines into Interval objects, skipping repeats.

    A line that is identical to the line right before it is skipped; the
    first line is always kept. Kept lines are converted with
    ``unixTime_windows``.

    Returns:
        A list of Interval objects, in input order.
    """
    windows = []
    for position, entry in enumerate(dataList):
        # Once something has been kept, drop a line equal to its predecessor.
        if windows and entry == dataList[position - 1]:
            continue
        begin, finish = unixTime_windows(entry)
        windows.append(Interval(begin, finish))
    return windows

def lastingTimeRatio(appName):
    """Compute, per template interval, the share of time spent in an app.

    Intervals are read from ``interval_template.csv`` (``time_intervals``
    column). Window spans come from the lines of
    ``./../log/windowAudit.log`` that contain ``appName`` and are neither
    blank nor ``---5min Window Audit milestone`` markers.

    The two lists are walked with two pointers, which assumes both are in
    chronological order (not checked here). A window that runs past the end
    of an interval is split: the part inside the interval is credited to it
    and the remainder is carried into the next interval.

    The ratios are also written to ``<appName>_count_feature.csv``.

    Args:
        appName: Substring identifying the application in the window log.

    Returns:
        A DataFrame with a single ``<appName>_count`` column holding one
        ratio per interval. All ratios are 0.0 when the app never appears,
        and the frame is empty when the template has no intervals.
    """
    # Get interval template's time and package it as Interval objects.
    raw_intervals = pd.read_csv("interval_template.csv")
    intervals = buildIntervalObject(raw_intervals.time_intervals)

    # Clean up the windows log; the context manager closes the handle.
    with open("./../log/windowAudit.log", 'r') as log_file:
        windowContent = [line.strip() for line in log_file]
    windowContent = [
        line for line in windowContent
        if line != ""
        and "---5min Window Audit milestone" not in line
        and appName in line
    ]
    appBucket = buildWindowObject(windowContent)

    # Two pointers: i walks the intervals, j walks the app windows.
    # Nothing is indexed before the bounds check, so an empty template or an
    # app that never appears no longer raises IndexError.
    i, j = 0, 0
    while j < len(appBucket) and i < len(intervals):
        inv = intervals[i]
        current = appBucket[j]
        if current.start > inv.end:
            # The window begins after this interval: the interval is done.
            i += 1
        elif current.end <= inv.end:
            # The window ends inside this interval. Only the part after the
            # interval's start is credited, so time logged before the
            # interval cannot push the ratio above 1.
            inv.lasting += max(0.0, current.end - max(current.start, inv.start))
            j += 1
        else:
            # The window spills over: credit the part up to the interval's
            # end and carry the remainder into the next interval.
            inv.lasting += max(0.0, inv.end - max(current.start, inv.start))
            current.start = inv.end
            i += 1

    # An interval's total is final once the walk has moved past it, so the
    # ratios can be derived in one pass. Intervals never reached keep 0.
    appLastingTime_ratio = []
    for inv in intervals:
        duration = inv.end - inv.start
        # A zero-length (or reversed) interval has no meaningful ratio.
        ratio = inv.lasting * 1.0 / duration if duration > 0 else 0.0
        appLastingTime_ratio.append(ratio)

    df = pd.DataFrame(appLastingTime_ratio, columns=[appName + '_count'])
    df.to_csv(appName + "_count_feature.csv", index=False, sep=' ')
    return df



In [103]:
# Compute the per-interval ratio for lines containing "Google Chrome" and show the first five rows.
lastingTimeRatio("Google Chrome")[:5]

Unnamed: 0,Google Chrome_count
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [104]:
# Compute the per-interval ratio for lines containing "WeChat" and show the first five rows.
lastingTimeRatio("WeChat")[:5]

Unnamed: 0,WeChat_count
0,0.0
1,0.0
2,0.0
3,0.01996
4,0.0


In [105]:
# Compute the per-interval ratio for lines containing "Terminal" and show the first five rows.
lastingTimeRatio("Terminal")[:5]

Unnamed: 0,Terminal_count
0,0.0
1,0.0
2,0.02381
3,0.670659
4,1.0


# Fetch Chrome Tab Count in 5 mins

In [108]:
def chromeTabCount():
    """Collect the logged Chrome tab counts.

    Reads ``./../log/chromeTabCount.log`` and keeps every stripped line that
    is neither blank nor a ``---5min Chrome Tab Count Audit milestone``
    marker. The kept lines are stored as read (text, not converted to
    numbers) and also written to ``chrome_tab_count_feature.csv``.

    Returns:
        A DataFrame with a single ``chrome_tab_count`` column.
    """
    marker = "---5min Chrome Tab Count Audit milestone"

    # Read inside a context manager so the log handle is always closed.
    with open("./../log/chromeTabCount.log", 'r') as log_file:
        stripped_lines = (line.strip() for line in log_file)
        chromeTabCountContent = [
            line for line in stripped_lines
            if line != "" and marker not in line
        ]

    df = pd.DataFrame(chromeTabCountContent, columns=['chrome_tab_count'])
    df.to_csv("chrome_tab_count_feature.csv", index=False, sep=' ')
    return df

# Collect the tab counts and show the first five rows.
chromeTabCount()[:5]

Unnamed: 0,chrome_tab_count
0,35
1,35
2,35
3,35
4,35


# Fetch Chrome Tab's activity in 5 mins

In [109]:
def chromeTabOpen():
    """Count "open" entries in the Chrome tab log per milestone segment.

    Reads ``./../log/chromeTab.log``. A line containing
    ``---5min Chrome Tab Audit milestone`` ends the current segment (except
    for the first such line, which is only skipped). Any other line that
    contains ``open`` adds one to the current segment's count.

    The count accumulated after the last milestone is always appended too,
    so an empty log yields a single 0.
    NOTE(review): windowTouchesCount does not emit such a trailing count;
    confirm which behaviour lines up with the interval template.

    The counts are also written to ``chrome_tab_open_feature.csv``.

    Returns:
        A DataFrame with a single ``chrome_tab_open_act`` column.
    """
    openAct = []
    seen_milestone = False
    count = 0

    # Stream the log inside a context manager so the handle is always closed.
    with open("./../log/chromeTab.log", 'r') as log_file:
        for raw_line in log_file:
            line = raw_line.strip()
            if "---5min Chrome Tab Audit milestone" in line:
                if seen_milestone:
                    openAct.append(count)
                    count = 0
                else:
                    # Ignore the first milestone line.
                    seen_milestone = True
            elif "open" in line:
                count += 1
    openAct.append(count)

    df = pd.DataFrame(openAct, columns=['chrome_tab_open_act'])
    df.to_csv("chrome_tab_open_feature.csv", index=False, sep=' ')
    return df

# Count tab-open entries and show the first five rows.
chromeTabOpen()[:5]

Unnamed: 0,chrome_tab_open_act
0,0
1,35
2,35
3,140
4,35


In [110]:
def chromeTabClose():
    """Count "close" entries in the Chrome tab log per milestone segment.

    Reads ``./../log/chromeTab.log``. A line containing
    ``---5min Chrome Tab Audit milestone`` ends the current segment (except
    for the first such line, which is only skipped). Any other line that
    contains ``close`` adds one to the current segment's count.

    The count accumulated after the last milestone is always appended too,
    so an empty log yields a single 0.
    NOTE(review): windowTouchesCount does not emit such a trailing count;
    confirm which behaviour lines up with the interval template.

    The counts are also written to ``chrome_tab_close_feature.csv``.

    Returns:
        A DataFrame with a single ``chrome_tab_close_act`` column.
    """
    closeAct = []
    seen_milestone = False
    count = 0

    # Stream the log inside a context manager so the handle is always closed.
    with open("./../log/chromeTab.log", 'r') as log_file:
        for raw_line in log_file:
            line = raw_line.strip()
            if "---5min Chrome Tab Audit milestone" in line:
                if seen_milestone:
                    closeAct.append(count)
                    count = 0
                else:
                    # Ignore the first milestone line.
                    seen_milestone = True
            elif "close" in line:
                count += 1
    closeAct.append(count)

    df = pd.DataFrame(closeAct, columns=['chrome_tab_close_act'])
    df.to_csv("chrome_tab_close_feature.csv", index=False, sep=' ')
    return df

# Count tab-close entries and show the first five rows.
chromeTabClose()[:5]

Unnamed: 0,chrome_tab_close_act
0,0
1,0
2,0
3,0
4,0


# 5 mins filter

In [114]:
# Concatenate features
def concatenate(fileList, outputFileName):
    """Place the first two columns of several CSV files side by side.

    Each file is read without a header row, with its first two columns
    parsed as strings. Columns beyond the second are dropped. The kept
    columns are renamed to ``<filename><column position>`` so names from
    different files cannot collide, then all frames are joined horizontally.

    The merged table is written to ``outputFileName`` without header or
    index.

    Args:
        fileList: Paths of the CSV files to merge, in output order.
        outputFileName: Path of the merged CSV file.

    Returns:
        The merged DataFrame.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``fileList`` is empty.
    """
    import pandas as pd

    dfs = []
    for filename in fileList:
        # read the csv, making sure the first two columns are str
        df = pd.read_csv(filename, header=None, converters={0: str, 1: str})
        # Throw away all but the first two columns. Positional .iloc replaces
        # the deprecated .ix, which has been removed from newer pandas.
        df = df.iloc[:, :2]
        # change the column names so they won't collide during concatenation
        df.columns = [filename + str(cname) for cname in df.columns]
        dfs.append(df)

    # concatenate them horizontally
    merged = pd.concat(dfs, axis=1)
    # write it out
    merged.to_csv(outputFileName, header=False, index=False)
    return merged


# Merge three sample files into demo.csv and show the first five rows.
concatenate(["test1.csv", "test2.csv", "test3.csv"], "demo.csv")[:5]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,test1.csv0,test2.csv0,test3.csv0
0,1,a,!
1,2,b,@
2,3,c,#
3,4,d,$
4,5,e,^


In [115]:
def filter(inputFile, outputFile):
    """Keep only the rows whose interval lasts roughly five minutes.

    Reads ``./../data_processing/<inputFile>``, skips its first (header)
    line and any blank line, and appends to ``outputFile`` every row whose
    leading ``"<from>--<to>"`` interval lasts between 300 and 305 seconds,
    bounds included. ``outputFile`` is opened in append mode and is not
    created when no row qualifies.

    NOTE(review): this reads from ``data_processing`` (underscore) while
    getIntervalTemplate refers to ``data-processing`` (hyphen); confirm
    which directory name is intended.

    Args:
        inputFile: File name inside ``./../data_processing/``.
        outputFile: Path of the file the kept rows are appended to.

    Returns:
        None.
    """
    import time
    import datetime

    def unixTime_intervals(element):
        # The first 40 characters hold "<from>--<to>": two 19-character
        # stamps plus the two-character separator.
        fromTime, toTime = element[:40].split("--")
        # Convert to unix time
        fromTime = time.mktime(datetime.datetime.strptime(fromTime, "%Y-%m-%d-%H:%M:%S").timetuple())
        toTime = time.mktime(datetime.datetime.strptime(toTime, "%Y-%m-%d-%H:%M:%S").timetuple())
        return fromTime, toTime

    # Read inside a context manager so the input handle is always closed.
    with open("./../data_processing/" + inputFile, 'r') as source:
        rows = [line.strip() for line in source]

    kept = []
    for line in rows[1:]:
        if not line:
            # A blank row carries no interval and cannot be split.
            continue
        fromTime, toTime = unixTime_intervals(line)
        if 300 <= (toTime - fromTime) <= 305:
            kept.append(line + "\n")

    # Open the output once instead of once per kept row.
    if kept:
        with open(outputFile, "a+") as sink:
            sink.writelines(kept)
    return

# filter("features_intermediate.csv", "final_fetures_file.csv")