In [41]:
#relative paths
import os, sys, inspect

# Resolve the workshop directories from the kernel's working directory.
# inspect.getfile(inspect.currentframe()) is not reliable inside a notebook cell:
# the cell's "file" is a synthetic name (or, on recent ipykernel versions, a file
# in a temp directory), so it does not necessarily point at the notebook's folder.
# By default Jupyter starts the kernel in the notebook's folder, so use the cwd.
currentdir = os.getcwd()
parentdir  = os.path.dirname(currentdir)

rootdir = parentdir  # assumes the notebook lives one level below the project root
dwhdir = os.path.join(rootdir, 'dwh')

activityFilename  = os.path.join(dwhdir, 'activity.log')         # input: raw activity log
dataframeFilename = os.path.join(dwhdir, 'channel_changes.csv')  # output: extracted table

print('In :', activityFilename)
print('Out:', dataframeFilename)

In : D:\python_virtualenvs\src\pandas_workshop\dwh\activity.log
Out: D:\python_virtualenvs\src\pandas_workshop\dwh\channel_changes.csv


In [2]:
#other imports
import io
import re
import pandas as pd



In [30]:


def get_good_lines(filename, lineFilterStr):
    """Return the lines of a UTF-8 text file that pass ``filter_line``.

    Args:
        filename: Path of the text file to scan.
        lineFilterStr: Filter value handed to ``filter_line`` together with
            each line.

    Returns:
        list: The accepted lines, in file order and unmodified (line endings
        are kept).
    """
    with io.open(filename, "r", encoding='utf-8') as source:
        # Stream the file line by line; only accepted lines are accumulated.
        return [row for row in source if filter_line(row, lineFilterStr)]




In [31]:



def filter_line(line, filterStr):
    """Tell whether ``filterStr`` occurs in ``line``.

    Args:
        line: Text to inspect.
        filterStr: Value looked up in ``line`` with the ``in`` operator.

    Returns:
        bool: True when ``filterStr`` is contained in ``line``.
    """
    isContained = filterStr in line
    return isContained




In [32]:
from datetime import datetime # another way to measure

# Time the filtering pass over the activity log using wall-clock timestamps.
startTS = datetime.now()
changeChannelLines = get_good_lines(activityFilename, 'Action=ChangeChannel')
elapsed = datetime.now() - startTS

print('Process took', elapsed.total_seconds(), 'seconds')

Process took 1.586365 seconds


In [33]:
# about 1M lines extracted from a 184MB file in 1.5s
matchedLineCount = len(changeChannelLines)
print(matchedLineCount, 'lines')

998971 lines


In [34]:
# Now we can apply regexps - compile them once, up front, so the per-line work
# is only the search. Raw strings keep the backslashes literal and avoid the
# invalid-escape warnings that '\w' and '\(' trigger in plain string literals.
userRE = re.compile(r'User@([\w]+)')
changeChannelRE = re.compile(r'UserAction=ChangeChannel\(([\w-]+), ([\w/]+),')

def extract_features(line):
    """Pull the timestamp, user, channel and event out of one log line.

    Args:
        line: A log line. Its first 19 characters are taken as the timestamp,
            and it must contain a ``User@<id>`` token and a
            ``UserAction=ChangeChannel(<channel>, <event>,`` call.

    Returns:
        tuple: ``(ts, user, channel, event)``, all strings.

    Raises:
        ValueError: If the user token or the ChangeChannel call is not found.
            Previously this surfaced as an AttributeError on ``None``.
    """
    ts = line[0:19]
    userMatch = userRE.search(line)
    changeChannel = changeChannelRE.search(line)
    # The upstream filter only checks for a substring, so a line can get here
    # without matching both patterns; fail with a message that names the line.
    if userMatch is None or changeChannel is None:
        raise ValueError('Cannot extract ChangeChannel features from line: %r' % (line,))
    return (ts, userMatch.group(1), changeChannel.group(1), changeChannel.group(2))
    
# Smoke test: run the extractor on the first filtered line
firstRow = extract_features(changeChannelLines[0])
print(firstRow)


('2017-01-02 00:00:00', '7f98c243497046a086b09a84d68aa0f1', 'C-0016', 'E/00904866')


In [35]:
# Extract the features of every filtered line into a table (of sorts):
# a list holding one extract_features() result per line.
startTS = datetime.now()

datatable = list(map(extract_features, changeChannelLines))

elapsed = datetime.now() - startTS
print('Extracting features took', elapsed.total_seconds(), 'seconds')

Extracting features took 3.053912 seconds


In [37]:
# Create a dataframe from the extracted rows and show its first five rows
columnNames = ['Timestamp', 'User', 'Channel', 'Event']

startTS = datetime.now()
df = pd.DataFrame(datatable, columns=columnNames)
elapsed = datetime.now() - startTS

print('Dataframe Creation took', elapsed.total_seconds(), 'seconds')

print(df.head(5))


             Timestamp                              User Channel       Event
0  2017-01-02 00:00:00  7f98c243497046a086b09a84d68aa0f1  C-0016  E/00904866
1  2017-01-02 00:00:01  7d8090b7fd304781a88c77b75207728d  C-0031  E/00904873
2  2017-01-02 00:00:02  0c39574d45c245b79e076650fa948229  C-0026  E/00904872
3  2017-01-02 00:00:03  d48241d6b352465c939403eb16619753  C-0008  E/00904859
4  2017-01-02 00:00:03  3dbaacdba1b849ed9029f69783e15bab  C-0010  E/00904860
Dataframe Creation took 0.181558 seconds


In [43]:
# Persist the dataframe as a comma-separated UTF-8 file, leaving out the index
startTS = datetime.now()

df.to_csv(dataframeFilename, index=False, sep=',', encoding='utf-8')

elapsed = datetime.now() - startTS
print('CSV Writting took', elapsed.total_seconds(), 'seconds')

CSV Writting took 3.64728 seconds
