In [1]:
import pandas as pd
import datetime
import numpy as np

### Read July File

In [85]:
# Load the pre-processed July access log, labelling the five columns explicitly.
# NOTE(review): because `names=` is given without `header=`, pandas treats the first
# line of the file as data; if the CSV carries its own header row it will show up as
# a record -- confirm against the file.
access_log_columns = ["ipaddress", "date", "time", "webpage", "bytes_recv"]
july_raw_df = pd.read_csv("processed_access_log_Jul95.csv", sep=',', names=access_log_columns)

In [96]:
# Preview the first five rows of the frame.
# NOTE(review): the saved output under this cell lists `datetime`, `unix_timestamp`
# and `session_id`, which are only created by later cells, so it was captured from an
# out-of-order run and does not reflect the frame at this point.
july_raw_df.head(n=5)

Unnamed: 0,ipaddress,date,time,webpage,bytes_recv,datetime,unix_timestamp,session_id
87820,***.novo.dk,09/Aug/1995,03:02:48,/shuttle/missions/sts-69/mission-sts-69.html,11264,1995-08-09 03:02:48,807937368,1.0
87829,***.novo.dk,09/Aug/1995,03:03:52,/shuttle/countdown/,4673,1995-08-09 03:03:52,807937432,1.0
87839,***.novo.dk,09/Aug/1995,03:05:38,/shuttle/countdown/liftoff.html,4665,1995-08-09 03:05:38,807937538,1.0
87850,***.novo.dk,09/Aug/1995,03:07:40,/shuttle/countdown/lps/fr.html,1879,1995-08-09 03:07:40,807937660,1.0
376395,001.msy4.communique.net,30/Aug/1995,02:55:47,/software/winvn/winvn.html,9630,1995-08-30 02:55:47,809751347,2.0


In [54]:
#july_raw_df["ipaddress"][136820] == july_raw_df["ipaddress"][188797]

### Append datetime

In [86]:
# Join the `date` and `time` columns into one "dd/Mon/yyyy:HH:MM:SS" string per row,
# then parse it. errors="coerce" turns any row that does not match the format into NaT
# instead of raising.
date_time_text = july_raw_df['date'].map(str) + ":" + july_raw_df['time']
july_raw_df['datetime'] = pd.to_datetime(
    date_time_text,
    format='%d/%b/%Y:%H:%M:%S',
    errors="coerce",
)

In [87]:
# convert to unix timestamp (whole seconds since 1970-01-01 00:00:00)
# Subtracting the epoch and floor-dividing by one second is independent of the
# datetime column's internal resolution. Rows whose datetime is NaT (unparseable
# date/time, see errors="coerce" on the parse step) become NaN here; casting the raw
# int64 values instead turns NaT into the sentinel -9223372037, which would then be
# treated as a real timestamp by the time-gap sessionisation further down.
unix_epoch = pd.Timestamp("1970-01-01")
july_raw_df['unix_timestamp'] = (july_raw_df['datetime'] - unix_epoch) // pd.Timedelta(seconds=1)

### 1.A. Same host == same session, irrespective of dates

In [59]:
# Bring all rows of a host together. A stable sort (mergesort) keeps the rows of each
# host in the order they already had in the frame; the default quicksort gives no such
# guarantee, so the row order inside a host in the written CSV would be arbitrary.
july_raw_df = july_raw_df.sort_values(by=['ipaddress'], kind='mergesort')
# NB: despite its name, `next_ipaddress` holds the host of the *previous* row (shift(1)).
july_raw_df['next_ipaddress'] = july_raw_df['ipaddress'].shift(1)
# True on the first row of every host; the very first row is compared with NaN and is
# therefore True as well.
july_raw_df['session_new'] = july_raw_df['ipaddress'] != july_raw_df['next_ipaddress']

In [61]:
# insert session id -- vectorised
# `session_new` is True exactly on the first row of each session, so its running total
# is the 1-based session number of every row. Compared with a per-row iterrows()/.loc
# loop this is orders of magnitude faster, stores the ids as integers (the loop
# produced a float column: 1.0, 2.0, ...), and does not depend on the index labels
# being unique.
july_raw_df['session_id'] = july_raw_df['session_new'].astype('int64').cumsum()

In [62]:
# Remove the two helper columns used to detect session boundaries, then persist.
july_raw_df.drop(columns=['session_new', 'next_ipaddress'], inplace=True)
# write to csv
july_raw_df.to_csv("sessionize_hostonly_july.csv", index=False)

### 1.B. Same date == same session, else new session

In [88]:
# Order rows by host and, within a host, chronologically.
host_then_time = ['ipaddress', 'datetime']
july_raw_df = july_raw_df.sort_values(by=host_then_time)

In [89]:
# NB: despite its name, `next_date` holds the date of the *previous* row (shifted down
# by one); the first row gets NaN.
july_raw_df['next_date'] = july_raw_df['date'].shift(periods=1)

In [90]:
# A new session starts when the date differs from the previous row's date OR when the
# host differs from the previous row's host. Comparing the date alone merges the first
# rows of a host into the previous host's session whenever both were active on the
# same date (rows are ordered by host, then time).
previous_host = july_raw_df['ipaddress'].shift(1)
july_raw_df['session_new'] = (
    (july_raw_df['date'] != july_raw_df['next_date'])
    | (july_raw_df['ipaddress'] != previous_host)
)

In [92]:
# insert session id -- vectorised
# `session_new` is True exactly on the first row of each session, so its running total
# is the 1-based session number of every row. Compared with a per-row iterrows()/.loc
# loop this is orders of magnitude faster, stores the ids as integers (the loop
# produced a float column: 1.0, 2.0, ...), and does not depend on the index labels
# being unique.
july_raw_df['session_id'] = july_raw_df['session_new'].astype('int64').cumsum()

In [93]:
# Remove the two helper columns used to detect session boundaries.
july_raw_df.drop(columns=['session_new', 'next_date'], inplace=True)

In [94]:
# write to csv (index labels are not written)
july_raw_df.to_csv(path_or_buf="sessionize_date_july.csv", index=False)

### 1.C. Difference between two consecutive timestamps <= 30 min == same session, else new session

In [12]:
# Discard the session ids from the previous scheme before recomputing them.
# Raises KeyError if `session_id` is not present (e.g. when this cell is run twice).
july_raw_df.drop(columns='session_id', inplace=True)

In [12]:
# based on difference between timestamp
# For every (host, date) group: minutes elapsed since the previous row of the same
# group; the first row of each group gets NaN. Despite its name, `session_new` holds
# this gap in minutes, not a boolean flag.
# groupby(...).diff() always returns a Series aligned with the frame's index, whereas
# apply() with a lambda may prepend the group keys to the index on recent pandas
# versions, which makes the column assignment fail.
g = july_raw_df.groupby(['ipaddress', 'date'])
july_raw_df['session_new'] = g['unix_timestamp'].diff().abs() / 60

In [15]:
# insert session id -- vectorised
# A row opens a new session when it has no predecessor in its (host, date) group
# (gap is NaN) or when the gap to the predecessor exceeds 30 minutes. The running
# total of those flags is the 1-based session number. Compared with a per-row
# iterrows()/.loc loop this is orders of magnitude faster, stores the ids as integers
# (the loop produced a float column), and does not depend on unique index labels.
gap_minutes = july_raw_df['session_new']
opens_session = gap_minutes.isna() | (gap_minutes > 30)
july_raw_df['session_id'] = opens_session.astype('int64').cumsum()

In [17]:
# The per-row gap column was only needed to derive the session ids.
july_raw_df.drop(columns='session_new', inplace=True)
# write to csv
july_raw_df.to_csv(path_or_buf="sessionize_time_july.csv", index=False)

### 2. Create events for Sequence mining

In [39]:
# Load the ordered host-only sessions (comma-separated, header taken from the file).
# NOTE(review): no cell in this notebook writes "sessionize_hostonly_july_ordered.csv"
# (the host-only step writes "sessionize_hostonly_july.csv"); presumably it is prepared
# outside the notebook -- confirm where its `sequenceID` / `eventID` columns come from.
processed_df = pd.read_csv("sessionize_hostonly_july_ordered.csv")

In [47]:
# Preview the first five rows of the frame.
# NOTE(review): the saved output under this cell already shows an `event` column and
# no `eventID`, i.e. it was captured after the later cells that add `event` and drop
# `eventID`, not at this point of the notebook.
processed_df.head(n=5)

Unnamed: 0,sequenceID,SIZE,webpage,event
0,1,1,/ksc.html,1
1,1,1,/shuttle/missions/missions.html,2
2,1,1,/shuttle/missions/sts-35/mission-sts-35.html,3
3,1,1,/shuttle/missions/sts-35/mission-sts-35.html,4
4,1,1,/shuttle/resources/orbiters/columbia.html,5


In [41]:
# Order rows by sequence and, inside a sequence, by the original event id (both
# ascending). The frame is modified in place.
processed_df.sort_values(by=['sequenceID', 'eventID'], ascending=True, inplace=True)

In [42]:
# `seq_copy` is the sequenceID of the row above (NaN for the first row); it is used to
# detect where a new sequence begins.
processed_df['seq_copy'] = processed_df['sequenceID'].shift(periods=1)

In [43]:
def increment_events(x):
    """Return the running event number for row ``x``.

    Relies on the module-level counter ``event_idx``: when the row's ``sequenceID``
    equals its ``seq_copy`` the counter is advanced by one, otherwise it is reset
    to 1. The updated counter is returned.

    The function is stateful, so the result depends on the order in which rows are
    passed in, and ``event_idx`` must already exist when the first call takes the
    "same sequence" path.
    """
    global event_idx
    continues_sequence = x['sequenceID'] == x['seq_copy']
    event_idx = event_idx + 1 if continues_sequence else 1
    return event_idx

# Number the rows inside each run of consecutive rows that share a sequenceID
# (1, 2, 3, ...). A run starts wherever `sequenceID` differs from the previous row's
# value held in `seq_copy`; the first row is compared with NaN and so always starts a
# run. This is computed column-wise, so the result does not depend on a mutable global
# counter or on how often / in which order pandas would call a row-wise function.
starts_new_sequence = processed_df['sequenceID'] != processed_df['seq_copy']
run_number = starts_new_sequence.cumsum()
processed_df['event'] = processed_df.groupby(run_number.values).cumcount() + 1

In [46]:
# `eventID` has been superseded by the freshly computed `event` column and `seq_copy`
# was only a helper; remove both in place.
processed_df.drop(columns=['eventID', 'seq_copy'], inplace=True)

In [48]:
# Keep only the three columns needed for sequence mining, in this order.
sequence_columns = ['sequenceID', 'event', 'webpage']
processed_df = processed_df.loc[:, sequence_columns]

# write to csv -- space-separated, no index column; the target name has no extension
processed_df.to_csv(path_or_buf="sessionize_hostonly_july_ordered_sequence", index=False, sep=' ')