In [42]:
from random import randint

def generate_random_data(file_name,
                         max_users = 100, 
                         max_visits = 20, 
                         max_events = 20,
                         max_days = 1,
                         seed = None,
                        ):
    '''
    Writes random tab-separated pageview rows (userID, visitID, timestamp) to file_name.

    file_name  - path of the output file (overwritten if it exists)
    max_users  - upper bound for the number of users; at least one user is drawn
    max_visits - upper bound for the number of visits per user (may be 0)
    max_events - upper bound for the number of pageviews per visit (may be 0,
                 in which case the visit produces no rows)
    max_days   - timestamps are seconds in the range [0, max_days*86400]
    seed       - optional seed; when given, a private random.Random(seed) is used
                 so the same seed always produces the same file. When None the
                 module-level randint is used, exactly as before.

    Note: visit ids restart at 0 for every user, so a visit is identified by
    the (userID, visitID) pair, not by visitID alone.
    '''

    if seed is None:
        rand = randint
    else:
        # Private generator: reproducible output without touching global random state.
        from random import Random
        rand = Random(seed).randint

    last_second = max_days * 86400

    with open(file_name, "w") as f:
        for uid in range(rand(1, max_users)):
            for vid in range(rand(0, max_visits)):
                # Each visit starts at an independent random moment.
                ts = rand(0, last_second)
                for eid in range(rand(0, max_events)):
                    f.write('\t'.join(map(str, [uid, vid, ts])) + "\n")
                    # Next pageview is up to an hour later, capped at the end of the range.
                    ts = min(ts + rand(0, 3600), last_second)
                    

In [43]:
def reader(file_name, separator, header):
    '''
    Lazily reads file_name line by line and yields one dict per line.

    file_name - path of the delimited text file
    separator - field delimiter passed to str.split
    header    - column names; zipped with the fields of each line, so extra
                fields or extra names are silently dropped

    All values are yielded as strings.
    '''

    with open(file_name, "r") as f:
        for line in f:
            # Strip only the line terminator: a plain strip() would also eat
            # leading/trailing tabs, dropping empty first/last fields and
            # shifting the remaining values under the wrong column names.
            fields = line.rstrip("\r\n").split(separator)
            yield dict(zip(header, fields))

In [44]:
def count(file_name):
    '''Returns the number of lines in file_name, streaming it instead of loading it whole.'''

    with open(file_name, "r") as handle:
        # One tick per line yielded by the file iterator.
        return sum(1 for _ in handle)

In [45]:
def count_max(file_name, 
              col_to_aggr,                   
              col_to_cnt,
              separator = "\t",
              header = ["userID", "visitID", "timestamp"], 
              ):
    '''
    Returns the maximum number of distinct consecutive col_to_cnt values seen
    within one run of equal col_to_aggr values.

    col_to_aggr - grouping column; a group is a run of adjacent rows sharing this value
    col_to_cnt  - column whose value changes are counted inside each group
    separator   - field separator of the input file
    header      - column names of the input file

    The file is processed in a single streaming pass, so rows belonging to one
    group (and, inside it, rows sharing one col_to_cnt value) must be adjacent.
    Returns 0 for an empty file.
    '''
    
    max_counted = 0
    last_aggr = None
    last_cnt = None
    # Starts at 0 so that an empty file reports 0 rather than a phantom group of size 1.
    counts = 0
    
    for row in reader(file_name, separator, header):
        if last_aggr == row[col_to_aggr]:
            # Same group: only a change of the counted value adds to the tally.
            if last_cnt != row[col_to_cnt]:
                counts += 1
        else:
            # New group begins: close the previous one and count this row as its first value.
            if counts > max_counted:
                max_counted = counts
            counts = 1

        last_aggr = row[col_to_aggr]
        last_cnt = row[col_to_cnt]

    # The final group is never followed by a group change, so close it here.
    if counts > max_counted:
        max_counted = counts
            
    return max_counted

In [46]:
from datetime import datetime

def count_unique_filter_hour(file_name, 
                             col_name,
                             separator = "\t", 
                             header = ["userID", "visitID", "timestamp"], 
                             hmin = 16, 
                             hmax = 22, 
                            ):
    '''counts unique entities (col_name) between hours hmin and hmax'''
    
    count_unique = 0
    last = None
    for row in reader(file_name, separator, header):
        hour = int(datetime.fromtimestamp(float(row["timestamp"])).strftime('%H'))
        if hour >= hmin and hour < hmax and row[col_name] != last:
            count_unique += 1
        last = row[col_name]
    return count_unique

In [51]:
import os

def count_max_overlapping_visits(file_name, 
                                separator = "\t", 
                                header = ["userID", "visitID", "timestamp"],
                               ):
    
    '''
    Returns the maximum number of visits that are open at the same moment.

    file_name - path of the pageview file; rows of one visit must be adjacent
    separator - field separator of the input file
    header    - column names; must contain "visitID" and "timestamp"

    Each visit is reduced to a start event (timestamp of its first row) and an
    end event (timestamp of its last row). The events are sorted by time and
    swept while tracking how many visits are open. At equal timestamps starts
    are processed before ends, so visits that merely touch count as overlapping.

    A visit is identified by the (userID, visitID) pair when the header has a
    "userID" column, because visit ids may repeat between users; otherwise by
    visitID alone. Events are kept in memory (two per visit) instead of being
    sorted through temporary files and an external sort command.

    Returns 0 for an empty file. Raises ValueError on a non-numeric timestamp.
    '''
    
    # (timestamp, order, delta): order 0 = start, 1 = end, so that plain tuple
    # sorting puts starts before ends at the same timestamp.
    events = []
    current_visit = None
    last_ts = None
    
    for row in reader(file_name, separator, header):
        visit = (row.get("userID"), row["visitID"])
        ts = float(row["timestamp"])
        
        if visit != current_visit:
            if current_visit is not None:
                # Previous visit ended on its last seen row.
                events.append((last_ts, 1, -1))
            events.append((ts, 0, 1))
            current_visit = visit
        
        last_ts = ts
    
    # Close the final visit; skipped entirely when the input had no rows.
    if current_visit is not None:
        events.append((last_ts, 1, -1))
    
    events.sort()
    
    current_count = 0
    max_count = 0
    
    for _, _, delta in events:
        current_count += delta
        if current_count > max_count:
            max_count = current_count
    
    return max_count


In [60]:
# Generate a fresh random sample file and report five summary statistics on it.
# print(...) with a single argument behaves identically on Python 2 and Python 3,
# unlike the Python 2-only print statement.
generate_random_data("dane.txt")
print("1. Number of pageviews: " + str(count("dane.txt")))
# Counts distinct consecutive timestamps inside each run of equal visitID values.
print("2. Maximum number of pageviews in one visit: " + str(count_max("dane.txt", "visitID", "timestamp")))
print("3. Maximum number of visits by one user: " + str(count_max("dane.txt", "userID", "visitID")))
# NOTE(review): the label says "users" but the call counts runs of visitID whose
# first pageview falls in the window - confirm which of the two is intended.
print("4. Number of users whose visits started between 16:00 and 22:00: " + str(count_unique_filter_hour("dane.txt", "visitID")))
print("5. Maximum number of overlapping visits: " + str(count_max_overlapping_visits("dane.txt")))

1. Number of pageviews: 1679
2. Maximum number of pageviews in one visit: 20
3. Maximum number of visits by one user: 17
4. Number of users whose visits started between 16:00 and 22:00: 36
5. Maximum number of overlapping visits: 46
