In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Load one day of web-server access-log records (one row per HTTP request).
df = pd.read_csv("eclog_1day.csv")

# Discard, in a single pass, the columns this analysis does not use.
# NOTE(review): the earlier comment described UserId as "not present"; the
# column itself must exist (otherwise this drop raises KeyError), so
# presumably its values are not populated in this extract - confirm.
df = df.drop(columns=['UserId', 'UserAgent', 'Referrer', 'HttpVersion'])

# print type of each remaining column
print(df.dtypes)
print("\n")

# Timestamp of the very first logged row, taken before any filtering; it is
# the zero point for the relative timestamps computed further down.
start_timestamp = df['TimeStamp'].iloc[0]

# total number of rows (all HTTP methods)
total_number = len(df)
print("Total no. of rows: {}".format(total_number))

# keep only GET requests: drop every row whose method is anything else
indexNames = df[df['HttpMethod'] != 'GET'].index
df = df.drop(index=indexNames)

# total number of GET requests
total_get = len(df)
print("Total no. of GET requests: {}".format(total_get))

# proportion of GET requests compared to total
print("Proportion of GET requests: {:.2%}".format(total_get/total_number))

# Re-express TimeStamp as the offset from start_timestamp, divided by 10000.
# Whole-column arithmetic replaces the former row-by-row .apply(lambda ...)
# and produces the same float values.
# NOTE(review): later cells treat the result as milliseconds, which would
# make the raw unit 100 ns ticks - confirm against the dataset description.
df['TimeStamp'] = (df['TimeStamp'] - start_timestamp) / 10000

# number of requests per object: non-null IpId entries counted per Uri
object_count = df.groupby('Uri')['IpId'].count()

# number of distinct objects (Uri values) that were requested
object_number = len(object_count)
print("Number ob different objects: {}".format(object_number))


IpId            object
TimeStamp        int64
HttpMethod      object
Uri             object
ResponseCode     int64
Bytes            int64
dtype: object


Total no. of rows: 350683
Total no. of GET requests: 288556
Proportion of GET requests: 82.28%
Number ob different objects: 38666


In [6]:
# Show the first ten rows of df as the cell's rich table output.
print("\n Data sample:")

df.head(n=10)




 Data sample:


Unnamed: 0,IpId,TimeStamp,HttpMethod,Uri,ResponseCode,Bytes
2,2NL,1000.0,GET,/wyszukiwanie-query1.html,200,9018
8,8NL,8000.0,GET,/p-6692.html,301,0
12,11NL,10000.0,GET,/p-6692.html,200,12722
16,12US,13000.0,GET,/p-9633.html,304,0
19,13DE,15000.0,GET,/wyszukiwanie-query2.html,200,9306
26,18NL,22000.0,GET,/polec-znajomemu-produkt.html/produkt=1917,301,0
27,2NL,24000.0,GET,/polec-znajomemu-produkt.html/produkt=1917,200,8565
29,19NL,26000.0,GET,/p-7597.html,200,11980
34,21US,30000.0,GET,/index.php,301,232
35,21US,30000.0,GET,/,301,0


In [9]:
# The 20 most requested objects: turn the per-Uri request counts held in
# object_count into a two-column frame (Uri, count) and order it by count,
# largest first.
top_objects = (
    object_count
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .head(20)
)
top_objects


Unnamed: 0,Uri,count
26815,/javascript/skrypty.php,8417
26813,/javascript/produkt.php,3784
34105,"/szablony/shop_11.rwd/css/style.php?ncss=style,boxy,moduly,produkt,zebra_datepicker",3314
34103,"/szablony/shop_11.rwd/css/style.php?ncss=style,boxy,moduly,podstrony,listingi",2907
594,/favicon.ico,2061
26809,/javascript/jquery.js,1965
25232,/images/naglowki/image5.jpg,1910
34172,/szablony/shop_3.rwd/obrazki/szablon/rwd_szukaj.png,1866
34171,/szablony/shop_3.rwd/obrazki/szablon/rwd_schowek.png,1851
34169,/szablony/shop_3.rwd/obrazki/szablon/rwd_koszyk.png,1850


In [80]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# per-object request counts.
object_count.describe()

count    38666.000000
mean         7.462784
std         75.175028
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max       8417.000000
Name: IpId, dtype: float64

In [88]:
# plot request per object
%matplotlib notebook
plot_data = object_count.reset_index(name='count') \
                             .sort_values(['count'], ascending=False) \
                             .head(50)
exp = plot_data.plot(kind='bar', figsize=(9,5), title='Number of requests per Object (top 50)')
exp.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False) # labels along the bottom edge are off
exp.set_xlabel("Object")
exp.set_ylabel("Request per day")
# plt.savefig("top_50.png", bbox_inches='tight', dpi=600)

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Request per day')

In [96]:
# top object: /javascript/skrypty.php
# 60'000 ms = 1 min
import numpy as np
%matplotlib notebook

time_intervalls = np.arange(0, 86460000, 600000)

top = df.loc[df['Uri'] == "/javascript/skrypty.php"]

out = pd.cut(top["TimeStamp"], bins=time_intervalls, include_lowest=True)
ax = out.value_counts(sort=False).plot.bar(rot=0, color="b", figsize=(9,5), title='Number of request per Time Interval (whole day) \nfor the most popular object')
ax.set_xticks([]) 
ax.set_xlabel("Time Interval (10 min)")
ax.set_ylabel("Request per Interval")
# plt.savefig("requests_10min.png", bbox_inches='tight', dpi=600)

<IPython.core.display.Javascript object>

In [94]:
# top object: /javascript/skrypty.php
# 60'000 ms = 1 min
import numpy as np
%matplotlib notebook

time_intervalls = np.arange(38000000, 39200000, 30000)

top = df.loc[df['Uri'] == "/javascript/skrypty.php"]

out = pd.cut(top["TimeStamp"], bins=time_intervalls, include_lowest=True)
ax = out.value_counts(sort=False).plot.bar(rot=0, color="b", figsize=(9,5), title='Number of request per Time Interval (total of 20 min) \nfor the most popular object')
plt.setp(ax.get_xticklabels(), visible=False)
ax.set_xlabel("Time Interval (30 sec)")
ax.set_ylabel("Request per Interval")
# plt.savefig("requests_30s.png", bbox_inches='tight', dpi=600)

<IPython.core.display.Javascript object>

In [60]:
# Summary statistics of the Bytes column of df, printed rounded to whole bytes.
# NOTE(review): Bytes is recorded per request row, so these figures describe
# response sizes per request rather than per distinct object - confirm intent.
print("Statistics about object size (in Bytes):")
stats = df['Bytes'].describe()
for label in ("mean", "std", "25%", "50%", "75%", "max"):
    print("{}: {:.0f} Bytes".format(label, stats[label]))

Statistics about object size (in Bytes):
mean: 14814 Bytes
std: 50513 Bytes
25%: 3319 Bytes
50%: 6677 Bytes
75%: 12581 Bytes
max: 2085250 Bytes
