In [297]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
import urllib.request
import json

In [298]:
def html_to_df(url, clean_empty = False , attrs = {}, helper = None):
    '''
    Download a web page and load its first matching <table> into a DataFrame.

    input:
        url : address of the page to download.
        clean_empty : when True, drop every column whose values are all null.
        attrs : attribute filter handed to BeautifulSoup's find() to select
                the table (never modified here).
        helper : optional callable applied to the DataFrame before returning.
    output:
        the DataFrame read from the table, after the optional clean-up and
        helper steps.
    raises:
        requests.HTTPError if the server answers with an error status.
        ValueError if the page contains no matching <table>.
    '''
    from io import StringIO

    response = requests.get(url)
    # Stop on 4xx/5xx instead of trying to parse an error page as a table.
    response.raise_for_status()

    soup = bs(response.text, "lxml")
    table = soup.find("table", attrs=attrs)
    if table is None:
        # Previously the literal text "None" was handed to read_html here.
        raise ValueError("no <table> matching %r found at %s" % (attrs, url))

    # read_html expects a file-like object; passing literal HTML is deprecated.
    df = pd.read_html(StringIO(str(table)))[0]

    if clean_empty :
        df = df.loc[:, ~df.isnull().all(axis = 0)]

    if helper:
        df = helper(df)
    return df

In [299]:
# Access-log sources. These are GitHub "blob" page addresses, not raw-file links.
url = "https://github.com/ayedaemon/RuckSack-Python/blob/master/log_analysis/access_log"
url1 = "https://github.com/ayedaemon/RuckSack-Python/blob/master/log_analysis/access_log2"
# NOTE(review): url2 is not referenced by any cell visible in this notebook.
url2 = "https://github.com/robert456456456456/Web_server_log_parser/blob/master/devops.log"

In [300]:
def helper(df):
    '''
    Tokenise the log text in column 1 on whitespace and tidy the result.

    input:
        df : DataFrame whose column 1 holds one log string per row.
    output:
        DataFrame of whitespace tokens where the brackets are removed from
        columns 3 and 4, column 5 holds tokens 5 and 6 joined by a space
        with double quotes removed, and columns 1, 2, 6 and 7 are dropped.
    '''
    tokens = pd.DataFrame([line.split() for line in df[1]])

    # Remove the square brackets around the timestamp / offset tokens.
    tokens[3] = tokens[3].map(lambda text: text.replace('[', ''))
    tokens[4] = tokens[4].map(lambda text: text.replace(']', ''))

    # Merge tokens 5 and 6 into a single quote-free value.
    tokens[5] = [
        ' '.join([first, second]).replace('"', '')
        for first, second in zip(tokens[5], tokens[6])
    ]

    return tokens.drop(columns=[1, 2, 7, 6])

In [301]:
def log_parser(str_):
    '''
    Split one access-log line into its individual fields.

    input:
        str_ : log string.
    output:
        return a dictionary which contain all element of log string, with the
        keys ip, RFC931, user, date, gmt, action, status, size, referrer and
        browser (inserted in that order). A field that cannot be located in
        the line is set to '-'.
    '''
    def _pick(items, index):
        # Positional lookup that falls back to the '-' placeholder.
        return items[index] if index < len(items) else '-'

    line = str_.strip()
    tokens = line.split()

    # Timestamp and offset are the first two tokens after the first '['.
    bracket_split = line.split('[')
    stamp = bracket_split[1].split() if len(bracket_split) > 1 else []

    # Double-quoted sections: [1] request, [3] referrer, [5] browser;
    # section [2] (between request and referrer) carries status and size.
    quoted = line.split('"')
    request = quoted[1].strip() if len(quoted) > 1 else ''
    words = request.split()
    if words and 'HTTP' in words[-1]:
        # Cut only the trailing protocol token. A str.replace() of the third
        # token would also damage paths that contain the same text or spaces.
        request = request[:len(request) - len(words[-1])].strip()
    after_request = quoted[2].split() if len(quoted) > 2 else []

    find = {}
    find['ip'] = _pick(tokens, 0)
    find['RFC931'] = _pick(tokens, 1)
    find['user'] = _pick(tokens, 2)
    find['date'] = _pick(stamp, 0)
    find['gmt'] = _pick(stamp, 1).strip(']')
    find['action'] = request or '-'
    find['status'] = _pick(after_request, 0)
    find['size'] = _pick(after_request, 1)
    find['referrer'] = _pick(quoted, 3)
    find['browser'] = _pick(quoted, 5)

    return find


In [302]:
def log_df(df,col_name ,columns = [] ):
    '''
    Turn a column of raw log strings into a DataFrame with one column per field.

    input:
        df : DataFrame holding the raw log strings.
        col_name : label of the column that contains the log strings.
        columns : optional replacement column names; applied only when their
                  number equals the number of parsed columns.
    output:
        DataFrame built from the log_parser result of every row.
    '''
    records = [log_parser(line) for line in df[col_name]]
    parsed = pd.DataFrame(records)

    # Rename only when the supplied names line up one-to-one with the fields.
    if len(columns) == len(parsed.columns):
        parsed.columns = columns
    return parsed

In [303]:
# Load both log pages as tables (all-null columns dropped), then stack them
# into a single frame with a fresh 0..n-1 index.
df1 = html_to_df(url,clean_empty=True)
df2 = html_to_df(url1,clean_empty=True)
df = pd.concat([df1, df2],ignore_index=True)

In [304]:
# Output column names, listed in the same order as the keys built by log_parser.
columns = ["User_ID","RFC931","User","date","gmt","action","status","size","referrer","browser"]
# Parse the raw log text held in column 1 into one column per field.
# NOTE: df is rebound to the parsed frame, so re-running this cell requires
# reloading the raw frame first (column 1 no longer exists afterwards).
df=log_df(df,1,columns)

In [305]:
# Preview the first parsed log records.
df.head()

Unnamed: 0,User_ID,RFC931,User,date,gmt,action,status,size,referrer,browser
0,127.0.0.1,-,-,15/Jan/2021:08:54:07,0,GET /,403,4006,-,curl/7.61.1
1,127.0.0.1,-,-,15/Jan/2021:08:55:23,0,GET /cgi-bin/weakform.py,403,199,-,curl/7.61.1
2,127.0.0.1,-,-,15/Jan/2021:09:01:38,0,GET /cgi-bin/weak_form.py,200,19,-,curl/7.61.1
3,172.17.0.1,-,-,15/Jan/2021:09:02:14,0,GET /,403,4288,-,Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko...
4,172.17.0.1,-,-,15/Jan/2021:09:02:15,0,GET /noindex/common/css/bootstrap.min.css,200,99548,http://172.17.0.2/,Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko...


In [306]:
def XSS_finder(str_):
    '''
    Heuristically flag a request string that looks like an XSS attempt.

    The request is flagged when, after percent-decoding, it contains one of
    the characters <, >, backslash or backtick anywhere, or one of /, ) or (
    in the part that follows the first '?'.

    Percent-encodings are matched regardless of hex-digit case, so both
    %3c and %3C count as '<'.

    input:
        str_ : request string (method and URL) taken from the log.
    output:
        True when the request matches the heuristic, otherwise False.
    '''
    from urllib.parse import unquote

    markup_chars = ['<','>','\\','`']
    query_chars = ['/',')','(']

    # Decoding first covers the raw and the percent-encoded spelling at once.
    decoded = unquote(str_)
    if any(char in decoded for char in markup_chars):
        return True

    # Split on the raw '?' before decoding so an encoded %3F cannot move the
    # start of the query part.
    query = unquote(''.join(str_.split('?')[1:]))
    return any(char in query for char in query_chars)

In [307]:
# Flag every request whose action string matches the XSS heuristic.
df['XSS_attack']=df['action'].apply(XSS_finder)

In [308]:
# Show the frame together with the new XSS_attack column.
df

Unnamed: 0,User_ID,RFC931,User,date,gmt,action,status,size,referrer,browser,XSS_attack
0,127.0.0.1,-,-,15/Jan/2021:08:54:07,+0000,GET /,403,4006,-,curl/7.61.1,False
1,127.0.0.1,-,-,15/Jan/2021:08:55:23,+0000,GET /cgi-bin/weakform.py,403,199,-,curl/7.61.1,False
2,127.0.0.1,-,-,15/Jan/2021:09:01:38,+0000,GET /cgi-bin/weak_form.py,200,19,-,curl/7.61.1,False
3,172.17.0.1,-,-,15/Jan/2021:09:02:14,+0000,GET /,403,4288,-,Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko...,False
4,172.17.0.1,-,-,15/Jan/2021:09:02:15,+0000,GET /noindex/common/css/bootstrap.min.css,200,99548,http://172.17.0.2/,Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko...,False
...,...,...,...,...,...,...,...,...,...,...,...
1795,172.17.0.1,-,-,16/Jan/2021:13:02:17,+0000,GET /cgi-bin/weak_form.py?TWKAXWKIA=KOC,200,323,http://www.google.com/?q=SHJUUEA,Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US...,False
1796,172.17.0.1,-,-,16/Jan/2021:13:02:18,+0000,GET /cgi-bin/weak_form.py?TRHYOWPDR=HRBCPR,200,323,http://www.usatoday.com/search/results?q=UKLAE...,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,False
1797,172.17.0.1,-,-,16/Jan/2021:13:02:21,+0000,GET /cgi-bin/weak_form.py?OQUED=RCPFFTG,200,323,http://www.usatoday.com/search/results?q=CRLDA...,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,False
1798,172.17.0.1,-,-,16/Jan/2021:13:02:18,+0000,GET /cgi-bin/weak_form.py?JVCSMGFH=CCJ,200,323,http://www.google.com/?q=CJBONBRR,Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1...,False


In [309]:
# Keep only the requests flagged by the XSS heuristic (the column is already boolean).
df[df['XSS_attack']]

Unnamed: 0,User_ID,RFC931,User,date,gmt,action,status,size,referrer,browser,XSS_attack
754,172.17.0.1,-,-,15/Jan/2021:10:05:35,0,GET /cgi-bin/weak_form.py?fname=asdasd%3C%2Fp%...,200,355,http://172.17.0.2/cgi-bin/weak_form.py?fname=s...,Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko...,True
757,172.17.0.1,-,-,16/Jan/2021:05:29:02,0,GET /cgi-bin/weak_form.py?fname=abcd&lname=111...,200,328,-,curl/7.74.0,True
758,172.17.0.1,-,-,16/Jan/2021:05:30:04,0,GET /cgi-bin/weak_form.py?fname=abcd&lname=111...,200,340,-,curl/7.74.0,True
759,172.17.0.1,-,-,16/Jan/2021:05:30:42,0,GET /cgi-bin/weak_form.py?fname=abcd&lname=111...,200,340,-,curl/7.74.0,True
760,172.17.0.1,-,-,16/Jan/2021:05:30:57,0,GET /cgi-bin/weak_form.py?fname=abcd&lname=111...,200,341,-,curl/7.74.0,True
761,172.17.0.1,-,-,16/Jan/2021:05:31:04,0,GET /cgi-bin/weak_form.py?fname=abcd&lname=111...,200,357,-,curl/7.74.0,True
762,172.17.0.1,-,-,16/Jan/2021:05:31:24,0,GET /cgi-bin/weak_form.py?fname=abcd&lname=111...,200,357,-,curl/7.74.0,True
763,172.17.0.1,-,-,16/Jan/2021:05:31:28,0,GET /cgi-bin/weak_form.py?fname=abcd&lname=111...,200,340,-,curl/7.74.0,True
764,172.17.0.1,-,-,16/Jan/2021:05:31:34,0,GET /cgi-bin/weak_form.py?fname=abcd&lname=111...,200,357,-,curl/7.74.0,True
765,172.17.0.1,-,-,16/Jan/2021:05:32:02,0,GET /cgi-bin/weak_form.py?fname=abcd&lname=111...,200,357,-,curl/7.74.0,True


## Hits Vs Time 

In [310]:
import plotly.express as px
import plotly.graph_objs as go

In [311]:
# Convert timestamp strings such as 15/Jan/2021:08:54:07 into datetimes.
# The offset is held in the separate gmt column and is not applied here.
df['date'] = pd.to_datetime(df['date'], format = '%d/%b/%Y:%H:%M:%S')

In [312]:
# Layout for the hits-over-time chart: fixed size, title and coloured axis labels.
layout = go.Layout(height=600, width=1000,title='Hits Vs Time', xaxis=dict(title='Date',color='orange'),
                   yaxis=dict(title='Hits',color='blue'))

In [313]:
# Count the requests per timestamp once and order them chronologically, so
# each x value is plotted against its own count. Taking x from unique() and
# y from value_counts(sort=False) relies on both happening to share an order.
hits_per_time = df['date'].value_counts().sort_index()
fig = px.line(x=hits_per_time.index, y=hits_per_time.values)
fig.layout=layout
fig.show()

## Status Vs Time

In [314]:
# Rebind layout for the status-over-time chart (same size, new title and labels).
layout = go.Layout(height=600, width=1000,title='Status Vs Time', xaxis=dict(title='Date',color='orange'),
                   yaxis=dict(title='Status',color='green'))
# NOTE(review): status holds the string tokens produced by log_parser and the
# rows are plotted in log order, not sorted by date — confirm this is intended.
fig = px.line(x=df['date'], y=df['status'])
fig.layout=layout
fig.show()

## Most and Least Frequent Visitors (by Client IP)

In [315]:
def most_least_visited_customer(df,n=1,m=1):
    '''
    Report the User_ID values with the most and the fewest rows.

    input:
        df : DataFrame with a User_ID column.
        n : how many of the most frequent User_ID values to return.
        m : how many of the least frequent User_ID values to return.
    output:
        dict with the keys 'Most visited customer' and
        'Least visited customer', each mapping to a list of User_ID values.
    '''
    # value_counts() orders by descending frequency: head = most, tail = least.
    visits = df['User_ID'].value_counts()
    most = list(visits.head(n).index)
    least = list(visits.tail(m).index)
    return {'Most visited customer': most, 'Least visited customer': least}

In [316]:
# Single most and least frequent User_ID (defaults n=1, m=1).
most_least_visited_customer(df)

{'Most visited customer': ['172.17.0.1'],
 'Least visited customer': ['127.0.0.1']}

## Most and Least Visited Webpages

In [317]:
def most_least_visited_webpages(df,n=1,m=1):
    '''
    Report the action values with the most and the fewest rows.

    input:
        df : DataFrame with an action column.
        n : how many of the most frequent action values to return.
        m : how many of the least frequent action values to return.
    output:
        dict with the keys 'Most visited webpage' and
        'Least visited webpage', each mapping to a list of action values.
    '''
    # value_counts() orders by descending frequency: head = most, tail = least.
    hits = df['action'].value_counts()
    most = list(hits.head(n).index)
    least = list(hits.tail(m).index)
    return {'Most visited webpage': most, 'Least visited webpage': least}

In [318]:
# Single most and least frequent action string (defaults n=1, m=1).
most_least_visited_webpages(df)

{'Most visited webpage': ['GET /cgi-bin/weak_form.py'],
 'Least visited webpage': ['GET /cgi-bin/weak_form.py?GCIDXEKGU=KDXKSEE']}

In [319]:
from tqdm.notebook import tqdm

def loc_dict_maker(unq_series):
    '''
    Look up location data for every ip address in unq_series.

    input:
        series: series of unique ip address.
    output:
        res: dict keyed by ip address; each value is a dict with
             country_code, latitude, longitude and alpha_3.
    '''
    try:
        # pycountry is needed only for the alpha_3 code. It was referenced
        # without ever being imported, which always produced 'Not found'.
        import pycountry
    except ImportError:
        pycountry = None

    def ip_loc(x):
        '''
        input:
            x : ip address
        output:
            dictionary which contain country code, latitude, longitude and
            the three-letter country code ('Not found' when unavailable).
        '''
        endpoint = "https://geolocation-db.com/jsonp/"+x
        with urllib.request.urlopen(endpoint) as response:
            payload = response.read().decode()
        # The body is parsed as JSON wrapped in a call, name({...}); keep the
        # text between the first '(' and the last ')'.
        data = json.loads(payload[payload.index("(") + 1:payload.rindex(")")])
        res = {"country_code":data["country_code"], 'latitude':data['latitude'],'longitude':data['longitude']}
        try:
            # Look the country up by the code just received (the name used
            # here before, `country`, was never defined).
            res['alpha_3'] = pycountry.countries.get(alpha_2=res["country_code"]).alpha_3
        except (AttributeError, LookupError):
            # pycountry missing, or no country matches the code.
            res['alpha_3'] = 'Not found'
        return res

    result = {}
    for ip in tqdm(unq_series):
        result[ip] = ip_loc(ip)
    return result

In [320]:
def add_location_data(df,column,keys):
    '''
    Add location columns to df, looked up from the ip addresses in column.

    Each distinct address is looked up once. The new columns are written
    into the DataFrame that was passed in, which is also returned.

    input:
        df : DataFrame to extend.
        column : label of the column holding the ip addresses.
        keys : names of the location fields to add as columns; available
               fields are country_code, latitude, longitude and alpha_3.
    output:
        df with one additional column per entry in keys.
    '''
    try:
        # pycountry is needed only for the alpha_3 code. It was referenced
        # without ever being imported, which always produced 'Not found'.
        import pycountry
    except ImportError:
        pycountry = None

    def ip_loc(x):
        '''
        input:
            x : ip address
        output:
            dictionary which contain country code, latitude, longitude and
            the three-letter country code ('Not found' when unavailable).
        '''
        endpoint = "https://geolocation-db.com/jsonp/"+x
        with urllib.request.urlopen(endpoint) as response:
            payload = response.read().decode()
        # The body is parsed as JSON wrapped in a call, name({...}); keep the
        # text between the first '(' and the last ')'.
        data = json.loads(payload[payload.index("(") + 1:payload.rindex(")")])
        res = {"country_code":data["country_code"], 'latitude':data['latitude'],'longitude':data['longitude']}
        try:
            res['alpha_3'] = pycountry.countries.get(alpha_2=res["country_code"]).alpha_3
        except (AttributeError, LookupError):
            # pycountry missing, or no country matches the code.
            res['alpha_3'] = 'Not found'
        return res

    # One request per distinct address, then reuse the result for every row.
    loc_dict = {}
    for ip in tqdm(df[column].unique()):
        loc_dict[ip] = ip_loc(ip)

    for key in keys:
        df[key] = df[column].apply(lambda x: loc_dict[x][key])
    return df

In [321]:
# Location fields to attach to every row, looked up from the User_ID (ip) column.
keys = ['country_code', 'latitude', 'longitude', 'alpha_3']
df = add_location_data(df,'User_ID',keys)
df.head()

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,User_ID,RFC931,User,date,gmt,action,status,size,referrer,browser,XSS_attack,country_code,latitude,longitude,alpha_3
0,127.0.0.1,-,-,2021-01-15 08:54:07,0,GET /,403,4006,-,curl/7.61.1,False,Not found,Not found,Not found,Not found
1,127.0.0.1,-,-,2021-01-15 08:55:23,0,GET /cgi-bin/weakform.py,403,199,-,curl/7.61.1,False,Not found,Not found,Not found,Not found
2,127.0.0.1,-,-,2021-01-15 09:01:38,0,GET /cgi-bin/weak_form.py,200,19,-,curl/7.61.1,False,Not found,Not found,Not found,Not found
3,172.17.0.1,-,-,2021-01-15 09:02:14,0,GET /,403,4288,-,Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko...,False,Not found,Not found,Not found,Not found
4,172.17.0.1,-,-,2021-01-15 09:02:15,0,GET /noindex/common/css/bootstrap.min.css,200,99548,http://172.17.0.2/,Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko...,False,Not found,Not found,Not found,Not found


In [328]:
def geo_plot(country_series,colorbar_title,title_text):
    '''
    Draw a choropleth map coloured by how often each country code occurs.

    input:
        country_series : series of three-letter (ISO alpha-3) country codes,
                         one entry per row to count.
        colorbar_title : caption shown next to the colour bar.
        title_text : title of the figure.
    output:
        None; the figure is displayed.
    '''
    counts = country_series.value_counts()
    fig = go.Figure(data=go.Choropleth(
        locations=counts.index, # Country codes to colour
        z=counts.values, # Data to be color-coded
        locationmode='ISO-3', # `locations` holds alpha-3 country codes, not US states
        colorscale='Reds',
        colorbar_title=colorbar_title, # was a hard-coded sample caption
    ))

    # Use the caller's title; the map is no longer restricted to the USA
    # because the locations are countries.
    fig.update_layout(title_text=title_text)
    fig.show()

In [329]:
# Plot only the rows whose alpha_3 code was resolved.
geo_plot(df[df['alpha_3']!= 'Not found']['alpha_3'],"No. of Users","User's Geolocation Data")

In [324]:
from plotly.offline import  iplot
def geo_plotting(country_series,colorbar_title,title):
    '''
    Render a choropleth map of how often each location code occurs.

    input:
        country_series : series of location codes, one entry per row to count.
        colorbar_title : caption of the colour bar.
        title : title of the figure.
    output:
        None; the figure is rendered with iplot.
    '''
    counts = country_series.value_counts()

    trace = {
        'type': 'choropleth',
        'locations': counts.index,
        'z': counts.values,
        'colorbar': {'title': colorbar_title},
    }
    figure_layout = {
        'title': title,
        'geo': {
            'showframe': False,
            'projection': {'type': 'natural earth'},
        },
    }

    iplot(go.Figure(data=[trace], layout=figure_layout))

In [325]:
# Plot only the rows whose alpha_3 code was resolved.
geo_plotting(df[df['alpha_3']!= 'Not found']['alpha_3'],"No. of Users","User's Geolocation Data")