# Use basic probability to identify anomalous requests. Using the methods covered in this lesson, examine the remaining features in the API access logs dataset.

In [3]:
# essential imports
import numpy as np
import pandas as pd
# visuals
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn import metrics

# credentials
import env


In [4]:
# Turn one raw access-log line into a labelled record.
def parse_log_entry(entry):
    """Split a single whitespace-delimited log line into named fields.

    Returns a pd.Series keyed by ip, timestamp, request_method, request_path,
    http_version, status_code, size and user_agent. ``size`` is converted to
    int; every other field is kept as a string.
    """
    tokens = entry.split()
    # Everything from token 11 onward is re-joined, so values containing
    # spaces survive the split; double quotes are stripped out.
    agent = ' '.join(tokens[11:]).replace('"', '')
    return pd.Series({
        'ip': tokens[0],
        # drop the first character, then turn only the first ':' into a space
        'timestamp': tokens[3][1:].replace(':', ' ', 1),
        'request_method': tokens[5][1:],  # drop the first character
        'request_path': tokens[6],
        'http_version': tokens[7][:-1],  # drop the last character
        'status_code': tokens[8],
        'size': int(tokens[9]),
        'user_agent': agent,
    })

In [6]:
# Connection URL for the `logs` database; user, password and host are read from the local env module.
url = f'mysql+pymysql://{env.username}:{env.password}@{env.host}/logs'
# Pull every row of the api_access table into a DataFrame.
df = pd.read_sql('SELECT * FROM api_access', url)
# Unused alternative: keep the raw `entry` text next to its parsed fields.
# df = pd.concat([df.entry, df.entry.apply(parse_log_entry)], axis=1)


In [8]:
# Replace the raw frame with one row of parsed fields per log entry.
# NOTE(review): this rebinds `df`, so the `entry` column is gone afterwards;
# re-running this cell without first re-running the load cell will fail.
df = df.entry.apply(parse_log_entry)

In [11]:
# Dimensions of the parsed frame as (rows, columns).
df.shape

(13974, 8)

# explore

In [19]:
def value_counts_and_frequencies(s: pd.Series, dropna=True) -> pd.DataFrame:
    """Tabulate how often each distinct value of ``s`` occurs.

    Parameters
    ----------
    s : pd.Series
        Values to tabulate.
    dropna : bool, default True
        When True, missing values are left out of both the counts and the
        denominator of the probabilities. When False, missing values get
        their own row.

    Returns
    -------
    pd.DataFrame
        Indexed by distinct value, most frequent first, with columns
        ``count`` (number of occurrences) and ``proba`` (count divided by
        the total number of counted observations).
    """
    # Honor the caller's ``dropna``; it used to be accepted but ignored in
    # favour of a hard-coded ``dropna=False``.
    counts = s.value_counts(dropna=dropna)
    # Same arithmetic as value_counts(normalize=True), without a second pass
    # over the data and an index merge.
    return pd.DataFrame({'count': counts, 'proba': counts / counts.sum()})


# IP

In [20]:
# Overall count and share of requests for each source IP.
ip_df = value_counts_and_frequencies(df.ip)


In [21]:
# Show the per-IP counts and probabilities.
ip_df

Unnamed: 0,count,proba
97.105.19.58,11998,0.858595
173.173.113.51,1059,0.075784
72.181.113.170,613,0.043867
72.181.105.81,246,0.017604
24.26.242.9,21,0.001503
68.201.219.223,21,0.001503
70.121.214.34,2,0.000143
52.87.230.102,2,0.000143
35.175.171.137,2,0.000143
54.145.52.184,1,7.2e-05


# Probability of each status code given the IP

In [22]:
# P(status_code | ip): within each IP, the share of its requests that
# returned each status code.
status_share_by_ip = df.groupby('ip')['status_code'].value_counts(normalize=True)
status_given_ip = status_share_by_ip.rename('proba_status_given_ip').reset_index()
status_given_ip

Unnamed: 0,ip,status_code,proba_status_given_ip
0,173.173.113.51,200,1.0
1,24.26.242.9,200,1.0
2,3.88.129.158,200,1.0
3,3.92.201.136,200,1.0
4,34.207.64.242,200,1.0
5,34.229.70.250,200,1.0
6,35.174.209.2,200,1.0
7,35.175.171.137,200,1.0
8,45.23.250.16,200,1.0
9,52.87.230.102,200,1.0


# request_method

In [25]:
# Overall count and share of requests for each HTTP request method.
rm_df = value_counts_and_frequencies(df.request_method)

In [26]:
# Show the per-method counts and probabilities.
rm_df

Unnamed: 0,count,proba
GET,13974,1.0


In [31]:
# P(request_method | ip): within each IP, the share of its requests that
# used each HTTP method.
# Stored under its own name so the status-code table already held in
# `status_given_ip` is not overwritten by this unrelated result.
request_method_given_ip = (
    df.groupby('ip')
    .request_method.value_counts(normalize=True)
    .rename('proba_request')
    .reset_index()
)
request_method_given_ip

Unnamed: 0,ip,request_method,proba_request
0,173.173.113.51,GET,1.0
1,24.26.242.9,GET,1.0
2,3.88.129.158,GET,1.0
3,3.92.201.136,GET,1.0
4,34.207.64.242,GET,1.0
5,34.229.70.250,GET,1.0
6,35.174.209.2,GET,1.0
7,35.175.171.137,GET,1.0
8,45.23.250.16,GET,1.0
9,52.87.230.102,GET,1.0


# request_path

In [32]:
# Overall count and share of requests for each request path.
# Stored under its own name so the request-method table already held in
# `rm_df` is not overwritten by this unrelated result.
request_path_df = value_counts_and_frequencies(df.request_path)

In [33]:
# P(request_path | ip): within each IP, the share of its requests that hit
# each path.
path_share_by_ip = df.groupby('ip')['request_path'].value_counts(normalize=True)
request_path_ip = path_share_by_ip.rename('proba_request_path').reset_index()
request_path_ip

Unnamed: 0,ip,request_path,proba_request_path
0,173.173.113.51,/api/v1/items,0.060434
1,173.173.113.51,/api/v1/items?page=2,0.058546
2,173.173.113.51,/api/v1/items?page=3,0.058546
3,173.173.113.51,/api/v1/stores,0.050992
4,173.173.113.51,/api/v1/sales,0.007554
...,...,...,...
811,97.105.19.58,/api/v1/items?page=99999999999999999998,0.000083
812,97.105.19.58,/api/v1/stores?page=2,0.000083
813,97.105.19.58,/api/v1/stores?page=666,0.000083
814,97.105.19.58,/api/v1/stores?page=999,0.000083


In [35]:
# Overall count and share of requests for each HTTP version.
http_version_df = value_counts_and_frequencies(df.http_version)

In [36]:
# Show the per-version counts and probabilities.
http_version_df

Unnamed: 0,count,proba
HTTP/1.1,13974,1.0


In [39]:
# P(http_version | ip): within each IP, the share of its requests made with
# each HTTP version.
version_share_by_ip = df.groupby('ip')['http_version'].value_counts(normalize=True)
http_version_ip = version_share_by_ip.rename('proba_http_version_df').reset_index()
http_version_ip

Unnamed: 0,ip,http_version,proba_http_version_df
0,173.173.113.51,HTTP/1.1,1.0
1,24.26.242.9,HTTP/1.1,1.0
2,3.88.129.158,HTTP/1.1,1.0
3,3.92.201.136,HTTP/1.1,1.0
4,34.207.64.242,HTTP/1.1,1.0
5,34.229.70.250,HTTP/1.1,1.0
6,35.174.209.2,HTTP/1.1,1.0
7,35.175.171.137,HTTP/1.1,1.0
8,45.23.250.16,HTTP/1.1,1.0
9,52.87.230.102,HTTP/1.1,1.0


# user_agent

In [40]:
# Overall count and share of requests for each user agent string.
user_agent_df = value_counts_and_frequencies(df.user_agent)

In [41]:
# Show the per-user-agent counts and probabilities.
user_agent_df

Unnamed: 0,count,proba
python-requests/2.21.0,12001,0.858809
python-requests/2.20.1,1911,0.136754
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",34,0.002433
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0,8,0.000572
Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots),7,0.000501
Slackbot 1.0 (+https://api.slack.com/robots),6,0.000429
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",4,0.000286
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",2,0.000143
Python-urllib/3.7,1,7.2e-05


In [42]:
# P(user_agent | ip): within each IP, the share of its requests sent with
# each user agent string.
agent_share_by_ip = df.groupby('ip')['user_agent'].value_counts(normalize=True)
user_agent_ip = agent_share_by_ip.rename('proba_user_agent_df').reset_index()
user_agent_ip

Unnamed: 0,ip,user_agent,proba_user_agent_df
0,173.173.113.51,python-requests/2.21.0,1.0
1,24.26.242.9,python-requests/2.21.0,1.0
2,3.88.129.158,Slackbot-LinkExpanding 1.0 (+https://api.slack...,1.0
3,3.92.201.136,Slackbot-LinkExpanding 1.0 (+https://api.slack...,1.0
4,34.207.64.242,Slackbot 1.0 (+https://api.slack.com/robots),1.0
5,34.229.70.250,Slackbot 1.0 (+https://api.slack.com/robots),1.0
6,35.174.209.2,Slackbot 1.0 (+https://api.slack.com/robots),1.0
7,35.175.171.137,Slackbot-LinkExpanding 1.0 (+https://api.slack...,1.0
8,45.23.250.16,python-requests/2.21.0,1.0
9,52.87.230.102,Slackbot 1.0 (+https://api.slack.com/robots),0.5


In [34]:
# Peek at a single parsed row to recall which columns are available.
df.head(1)

Unnamed: 0,ip,timestamp,request_method,request_path,http_version,status_code,size,user_agent
0,97.105.19.58,16/Apr/2019 19:34:42,GET,/api/v1/sales?page=81,HTTP/1.1,200,512495,python-requests/2.21.0


# Size

In [54]:
# Number of distinct response sizes seen across all requests.
df['size'].nunique() 

187