In [160]:
#  write a Python function that reads this log file and returns a DataFrame with the following columns: IP, Date, Time, RequestType, URL, StatusCode, ResponseSize. The DataFrame should be sorted by Date and Time.

# Additionally, the function should also calculate and print the following statistics:

# The total number of requests in the log.
# The top 10 IP addresses by the number of requests.
# The top 5 URLs by the number of requests.

In [161]:
# !pip install Faker
from faker import Faker
import random

fake = Faker()

# Define a list of 10 IP addresses and URLs that will repeat
repeat_ips = [fake.ipv4() for _ in range(10)]
repeat_urls = ["/" + fake.uri_path() for _ in range(10)]

log_data = ""

for _ in range(300):
    # For 20% of the entries, use a repeating IP and URL
    if random.random() < 0.2:
        ip = random.choice(repeat_ips)
        url = random.choice(repeat_urls)
    else:
        ip = fake.ipv4()
        url = "/" + fake.uri_path()

    date = fake.date(pattern="%d/%b/%Y")
    time = fake.time(pattern="%H:%M:%S")
    request_type = random.choice(["GET", "POST", "PUT", "DELETE"])
    status_code = random.choice([200, 301, 400, 404, 500])
    response_size = random.randint(1000, 5000)

    log_data += f"{ip} - - [{date}:{time}] \"{request_type} {url} HTTP/1.1\" {status_code} {response_size}\n"

with open('log.txt', 'w') as f:
    f.write(log_data)


In [162]:
def parse_log_line(line):
    
    # # Split the line into fields
    # fields = line.split()
    
#     # Parse the line
    ip = line[0]
    date_time = line[3].strip('[]').split(':')
    date = date_time[0]
    time = ":".join(date_time[1:])
    request_line = " ".join(line[4:7]).strip('""').split()
    request_type = request_line[0]
    url = request_line[1]
    status_code = line[7]
    response_size = line[8]

    return {'IP': ip, 'Date': date, 'Time': time, 'RequestType': request_type, 'URL': url, 'StatusCode': status_code, 'ResponseSize': response_size}

    # print(ip)
    # print(date)
    # print(time)
    # print(request_type)
    # print(url)
    # print(status_code)
    # print(response_size)


In [168]:
import pandas as pd

def df_from_log(log_file):
    list_lines = []
    
    test_file = log_file
    with open(text_file, 'r') as f:
        for line in f:
            dict_line_parsed =  parse_log_line(line.split())
            list_lines.append(dict_line_parsed)

    df_result = pd.DataFrame(list_lines)
    df_result['Date'] = pd.to_datetime(df_result.Date)
    df_result['Time'] = pd.to_datetime(df_result.Time)
    df_result.sort_values(['Date', 'Time'])
    
    num_total_requests = len(df_result)
    num_duplicate_requests = df_result.duplicated().sum()
    num_distinct_requests = num_total_requests - num_duplicate_requests
    
    top10_IP = df_result.groupby('IP')['IP'].count().sort_values().tail(10)
    top5_URL = df_result.groupby('URL')['URL'].count().sort_values().tail(5)
    
    top10_IP_ = df_result['IP'].value_counts().nlargest(10)
    top5_URL_ = df_result['URL'].value_counts().nlargest(5)
    
    return df_result, num_distinct_requests, top10_IP, top5_URL, top10_IP_, top5_URL_

In [169]:
text_file = 'log.txt'

df_result, num_distinct_requests, top10_IP, top5_URL, top10_IP_, top5_URL_ = df_from_log(text_file)
df_result

  df_result['Time'] = pd.to_datetime(df_result.Time)


Unnamed: 0,IP,Date,Time,RequestType,URL,StatusCode,ResponseSize
0,208.147.104.68,1980-08-12,2024-05-13 17:05:45,DELETE,/posts/search/categories,200,2502
1,136.171.6.197,1978-04-22,2024-05-13 11:17:47,DELETE,/tag/list,400,2291
2,75.250.102.197,1976-06-04,2024-05-13 13:51:17,GET,/tag/search/main,200,3081
3,169.190.77.186,1979-06-16,2024-05-13 18:56:00,POST,/list/categories,200,3015
4,37.231.95.39,2015-09-12,2024-05-13 01:29:48,POST,/list,404,3716
...,...,...,...,...,...,...,...
295,72.140.179.81,1984-12-24,2024-05-13 13:25:36,PUT,/blog,404,3128
296,111.13.52.146,2018-06-27,2024-05-13 16:56:06,GET,/explore/category,200,4355
297,193.42.88.180,1996-03-27,2024-05-13 14:02:52,GET,/tag,400,2888
298,173.23.98.64,1999-03-23,2024-05-13 06:03:26,DELETE,/categories/posts/tag,500,3351


In [170]:
df_from_log(text_file)

  df_result['Time'] = pd.to_datetime(df_result.Time)


(                 IP       Date                Time RequestType   
 0    208.147.104.68 1980-08-12 2024-05-13 17:05:45      DELETE  \
 1     136.171.6.197 1978-04-22 2024-05-13 11:17:47      DELETE   
 2    75.250.102.197 1976-06-04 2024-05-13 13:51:17         GET   
 3    169.190.77.186 1979-06-16 2024-05-13 18:56:00        POST   
 4      37.231.95.39 2015-09-12 2024-05-13 01:29:48        POST   
 ..              ...        ...                 ...         ...   
 295   72.140.179.81 1984-12-24 2024-05-13 13:25:36         PUT   
 296   111.13.52.146 2018-06-27 2024-05-13 16:56:06         GET   
 297   193.42.88.180 1996-03-27 2024-05-13 14:02:52         GET   
 298    173.23.98.64 1999-03-23 2024-05-13 06:03:26      DELETE   
 299   94.115.44.104 2018-12-25 2024-05-13 13:34:47      DELETE   
 
                           URL StatusCode ResponseSize  
 0    /posts/search/categories        200         2502  
 1                   /tag/list        400         2291  
 2            /tag/sear