In [1]:
import pandas as pd
# Path of the comma-separated access log analysed throughout this notebook.
filename = "sample-log.txt"

In [2]:
# The log file has no header row, so the column names are supplied here.
df = pd.read_csv(
    filename,
    sep=",",
    names=["timestamp", "ip_add", "statuscode", "endpoint"],
)

In [3]:
# Preview the first rows to check that the four columns were parsed as expected.
df.head()

Unnamed: 0,timestamp,ip_add,statuscode,endpoint
0,2024-03-18 10:15:23,192.168.1.101,200,/home
1,2024-03-18 10:15:25,192.168.1.102,404,/profile/user123
2,2024-03-18 10:15:30,192.168.1.101,200,/about
3,2024-03-18 10:15:45,192.168.1.103,200,/products
4,2024-03-18 10:16:01,192.168.1.104,500,/api/data


# Parse Log Line Function

In [4]:
def get_parse_log_line(filename):
    """Print every record of a comma-separated log file as a 4-tuple.

    Each non-blank line is expected to hold exactly four comma-separated
    fields: timestamp, IP address, status code and endpoint. All fields are
    kept as strings and each record is printed as a tuple, one per line.

    Args:
        filename: Path of the log file to read.

    Raises:
        ValueError: If a non-blank line does not split into exactly four
            fields.
    """
    with open(filename, 'r') as logfile:
        for line_number, line in enumerate(logfile, start=1):
            record = line.strip()
            # Blank lines (e.g. an empty line at the end of the file) carry
            # no record; skip them instead of failing on the field check.
            if not record:
                continue
            fields = record.split(',')
            if len(fields) != 4:
                # Name the offending line so a malformed log is easy to fix.
                raise ValueError(
                    f"{filename}: line {line_number}: expected 4 "
                    f"comma-separated fields, got {len(fields)}"
                )
            print(tuple(fields))

In [5]:
# Example: print every parsed record of the sample log file.

get_parse_log_line(filename)

('2024-03-18 10:15:23', '192.168.1.101', '200', '/home')
('2024-03-18 10:15:25', '192.168.1.102', '404', '/profile/user123')
('2024-03-18 10:15:30', '192.168.1.101', '200', '/about')
('2024-03-18 10:15:45', '192.168.1.103', '200', '/products')
('2024-03-18 10:16:01', '192.168.1.104', '500', '/api/data')
('2024-03-18 10:16:15', '192.168.1.102', '200', '/home')
('2024-03-18 10:16:30', '192.168.1.105', '404', '/blog/post123')
('2024-03-18 10:16:45', '192.168.1.101', '200', '/products')
('2024-03-18 10:17:00', '192.168.1.106', '200', '/home')
('2024-03-18 10:17:15', '192.168.1.107', '403', '/admin')
('2024-03-18 10:17:30', '192.168.1.102', '200', '/about')
('2024-03-18 10:17:45', '192.168.1.108', '200', '/contact')
('2024-03-18 10:18:00', '192.168.1.109', '500', '/api/users')
('2024-03-18 10:18:15', '192.168.1.101', '200', '/home')
('2024-03-18 10:18:30', '192.168.1.110', '200', '/products')
('2024-03-18 10:18:45', '192.168.1.102', '404', '/images/logo.png')
('2024-03-18 10:19:00', '192.16

# Unique Visitors Count

In [6]:
def get_unique_visitors(df):
    """Return the number of distinct values in the "ip_add" column of ``df``.

    Missing values are not counted (the default behaviour of ``nunique``).
    """
    visitor_ips = df["ip_add"]
    distinct_count = visitor_ips.nunique()
    return distinct_count

# Most Popular Endpoints

In [7]:
def get_popular_endpoints(df, top_n=5) :
    """Return the most frequently requested endpoints.

    Args:
        df: DataFrame with an "endpoint" column.
        top_n: Maximum number of entries to return.

    Returns:
        A list of ``(endpoint, hit_count)`` tuples ordered by descending
        hit count; endpoints with equal counts keep first-seen order.
    """
    hits = {}
    for endpoint in df["endpoint"]:
        if endpoint in hits:
            hits[endpoint] += 1
        else:
            hits[endpoint] = 1

    ranked = list(hits.items())
    # list.sort is stable, so ties stay in the order they were first seen.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Error Rate

In [8]:
def get_error_rate(df) :
    """Return the percentage of requests whose status code is 400 or above.

    Args:
        df: DataFrame with a numeric "statuscode" column.

    Returns:
        The error percentage as a float rounded to two decimals, or 0.0
        when ``df`` has no rows.
    """
    status_codes = df["statuscode"]
    total = len(status_codes)
    # An empty log has no requests at all; report 0.0 rather than dividing
    # by zero.
    if total == 0:
        return 0.0
    # Vectorised comparison; int() keeps the result a plain Python float.
    errors = int((status_codes >= 400).sum())
    return round(errors / total * 100, 2)

## FINAL SUMMARY REPORT

In [9]:
def generate_report(filename):
    """Print the parsed log lines followed by summary statistics.

    The file is first echoed record by record via ``get_parse_log_line`` and
    then loaded into a DataFrame to compute the unique visitor count, the
    five most popular endpoints and the error percentage.

    Args:
        filename: Path of the comma-separated log file.
    """
    column_names = ["timestamp", "ip_add", "statuscode", "endpoint"]

    print(f"Parsing Log Lines :")
    get_parse_log_line(filename)

    df = pd.read_csv(filename, sep=",", names=column_names)
    unique = get_unique_visitors(df)
    pop = get_popular_endpoints(df, top_n=5)
    error_per = get_error_rate(df)

    # One print call with a newline separator emits the same three lines.
    print(
        f"\n\n\nUnique Visitors Count : {unique}",
        f"Most Popular Endpoints : {pop}",
        f"Error Percentage : {error_per}",
        sep="\n",
    )

In [10]:
# Run the full report (parsed lines, then the summary statistics) on the sample log.
generate_report(filename)

Parsing Log Lines :
('2024-03-18 10:15:23', '192.168.1.101', '200', '/home')
('2024-03-18 10:15:25', '192.168.1.102', '404', '/profile/user123')
('2024-03-18 10:15:30', '192.168.1.101', '200', '/about')
('2024-03-18 10:15:45', '192.168.1.103', '200', '/products')
('2024-03-18 10:16:01', '192.168.1.104', '500', '/api/data')
('2024-03-18 10:16:15', '192.168.1.102', '200', '/home')
('2024-03-18 10:16:30', '192.168.1.105', '404', '/blog/post123')
('2024-03-18 10:16:45', '192.168.1.101', '200', '/products')
('2024-03-18 10:17:00', '192.168.1.106', '200', '/home')
('2024-03-18 10:17:15', '192.168.1.107', '403', '/admin')
('2024-03-18 10:17:30', '192.168.1.102', '200', '/about')
('2024-03-18 10:17:45', '192.168.1.108', '200', '/contact')
('2024-03-18 10:18:00', '192.168.1.109', '500', '/api/users')
('2024-03-18 10:18:15', '192.168.1.101', '200', '/home')
('2024-03-18 10:18:30', '192.168.1.110', '200', '/products')
('2024-03-18 10:18:45', '192.168.1.102', '404', '/images/logo.png')
('2024-03-1