<a href="https://colab.research.google.com/github/amruthamodem/Colour-Detection/blob/main/exp_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import pandas as pd
from datetime import datetime, timedelta

# Compiled pattern for one access-log line of the form:
#   <ipv4> - - [dd/Mon/yyyy:HH:MM:SS +zzzz] "<METHOD> <url> HTTP/x.y" <status> <size>
# Each field is exposed as a named group so callers can use m.groupdict().
main_pattern = re.compile(
    r'(?P<ip>\d{1,3}(?:\.\d{1,3}){3}) - - ' # IPv4 address, then the two literal "-" placeholder fields
    r'\[(?P<date>\d{2}/\w{3}/\d{4}):(?P<time>\d{2}:\d{2}:\d{2}) (?P<timezone>[+-]\d{4})\] ' # Timestamp and UTC offset
    r'"(?P<method>\w+) (?P<url>\S+) (?P<protocol>HTTP/\d\.\d)" ' # Method, URL, and protocol
    r'(?P<status>\d{3}) ' # Three-digit status code
    # Response size in bytes. Servers write "-" instead of a number when no
    # body was sent (e.g. 304 responses), so accept that too; otherwise
    # those requests would be silently dropped from the analysis.
    r'(?P<size>\d+|-)'
)

def parse_custom_log(line):
    """Parse a single access-log line into a dict of its fields.

    Args:
        line: One log entry as a string.

    Returns:
        A dict holding the named groups captured by ``main_pattern`` (all
        as strings) plus a ``'timestamp'`` key containing a timezone-aware
        ``datetime`` built from the ``date``, ``time`` and ``timezone``
        groups. Returns ``None`` when the line does not match the pattern
        or when its timestamp is not a real date/time.
    """
    m = main_pattern.match(line)
    if m is None:
        return None

    info = m.groupdict()
    try:
        # Combine date, time and UTC offset into one aware datetime.
        info['timestamp'] = datetime.strptime(
            f"{info['date']} {info['time']} {info['timezone']}",
            "%d/%b/%Y %H:%M:%S %z",
        )
    except ValueError:
        # The regex only checks the *shape* of the timestamp, so values such
        # as "99/Foo/2023" or "25:61:61" get this far. Treat them like any
        # other unparseable line instead of aborting the whole run.
        return None
    return info

# Load the access log and run a small analysis on it: per-IP sessions,
# most-requested pages, and the sequence of URLs visited in each session.

# Collect the parsed form of every non-blank line that parse_custom_log accepts.
results = []
# Decode explicitly as UTF-8 and replace undecodable bytes: request lines can
# contain arbitrary client-supplied bytes, and a single bad byte should not
# abort the whole load with UnicodeDecodeError.
with open('access.log', 'r', encoding='utf-8', errors='replace') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        parsed = parse_custom_log(line)
        if parsed:
            results.append(parsed)

if results:
    df = pd.DataFrame(results)
    # Captured fields arrive as strings; anything non-numeric becomes NaN.
    df['status'] = pd.to_numeric(df['status'], errors='coerce')
    df['size'] = pd.to_numeric(df['size'], errors='coerce')
    # Normalise every timestamp to UTC. Without utc=True, entries logged with
    # different UTC offsets do not end up in one uniform datetime column,
    # which breaks the sorting and diff() steps used for sessions below.
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)

    # Quick look at the parsed data.
    print(df.head())

    # --- Session analysis ---
    # Order each IP's requests chronologically so consecutive rows can be compared.
    df = df.sort_values(by=['ip', 'timestamp'])

    # A gap longer than this between two requests from one IP starts a new session.
    session_timeout = timedelta(minutes=30)

    # Time elapsed since the previous request from the same IP (NaT for the first).
    df['time_diff'] = df.groupby('ip')['timestamp'].diff()

    # A session starts on an IP's first request or after a gap above the timeout.
    df['new_session'] = (df['time_diff'] > session_timeout) | (df['time_diff'].isna())

    # Running count of session starts per IP gives a 1-based session number.
    df['session_id'] = df.groupby('ip')['new_session'].cumsum()

    # --- Top pages ---
    # head(10) so the result matches the "Top 10" heading (head() alone gives 5).
    top_pages = df['url'].value_counts().head(10)
    print("\nTop 10 Pages Visited:")
    print(top_pages)

    # --- Navigation paths ---
    # Ordered list of URLs requested within each (ip, session_id) pair.
    # This is a simplified view and may need refinement for real requirements.
    navigation_paths = df.groupby(['ip', 'session_id'])['url'].apply(list).reset_index(name='navigation_path')
    print("\nExample Navigation Paths:")
    print(navigation_paths.head())

else:
    print("No valid entries parsed.")

            ip         date      time timezone method             url  \
0  192.168.1.1  10/Oct/2023  10:00:01    +0000    GET     /index.html   
1  192.168.1.2  10/Oct/2023  10:00:05    +0000    GET     /about.html   
2  192.168.1.1  10/Oct/2023  10:01:10    +0000    GET   /contact.html   
3  192.168.1.3  10/Oct/2023  10:02:00    +0000    GET     /index.html   
4  192.168.1.2  10/Oct/2023  10:35:00    +0000    GET  /products.html   

   protocol  status  size                 timestamp  
0  HTTP/1.1     200  1234 2023-10-10 10:00:01+00:00  
1  HTTP/1.1     200  5678 2023-10-10 10:00:05+00:00  
2  HTTP/1.1     200  9101 2023-10-10 10:01:10+00:00  
3  HTTP/1.1     200  1234 2023-10-10 10:02:00+00:00  
4  HTTP/1.1     200  4321 2023-10-10 10:35:00+00:00  

Top 10 Pages Visited:
url
/index.html       2
/contact.html     1
/services.html    1
/about.html       1
/products.html    1
Name: count, dtype: int64

Example Navigation Paths:
            ip  session_id               navigation_path
