**VRV Security’s Python Intern Assignment : Aniket Tiwari**

Email: tiwarianiket12at@gmail.com

**Task 1**

In [75]:
import pandas as pd
log_file_path = "sample.log"

In [76]:
# Function to process log file using pandas
def analyze_log_with_pandas(log_file):
    
    # Read the log file into a DataFrame
    # Assume the IP address is the first space-separated field
    log_data = pd.read_csv(log_file, sep=' ', header=None, usecols=[0], names=['IP'])

    # Count occurrences of each IP address
    ip_counts = log_data['IP'].value_counts()

    # Convert to a DataFrame for better display and further processing
    requests_per_ip = ip_counts.reset_index()
    requests_per_ip.columns = ['IP Address', 'Request Count']

    # Return the DataFrame
    return requests_per_ip

# Analyze the log file and get requests_per_ip
requests_per_ip_df = analyze_log_with_pandas(log_file_path)

**Task 2**

In [77]:
def find_most_accessed_endpoint(log_file):
    # Assuming the endpoint is the seventh space-separated field (index 6 in zero-based indexing)
    log_data = pd.read_csv(log_file, sep=' ', header=None, usecols=[5], names=['Endpoint'])

    # Clean up the Endpoint column: remove surrounding quotes and extract the path part
    log_data['Endpoint'] = log_data['Endpoint'].str.extract(r'(/[^ ]*)')  # Extract just the path part

    # Count occurrences of each endpoint
    endpoint_counts = log_data['Endpoint'].value_counts()

    # Find the most accessed endpoint
    most_accessed_endpoint = endpoint_counts.idxmax()
    access_count = endpoint_counts.max()

    # Create a DataFrame for the result
    result = pd.DataFrame({'Endpoint': [most_accessed_endpoint], 'Access Count': [access_count]})

    # Return the DataFrame
    return result

# Call the function with the log file path
most_accessed_endpoint_df = find_most_accessed_endpoint(log_file_path)

**TASK 3**

In [78]:
# Function to detect suspicious activity
def detect_suspicious_activity(log_file, threshold=10):
    # Extract IP addresses (1st column) and status codes (9th column in zero-based indexing)
    log_data = pd.read_csv(log_file, sep=' ', header=None, usecols=[0, 8], names=['IP', 'Status'])

    # Filter for failed login attempts (HTTP status code 401)
    failed_attempts = log_data[log_data['Status'] == 401]

    # Count failed login attempts per IP address
    failed_counts = failed_attempts['IP'].value_counts()

    # Filter for IPs exceeding the threshold
    suspicious_ips = failed_counts[failed_counts > threshold]

    # Convert to DataFrame for the result
    result = suspicious_ips.reset_index()
    result.columns = ['IP Address', 'Failed Login Attempts']

    # Return the DataFrame
    return result

# Call the function with the log file path
suspicious_activity_df = detect_suspicious_activity(log_file_path, threshold=10)

# Display the result (optional)
if suspicious_activity_df.empty:
    print("\nNo suspicious activity detected.")
else:
    print("\nSuspicious Activity Detected:")
    print(suspicious_activity_df.to_string(index=False))


No suspicious activity detected.


**Task 4**

In [79]:
def save_and_display_results(requests_per_ip, most_accessed_endpoint, suspicious_activity, output_file='log_analysis_results.csv'):
    # Display the results
    print("\nRequests per IP Address:")
    print(requests_per_ip.to_string(index=False))

    print("\nMost Frequently Accessed Endpoint:")
    print(most_accessed_endpoint.to_string(index=False))

    print("\nSuspicious Activity Detected:")
    if suspicious_activity.empty:
        print("No suspicious activity detected.")
    else:
        print(suspicious_activity.to_string(index=False))

    # Write results to a single CSV file
    with open(output_file, 'w') as f:
        # Write Requests per IP section
        f.write("Requests per IP\n")
        requests_per_ip.to_csv(f, index=False)
        f.write("\n\n") 

        # Write Most Accessed Endpoint section
        f.write("Most Accessed Endpoint\n")
        most_accessed_endpoint.to_csv(f, index=False)
        f.write("\n\n") 

        # Write Suspicious Activity section
        f.write("Suspicious Activity\n")
        if suspicious_activity.empty:
            f.write("No suspicious activity detected\n")
        else:
            suspicious_activity.to_csv(f, index=False)
        f.write("\n")

    print(f"\nResults saved to {output_file}")
    
save_and_display_results(requests_per_ip_df, most_accessed_endpoint_df, suspicious_activity_df)


Requests per IP Address:
   IP Address  Request Count
  203.0.113.5              8
198.51.100.23              8
  192.168.1.1              7
     10.0.0.2              6
192.168.1.100              5

Most Frequently Accessed Endpoint:
Endpoint  Access Count
  /login            13

Suspicious Activity Detected:
No suspicious activity detected.

Results saved to log_analysis_results.csv


**For Instance**

In [59]:
# Example DataFrames
requests_per_ip_df = pd.DataFrame({'IP Address': ['192.168.1.1', '203.0.113.45', '198.51.100.23'],
                                   'Request Count': [45, 32, 28]})

most_accessed_endpoint_df = pd.DataFrame({'Endpoint': ['/home.html'], 'Access Count': [54]})

suspicious_activity_df = pd.DataFrame({'IP Address': ['192.168.1.100', '203.0.113.34'],
                                       'Failed Login Count': [56, 12]})

# Call the function
save_and_display_results(requests_per_ip_df, most_accessed_endpoint_df, suspicious_activity_df)


Requests per IP Address:
   IP Address  Request Count
  192.168.1.1             45
 203.0.113.45             32
198.51.100.23             28

Most Frequently Accessed Endpoint:
  Endpoint  Access Count
/home.html            54

Suspicious Activity Detected:
   IP Address  Failed Login Count
192.168.1.100                  56
 203.0.113.34                  12

Results saved to log_analysis_results.csv
