<a href="https://colab.research.google.com/github/Yash-005/LFN_Task/blob/main/LFN_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Method 1: Using NLP techniques**

**Anonymizing PII and creating a CSV file as output**

In [1]:
import csv
import re

# Regular expression to match log entries.
# Expected layout of one entry (each <piece> is captured as a named group):
#   <ip> - - [<datetime>] "<method> <path> <version>" <status> <size> "<referrer>" "<useragent>" <responsetime>
# Notes:
#   - the two fields after the IP must be the literal text " - - ";
#   - `size` and `responsetime` must be digits and `useragent` must be non-empty,
#     so entries with e.g. "-" in those positions do not match;
#   - `referrer` is allowed to be empty.
log_pattern = re.compile(
    r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<datetime>[^\]]+)\] "(?P<method>[A-Z]+) (?P<path>[^ ]+) (?P<version>HTTP/\d\.\d)" (?P<status>\d{3}) (?P<size>\d+) "(?P<referrer>[^"]*)" "(?P<useragent>[^"]+)" (?P<responsetime>\d+)'
)

# Placeholder that replaces every PII value
def anonymize_data():
    """Return the fixed token written in place of a PII value."""
    placeholder = "<ANONYMIZED>"
    return placeholder

# Parse one log entry and mask the fields that may hold PII
def parse_and_anonymize_line(line):
    """Split a log entry into named fields, masking the PII-bearing ones.

    Args:
        line: One log entry as a string.

    Returns:
        A dict keyed by the named groups of ``log_pattern`` in which ``ip``,
        ``path``, ``referrer`` and ``useragent`` hold the anonymization
        placeholder and the remaining fields hold the matched text.
        ``None`` when ``log_pattern`` does not match at the start of ``line``.
    """
    match = log_pattern.match(line)
    if match is None:
        return None

    # Start from every captured field, then overwrite the sensitive ones.
    # datetime, method, version, status, size and responsetime are kept as
    # they are usually not PII.
    fields = match.groupdict()
    sensitive_fields = (
        'ip',
        'path',       # paths can sometimes contain PII such as a username
        'referrer',   # referrer URLs can contain sensitive information
        'useragent',  # user agents can be considered PII
    )
    for field_name in sensitive_fields:
        fields[field_name] = anonymize_data()
    return fields

# Convert a raw log file into a CSV of anonymized entries
def process_log_file(input_file_path, output_file_path):
    """Read a log file line by line and write the anonymized entries as CSV.

    A header row is always written. Lines for which
    ``parse_and_anonymize_line`` returns nothing are left out of the output.

    Args:
        input_file_path: Path of the log file to read.
        output_file_path: Path of the CSV file to create or overwrite.
    """
    columns = ['ip', 'datetime', 'method', 'path', 'version', 'status', 'size', 'referrer', 'useragent', 'responsetime']

    with open(input_file_path, 'r') as infile, open(output_file_path, 'w', newline='') as outfile:
        csv_out = csv.DictWriter(outfile, fieldnames=columns)
        csv_out.writeheader()

        # Lazily parse each stripped line; unparseable lines yield None and are filtered out.
        parsed_rows = (parse_and_anonymize_line(raw_line.strip()) for raw_line in infile)
        csv_out.writerows(row for row in parsed_rows if row)

# Locations of the raw input log and of the anonymized CSV to produce
# (both are absolute paths under /content).
input_file_path = '/content/input_logfiles.log'
output_file_path = '/content/anonymized_logsnew.csv'

# Processing: parse the input log and write the anonymized rows to the CSV
process_log_file(input_file_path, output_file_path)


**Anonymizing PII and creating a log file as output**

In [7]:
import re

# Regular expression to match log entries.
# Expected layout of one entry (each <piece> is captured as a named group):
#   <ip> - - [<datetime>] "<method> <path> <version>" <status> <size> "<referrer>" "<useragent>" <responsetime>
# Notes:
#   - the two fields after the IP must be the literal text " - - ";
#   - `size` and `responsetime` must be digits and `useragent` must be non-empty,
#     so entries with e.g. "-" in those positions do not match;
#   - `referrer` is allowed to be empty.
log_pattern = re.compile(
    r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<datetime>[^\]]+)\] "(?P<method>[A-Z]+) (?P<path>[^ ]+) (?P<version>HTTP/\d\.\d)" (?P<status>\d{3}) (?P<size>\d+) "(?P<referrer>[^"]*)" "(?P<useragent>[^"]+)" (?P<responsetime>\d+)'
)

# Placeholder that replaces every PII value
def anonymize_data():
    """Return the fixed token written in place of a PII value."""
    placeholder = "<ANONYMIZED>"
    return placeholder

# Rebuild one log entry with the PII-bearing fields masked
def parse_and_anonymize_line(line):
    """Return ``line`` re-rendered in the same log layout with PII masked.

    The IP address, request path, referrer and user agent are replaced by the
    anonymization placeholder; datetime, method, version, status, size and
    response time are copied from the matched entry.

    Args:
        line: One log entry as a string.

    Returns:
        The anonymized entry as a string, or ``None`` when ``log_pattern``
        does not match at the start of ``line``.
    """
    match = log_pattern.match(line)
    if match is None:
        return None

    return f'{anonymize_data()} - - [{match.group("datetime")}] "{match.group("method")} {anonymize_data()} {match.group("version")}" {match.group("status")} {match.group("size")} "{anonymize_data()}" "{anonymize_data()}" {match.group("responsetime")}'

# Copy a log file to a new log file, one anonymized entry per line
def process_log_file(input_file_path, output_file_path):
    """Read a log file line by line and write the anonymized entries out.

    Lines for which ``parse_and_anonymize_line`` returns nothing are left out
    of the output file.

    Args:
        input_file_path: Path of the log file to read.
        output_file_path: Path of the log file to create or overwrite.
    """
    with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
        for raw_line in infile:
            masked_entry = parse_and_anonymize_line(raw_line.strip())
            if not masked_entry:
                # Entry did not parse; skip it.
                continue
            outfile.write(masked_entry + '\n')


# Locations of the raw input log and of the anonymized log to produce.
# These are relative paths, so they resolve against the current working directory.
input_file_path = 'input_logfiles.log'
output_file_path = 'anonymized_logs.log'


# Run the anonymization over the whole input file
process_log_file(input_file_path, output_file_path)


# **Method 2: Using Presidio – a pre-existing tool for anonymizing PII**

In [3]:
!pip install presidio-anonymizer


Collecting presidio-anonymizer
  Downloading presidio_anonymizer-2.2.354-py3-none-any.whl (31 kB)
Collecting pycryptodome>=3.10.1 (from presidio-anonymizer)
  Downloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycryptodome, presidio-anonymizer
Successfully installed presidio-anonymizer-2.2.354 pycryptodome-3.20.0


In [4]:
!pip install presidio-analyzer presidio-anonymizer

Collecting presidio-analyzer
  Downloading presidio_analyzer-2.2.354-py3-none-any.whl (92 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/92.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m81.9/92.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting tldextract (from presidio-analyzer)
  Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer)
  Downloading phonenumbers-8.13.34-py2.py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests-file>=1.4 (from tldextract->presidio-analyzer

In [6]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
import csv

# Initializing Presidio Analyzer and Anonymizer.
# Both engines are created once here, with no arguments, and are reused by
# anonymize_text() for every log line.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Anonymize a single line of text with Presidio
def anonymize_text(text):
    """Detect PII in ``text`` and return the text with the findings replaced.

    Findings of type ``IP_ADDRESS`` are replaced with ``<IP-ANONYMIZED>``; the
    ``DEFAULT`` operator replaces with ``<ANONYMIZED>``.

    Args:
        text: The text to analyze (analyzed as English).

    Returns:
        The anonymized text as a string.
    """
    findings = analyzer.analyze(text=text, language='en')

    # Replacement rules handed to the anonymizer, keyed by entity type.
    replacement_operators = {
        "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}),
        "IP_ADDRESS": OperatorConfig("replace", {"new_value": "<IP-ANONYMIZED>"}),
    }

    outcome = anonymizer.anonymize(
        text=text,
        analyzer_results=findings,
        operators=replacement_operators,
    )
    return outcome.text

# Anonymize every line of a log file and store the results in a one-column CSV
def process_log_file(input_file_name, output_file_name):
    """Read a log file, anonymize each line and write the results to a CSV.

    The whole input is read into memory and its file is closed before the
    output file is opened. The CSV has a single column headed
    ``Anonymized Log Entry`` and one row per input line.

    Args:
        input_file_name: Path of the log file to read.
        output_file_name: Path of the CSV file to create or overwrite.
    """
    with open(input_file_name, 'r') as infile:
        raw_lines = list(infile)

    with open(output_file_name, 'w', newline='') as outfile:
        csv_out = csv.writer(outfile)
        csv_out.writerow(['Anonymized Log Entry'])
        # One single-cell row per input line, anonymized as it is written.
        csv_out.writerows([anonymize_text(raw_line.strip())] for raw_line in raw_lines)


# Locations of the raw input log and of the Presidio-anonymized CSV to produce
# (both are absolute paths under /content).
input_file_path = '/content/input_logfiles.log'
output_file_path = '/content/anonymized_logs_presidio.csv'

# Run the Presidio-based anonymization over the whole input file
process_log_file(input_file_path, output_file_path)

