<a href="https://colab.research.google.com/github/anasharma7/CYBERSEC/blob/main/dnsproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
from collections import Counter
import copy

# --- Function Definitions ---

def mapped_ip(ip: str, dns_log: list) -> list:
    """
    Finds domains in a DNS log that resolve to a specific IP address.

    An entry matches when ``ip`` equals one of the values in its 'answers'
    field. 'answers' may be a comma-separated string (whitespace around each
    value is ignored) or a list/tuple/set of strings. Entries whose 'answers'
    field is missing or of any other type (e.g. a JSON null), and entries
    without a 'query' field, are skipped instead of raising.

    Args:
        ip (str): The IP address to search for. Surrounding whitespace is
            ignored; an empty or non-string value matches nothing.
        dns_log (list): The list of DNS log entries (dicts).

    Returns:
        list: A list of (domain, count) tuples, ordered by the first
              appearance of each domain in the log.
              Returns an empty list [] if no domains are found.
    """
    # A blank IP would otherwise "match" empty tokens such as the one
    # produced by a trailing comma, which is never a meaningful result.
    if not isinstance(ip, str) or not ip.strip():
        return []
    target = ip.strip()

    hits = Counter()
    for entry in dns_log:
        answers = entry.get('answers')
        if isinstance(answers, str):
            candidates = answers.split(',')
        elif isinstance(answers, (list, tuple, set)):
            candidates = answers
        else:
            # Missing or null answers: nothing to compare against.
            continue

        # Without a domain name there is nothing to report for this entry.
        if 'query' not in entry:
            continue

        # Compare whole tokens (not substrings) so that '1.1.1.1' does not
        # match '11.1.1.1'.
        if any(isinstance(c, str) and c.strip() == target for c in candidates):
            hits[entry['query']] += 1

    # An empty Counter naturally yields the empty list.
    return list(hits.items())

def enrich_dns_log(dns_log: list, top_domains: set) -> list:
    """
    Returns a deep copy of the log with two extra fields on every entry.

    Each copied entry gains:
      * 'frequency'  - how many entries in the whole log share its 'query'.
      * 'is_top_1m'  - True when its 'query' is a member of ``top_domains``.

    Args:
        dns_log (list): The original list of DNS log entries; not modified.
        top_domains (set): Domains regarded as popular ('Top 1M').

    Returns:
        list: The enriched copy of the DNS log.
    """
    # Deep-copy first so neither the list nor its entries are shared with
    # the caller's data.
    log_copy = copy.deepcopy(dns_log)

    # First pass: tally how often each queried domain occurs.
    seen = Counter()
    for record in log_copy:
        seen[record['query']] += 1

    # Second pass: attach the tally and the popularity flag to each record.
    for record in log_copy:
        name = record['query']
        record.update(frequency=seen[name], is_top_1m=(name in top_domains))

    return log_copy

# --- Main Execution Block ---

# 1. Configuration
FILE_PATH = 'dnslog.json'
# Using the IP addresses from "problem 2" as the required test data.
IPS_TO_TEST = ['137.245.120.50', '99.84.181.52', '172.20.56.90', '104.244.42.1']
# A sample set to simulate a "Top 1M" list for the enrichment task.
POPULAR_DOMAINS_SAMPLE = {'www.google.com', 'www.amazon.com', 'www.twitter.com'}


def _load_dns_log(path: str) -> list:
    """
    Parses a file that holds one JSON object per line.

    Blank lines are skipped and a trailing comma on a line is tolerated, so
    lines that only contain whitespace and/or commas are ignored.

    Args:
        path (str): Location of the log file.

    Returns:
        list: The parsed entries, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If a line is not valid JSON (the message names the line
            number) or the file is not valid UTF-8.
    """
    entries = []
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            record = line.strip().rstrip(',')
            if not record:
                continue
            try:
                entries.append(json.loads(record))
            except json.JSONDecodeError as exc:
                # Re-raise with the line number so the bad record can be found.
                raise ValueError(f"line {line_number}: {exc.msg}") from exc
    return entries


# 2. Load Data
print(f"--- Loading and parsing {FILE_PATH} ---")
try:
    dns_log_data = _load_dns_log(FILE_PATH)
    print(f"Successfully loaded {len(dns_log_data)} log entries.")
except FileNotFoundError:
    print(f"ERROR: '{FILE_PATH}' not found. Please ensure it is uploaded.")
    dns_log_data = []
except ValueError as exc:
    # Malformed JSON or undecodable bytes: report it and skip the tasks
    # instead of aborting the cell with a traceback.
    print(f"ERROR: could not parse '{FILE_PATH}': {exc}")
    dns_log_data = []

# 3. Run Both Tasks
if dns_log_data:
    # Execute the first task as requested.
    print("\n--- Task 1: Find Domain Names Mapped to Test IPs ---")
    for ip_address in IPS_TO_TEST:
        result = mapped_ip(ip_address, dns_log_data)
        print(f"[*] Checking IP: {ip_address}")
        # The 'result' variable will correctly be an empty list [] if nothing is found.
        print(f"  Result: {result}")

    # Execute the second task as requested.
    print("\n--- Task 2: Enrich DNS Log ---")
    enriched_log_data = enrich_dns_log(dns_log_data, POPULAR_DOMAINS_SAMPLE)
    print("Log enrichment complete.")
    print("  Showing the first 3 entries of the enriched log as a sample:")
    print(json.dumps(enriched_log_data[:3], indent=2))

--- Loading and parsing dnslog.json ---
Successfully loaded 29368 log entries.

--- Task 1: Find Domain Names Mapped to Test IPs ---
[*] Checking IP: 137.245.120.50
  Result: []
[*] Checking IP: 99.84.181.52
  Result: []
[*] Checking IP: 172.20.56.90
  Result: []
[*] Checking IP: 104.244.42.1
  Result: [('www.twitter.com', 1583)]

--- Task 2: Enrich DNS Log ---
Log enrichment complete.
  Showing the first 3 entries of the enriched log as a sample:
[
  {
    "Unnamed: 0": 16583,
    "ts": "2024-04-07 12:00:00",
    "id.orig_h": "10.0.1.11",
    "id.orig_p": 55555,
    "id.resp_h": "10.0.0.10",
    "id.resp_p": 53,
    "proto": "udp",
    "query": "9.a.6.e.f.9.c.c.a.8.5.e.5.b.5.8.0.0.0.0.0.0.0.0.0.0.0.0.0.8.e.f.ip6.arpa",
    "qtype": "12",
    "qtype_name": "PTR",
    "rcode": "3",
    "answers": "-",
    "frequency": 7,
    "is_top_1m": false
  },
  {
    "Unnamed: 0": 16585,
    "ts": "2024-04-07 12:00:00",
    "id.orig_h": "10.0.1.11",
    "id.orig_p": 47742,
    "id.resp_h": "10.0.0