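# In [2]:  (Jupyter notebook cell prompt left over from the export; commented out so the file parses as Python)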
import csv
from collections import defaultdict
# Uses only the Python standard library (no third-party dependencies).

class LogTagger:
    """Tag flow-log records using a (dstport, protocol) -> tag lookup table.

    The lookup table is a CSV file with ``dstport``, ``protocol`` and ``tag``
    columns. Log records are space-separated lines whose fields follow the
    order given in ``_LOG_FIELDS``. Errors are reported with ``print`` rather
    than raised, so a bad input file never crashes the caller.
    """

    # Field order of one space-separated log record (assumed to match the
    # sample flow-log format this script was written for).
    _LOG_FIELDS = (
        'Version', 'AccountID', 'InterfaceID', 'SrcAddr', 'DstAddr',
        'SrcPort', 'DstPort', 'Protocol', 'Packets', 'Bytes',
        'StartTime', 'EndTime', 'Action', 'LogStatus',
    )
    # Columns of the tagged output file: the log fields plus the applied tag.
    _OUTPUT_FIELDS = _LOG_FIELDS + ('Tag',)
    # A record must reach at least the Protocol column to be taggable.
    _MIN_FIELDS = _LOG_FIELDS.index('Protocol') + 1
    # IP protocol numbers (not port numbers) mapped to the names used in the
    # lookup table. Anything else is reported as 'unknown'.
    _PROTOCOL_NAMES = {'6': 'tcp', '17': 'udp', '1': 'icmp'}
    # Columns the lookup table must provide.
    _LOOKUP_COLUMNS = ('dstport', 'protocol', 'tag')

    def __init__(self, lookup_file):
        """Load the lookup table from ``lookup_file`` into ``self.lookup``."""
        self.lookup = self._read_lookup_table(lookup_file)

    def _read_lookup_table(self, filename):
        """Read the lookup CSV and return ``{(dstport, protocol): tag}``.

        Header names are matched case-insensitively and ignoring surrounding
        whitespace. Protocol names are lower-cased so matching is
        case-insensitive. Rows that lack any of the three columns are skipped
        instead of invalidating the whole table.

        Returns an empty dict (after printing an error) when the file is
        missing, unreadable, or does not have the required columns.
        """
        lookup = {}
        try:
            with open(filename, mode='r', newline='') as file:
                reader = csv.DictReader(file)
                headers = {
                    name.strip().lower()
                    for name in (reader.fieldnames or [])
                    if isinstance(name, str)
                }
                missing = [col for col in self._LOOKUP_COLUMNS if col not in headers]
                if missing:
                    print(f"Error reading {filename}: missing column(s) {', '.join(missing)}")
                    return {}

                for row in reader:
                    # DictReader uses None for missing values and a None key
                    # for surplus values; keep only real name/value pairs.
                    fields = {
                        name.strip().lower(): value.strip()
                        for name, value in row.items()
                        if isinstance(name, str) and isinstance(value, str)
                    }
                    port = fields.get('dstport')
                    protocol = fields.get('protocol')
                    tag = fields.get('tag')
                    if port is None or protocol is None or tag is None:
                        continue  # incomplete row: skip it, keep the rest
                    lookup[(port, protocol.lower())] = tag
        except FileNotFoundError:
            print(f"Error: File {filename} not found.")
            return {}
        except Exception as e:
            print(f"Error reading {filename}: {e}")
            return {}
        return lookup

    @staticmethod
    def _port_protocol_sort_key(item):
        """Sort key for ``((port, protocol), count)`` items.

        Numeric ports come first in ascending numeric order, ties broken by
        count. Non-numeric ports (for example the ``-`` placeholder) sort
        after every numeric port instead of raising ``ValueError``.
        """
        (port, _protocol), count = item
        try:
            return (0, int(port), count)
        except (TypeError, ValueError):
            return (1, 0, count)

    def _generate_output(self, output_filename, tag_counts, port_protocol_counts):
        """Write the two summary CSV files.

        ``<output_filename>_tag_counts.csv`` lists each tag with its count,
        sorted by ascending count and then alphabetically by tag.
        ``<output_filename>_port_protocol_counts.csv`` lists each
        (port, protocol) pair with its count, sorted numerically by port.
        Any failure is reported with ``print``.
        """
        try:
            with open(output_filename + '_tag_counts.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(['Tag', 'Count'])
                for tag, count in sorted(tag_counts.items(), key=lambda item: (item[1], item[0])):
                    writer.writerow([tag, count])

            with open(output_filename + '_port_protocol_counts.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(['Port', 'Protocol', 'Count'])
                ordered = sorted(port_protocol_counts.items(), key=self._port_protocol_sort_key)
                for (port, protocol), count in ordered:
                    writer.writerow([port, protocol, count])
        except Exception as e:
            print(f"Error generating output files: {e}")

    def apply_tags_to_logs(self, logs_filename, output_filename):
        """Tag every record in ``logs_filename`` and write the results.

        Writes ``<output_filename>.csv`` (each record plus its ``Tag``
        column) and then the two summary files via ``_generate_output``.
        Records whose (DstPort, protocol) pair is not in the lookup table are
        tagged ``Untagged``.

        Blank lines are ignored. Lines too short to contain the Protocol
        field are skipped with a warning instead of aborting the whole run.
        """
        tag_counts = defaultdict(int)
        port_protocol_counts = defaultdict(int)

        try:
            with open(logs_filename, 'r', newline='') as infile, open(output_filename + '.csv', 'w', newline='') as outfile:
                reader = csv.reader(infile, delimiter=' ')
                writer = csv.DictWriter(outfile, fieldnames=self._OUTPUT_FIELDS)
                writer.writeheader()

                for row in reader:
                    # Repeated, leading or trailing spaces produce empty
                    # fields that would shift every later column; drop them.
                    values = [value for value in row if value]
                    if not values:
                        continue  # blank line
                    if len(values) < self._MIN_FIELDS:
                        print(f"Warning: skipping malformed line {reader.line_num} in {logs_filename}")
                        continue

                    log_entry = dict(zip(self._LOG_FIELDS, values))
                    protocol = self._PROTOCOL_NAMES.get(log_entry['Protocol'], 'unknown')
                    key = (log_entry['DstPort'], protocol)
                    # Port/protocol pairs absent from the lookup are Untagged.
                    tag = self.lookup.get(key, 'Untagged')
                    log_entry['Tag'] = tag
                    writer.writerow(log_entry)

                    tag_counts[tag] += 1
                    port_protocol_counts[key] += 1

            self._generate_output(output_filename, tag_counts, port_protocol_counts)

        except FileNotFoundError:
            print(f"Error: File {logs_filename} not found.")
        except Exception as e:
            print(f"Error processing logs: {e}")

if __name__ == "__main__":
    # Script entry point. Both inputs are plain-text files, as the
    # requirements specify: the lookup table and the sample flow logs.
    # The results are written out as CSV files sharing the 'output' prefix.
    tagger = LogTagger('lookup.txt')
    tagger.apply_tags_to_logs('sampleFlowLogs.txt', 'output')