In this exercise, we will use the ngrams toolset of the Natural Language ToolKit (NLTK) to extract N-gram features from "mkdir.exe". We also find the 50 most frequently repeated N-grams in the file.

In [1]:
#installs NLTK; comment out the following line if NLTK is already installed:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting regex>=2021.8.3
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.0/774.0 KB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
Collecting tqdm
  Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 KB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 KB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm, regex, joblib, nltk
Successfully inst

In [2]:
import collections
from nltk import ngrams
file = "mkdir.exe"

In [3]:
#Loads a file in binary mode and hands back its raw contents
def readFile(filePath):
    """Return the entire contents of the file at filePath as a bytes object."""
    with open(filePath, mode="rb") as handle:
        return handle.read()

#Builds the list of n-grams found in a byte sequence
def byteSequenceToNgrams(byteSequence, n):
    """Return every n-gram that NLTK's ngrams() yields for byteSequence, as a list."""
    return [gram for gram in ngrams(byteSequence, n)]
    
#Builds the n-gram frequency table of a binary file:
# reads its bytes, splits them into N-grams, and tallies each one
def extractNgramCounts(file, N):
    """Return a collections.Counter mapping each N-gram of the file's bytes to its count."""
    rawBytes = readFile(file)
    ngramCounts = collections.Counter()
    ngramCounts.update(byteSequenceToNgrams(rawBytes, N))
    return ngramCounts

In [4]:
# Count how often each 3-gram (N = 3) occurs in the sample file
extractedNgrams = extractNgramCounts(file, 3)

In [5]:
# Display the full Counter of 3-grams and their frequencies
extractedNgrams

Counter({(0, 0, 0): 3232,
         (255, 255, 255): 297,
         (144, 144, 144): 261,
         (0, 0, 139): 176,
         (0, 0, 137): 156,
         (199, 4, 36): 114,
         (1, 0, 0): 112,
         (255, 255, 139): 112,
         (137, 68, 36): 104,
         (137, 84, 36): 102,
         (255, 255, 137): 102,
         (66, 64, 0): 99,
         (137, 76, 36): 98,
         (64, 0, 0): 91,
         (64, 0, 137): 87,
         (36, 4, 137): 79,
         (137, 4, 36): 78,
         (36, 8, 137): 78,
         (0, 0, 133): 78,
         (0, 133, 192): 75,
         (145, 64, 0): 75,
         (254, 255, 255): 73,
         (0, 0, 66): 73,
         (0, 66, 64): 71,
         (4, 36, 232): 67,
         (0, 144, 144): 64,
         (160, 0, 0): 63,
         (32, 37, 115): 62,
         (64, 0, 144): 59,
         (144, 255, 37): 58,
         (144, 144, 255): 58,
         (2, 0, 0): 57,
         (85, 137, 229): 57,
         (0, 0, 199): 57,
         (137, 52, 36): 57,
         (0, 0, 160): 56,
        

In [6]:
# Print the 50 most frequent n-grams as (n-gram, count) pairs, most common first
print(extractedNgrams.most_common(50))

[((0, 0, 0), 3232), ((255, 255, 255), 297), ((144, 144, 144), 261), ((0, 0, 139), 176), ((0, 0, 137), 156), ((199, 4, 36), 114), ((1, 0, 0), 112), ((255, 255, 139), 112), ((137, 68, 36), 104), ((137, 84, 36), 102), ((255, 255, 137), 102), ((66, 64, 0), 99), ((137, 76, 36), 98), ((64, 0, 0), 91), ((64, 0, 137), 87), ((36, 4, 137), 79), ((137, 4, 36), 78), ((36, 8, 137), 78), ((0, 0, 133), 78), ((0, 133, 192), 75), ((145, 64, 0), 75), ((254, 255, 255), 73), ((0, 0, 66), 73), ((0, 66, 64), 71), ((4, 36, 232), 67), ((0, 144, 144), 64), ((160, 0, 0), 63), ((32, 37, 115), 62), ((64, 0, 144), 59), ((144, 255, 37), 58), ((144, 144, 255), 58), ((2, 0, 0), 57), ((85, 137, 229), 57), ((0, 0, 199), 57), ((137, 52, 36), 57), ((0, 0, 160), 56), ((163, 0, 0), 55), ((137, 92, 36), 55), ((253, 255, 255), 53), ((36, 4, 232), 52), ((0, 160, 0), 50), ((0, 0, 141), 50), ((137, 116, 36), 48), ((137, 124, 36), 47), ((52, 36, 232), 47), ((0, 0, 131), 46), ((84, 36, 4), 45), ((64, 0, 139), 44), ((0, 0, 232), 4

**Exercise:** Find the largest value of N for which there exists at least one N-gram with a frequency of 3 or more.

In [9]:
# Your code
# Function to find the largest N
def findLargestN(file, minCount=3):
    """Find the largest N for which some N-gram occurs at least minCount times.

    Parameters:
        file: path of the binary file to analyse (passed to extractNgramCounts).
        minCount: frequency an N-gram must reach; defaults to 3, the value
            asked for in the exercise.

    Returns:
        The largest qualifying N, or 0 when not even a 1-gram reaches minCount
        (for example, for an empty file).

    Why a binary search is valid: if some N-gram occurs k times, its first
    N-1 elements form an (N-1)-gram that occurs at those same k positions.
    So once the condition fails for some N it fails for every larger N, and
    the answer is the boundary between "holds" and "fails". Searching for that
    boundary by doubling and then bisecting needs only O(log N) passes over
    the file instead of one pass for every N from 1 up to the answer.
    """
    def hasFrequentNgram(n):
        # True when at least one n-gram reaches the required frequency
        counts = extractNgramCounts(file, n)
        return any(count >= minCount for count in counts.values())

    if not hasFrequentNgram(1):
        return 0  # No N satisfies the condition

    # Doubling phase: grow the window until the condition fails.
    # This terminates because an N longer than the file yields no N-grams.
    low = 1   # Largest N known to satisfy the condition
    high = 2  # Candidate upper bound
    while hasFrequentNgram(high):
        low = high
        high *= 2

    # Bisection phase: 'low' satisfies the condition and 'high' does not
    while high - low > 1:
        mid = (low + high) // 2
        if hasFrequentNgram(mid):
            low = mid
        else:
            high = mid
    return low


# Run the search on the sample executable and report the result
file = "mkdir.exe"  # Specify your file path here
largest_N = findLargestN(file)
print(f"The largest value of N for which there exists at least one N-gram with a frequency of 3 or more is: {largest_N}")


The largest value of N for which there exists at least one N-gram with a frequency of 3 or more is: 905
