# simple approach

In [1]:
def top_k_frequent_words(dataset, stop_words_file, k):
    word_freq = {}

    with open(stop_words_file, 'r') as stop_words:
        stop_word_set = set(stop_words.read().splitlines())
    
    #stop_word_set.add('–')
    words = dataset.lower().split()

    for word in words:
        if word not in stop_word_set:
            if word in word_freq:
                word_freq[word] += 1
            else:
                word_freq[word] = 1

    return word_freq

chunk_size = 1024 * 1024 * 1024
stop_words_file = 'dataset/stopwords'
k = 10

In [9]:
%%time

word_freq = {}

with open('dataset/data_300MB.txt', 'r') as file:
    while True:
        chunk = file.readlines(chunk_size)
        if not chunk:
            break

        chunk_word_freq = top_k_frequent_words(' '.join(chunk), stop_words_file, k)

        for word, count in chunk_word_freq.items():
            if word in word_freq:
                word_freq[word] += count
            else:
                word_freq[word] = count

sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
top_k_words = sorted_words[:10]
print(top_k_words)

[('european', 316722), ('mr', 210160), ('would', 179735), ('also', 175907), ('-', 162852), ('must', 153791), ('commission', 138407), ('president,', 125700), ('member', 124360), ('like', 108992)]
CPU times: user 10.9 s, sys: 1.79 s, total: 12.7 s
Wall time: 13 s


In [None]:
%%time

word_freq = {}

with open('dataset/data_16GB.txt', 'r') as file:
    while True:
        chunk = file.readlines(chunk_size)
        if not chunk:
            break

        chunk_word_freq = top_k_frequent_words(' '.join(chunk), stop_words_file, k)

        for word, count in chunk_word_freq.items():
            if word in word_freq:
                word_freq[word] += count
            else:
                word_freq[word] = count

sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
top_k_words = sorted_words[:10]
print(top_k_words)

# Multi-processing

In [21]:
import concurrent.futures
import collections

def process_chunk(chunk, stop_words):
    word_freq = collections.defaultdict(int)
    for line in chunk:
        words = line.split()
        for word in words:
            if word.lower() not in stop_words:
                word_freq[word] += 1
    return word_freq

def get_k_most_frequent_words(file_path, stop_words_file, k, num_threads):
    chunk_size = 1024 * 1024 * 1024  # 1GB
    word_freq = collections.defaultdict(int)

    # Read stop words from file
    with open(stop_words_file, 'r') as stop_words_file:
        stop_words = set(stop_words_file.read().split())
        stop_words.add('-')
        stop_words.add('–')

    with open(file_path, 'r') as file:
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = []
            while True:
                chunk = file.readlines(chunk_size)
                if not chunk:
                    break
                future = executor.submit(process_chunk, chunk, stop_words)
                futures.append(future)

            for future in concurrent.futures.as_completed(futures):
                chunk_freq = future.result()
                for word, count in chunk_freq.items():
                    word_freq[word] += count

    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_words[:k]

In [36]:
import time
import psutil

def get_k_most(filepath, k, n):
    
    start_time = time.time()
    
    top_words = get_k_most_frequent_words(file_path, 'dataset/stopwords', k, n)
    for word, count in top_words:
        print(f'{word},{count}')
    
    print('\n')
    end_time = time.time()
    running_time = end_time - start_time
    print(f"Running Time: {round(running_time,2)} seconds")
    
    cpu_utilization = psutil.cpu_percent()
    print(f"CPU Utilization: {cpu_utilization}%")
    
    memory_usage = psutil.Process().memory_info().rss / 1024 / 1024  # in MB
    print(f"Memory Usage: {memory_usage} MB")

In [37]:
file_path = 'dataset/small_50MB_dataset.txt'
get_k_most(file_path, 10, 4)

would,21795
US,15810
economic,14464
countries,13136
new,12460
political,12229
also,12018
one,11708
global,10935
European,10824


Running Time: 1.81 seconds
CPU Utilization: 7.8%
Memory Usage: 539.68359375 MB


In [38]:
get_k_most(file_path, 10, 8)

would,21795
US,15810
economic,14464
countries,13136
new,12460
political,12229
also,12018
one,11708
global,10935
European,10824


Running Time: 1.76 seconds
CPU Utilization: 9.8%
Memory Usage: 541.20703125 MB


In [39]:
file_path = 'dataset/data_300MB.txt'
get_k_most(file_path, 10, 4)

European,316713
Mr,210158
would,178550
also,175427
must,153717
Commission,138001
President,,125489
Member,119742
like,107437
Parliament,85550


Running Time: 11.4 seconds
CPU Utilization: 13.2%
Memory Usage: 774.92578125 MB


In [40]:
file_path = 'dataset/data_300MB.txt'
get_k_most(file_path, 10, 8)

European,316713
Mr,210158
would,178550
also,175427
must,153717
Commission,138001
President,,125489
Member,119742
like,107437
Parliament,85550


Running Time: 11.12 seconds
CPU Utilization: 11.8%
Memory Usage: 905.2265625 MB


In [41]:
file_path = 'dataset/data_2.5GB.txt'
get_k_most(file_path, 10, 4)

said,1572125
would,903267
one,755766
said.,726275
also,704422
de,620092
last,573309
two,566546
first,557474
people,557166


Running Time: 133.01 seconds
CPU Utilization: 27.0%
Memory Usage: 1955.5 MB


In [42]:
file_path = 'dataset/data_2.5GB.txt'
get_k_most(file_path, 10, 8)

said,1572125
would,903267
one,755766
said.,726275
also,704422
de,620092
last,573309
two,566546
first,557474
people,557166


Running Time: 135.42 seconds
CPU Utilization: 30.8%
Memory Usage: 2477.74609375 MB


In [43]:
file_path = 'dataset/data_16GB.txt'
get_k_most(file_path, 10, 4)

said,10397763
would,5738120
said.,4692632
one,4664992
also,4446636
two,3535784
last,3520997
first,3458376
people,3447034
new,3377098


Running Time: 819.04 seconds
CPU Utilization: 22.7%
Memory Usage: 3869.41015625 MB


In [44]:
file_path = 'dataset/data_16GB.txt'
get_k_most(file_path, 10, 8)

said,10397763
would,5738120
said.,4692632
one,4664992
also,4446636
two,3535784
last,3520997
first,3458376
people,3447034
new,3377098


Running Time: 789.34 seconds
CPU Utilization: 17.9%
Memory Usage: 3546.11328125 MB
