In [None]:
#read the text file
#tokenize the words 
#remove stop words
#count the words using a dictionary  


In [4]:
from collections import Counter
import os

In [None]:
def read_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    return content 

def remove_stop_words(content, stop_words):
    words = content.lower().split()
    return [word for word in words if word not in stop_words]

def find_top_k_words(words, k):
    #Counter object uses hash table
    word_counts = Counter(words)
    return word_counts.most_common(k)

def main():
    file_path = '/Users/eviljimmy/Desktop/USF/2023_Summer/hw/hw1/small_50MB_dataset.txt'
    stop_words = read_file("/Users/eviljimmy/Desktop/USF/2023_Summer/hw/hw1/stop_words/NLTK's list of english stopwords")
    k = 10 

    content = read_file(file_path)
    words = remove_stop_words(content, stop_words)
    top_words = find_top_k_words(words,k)

    print(f"Top {k} words (excluding stop words):")
    for word, count in top_words:
        print(f"{word: {count}}")

if __name__ == '__main__':
    main()

In [11]:
class TextAnalyze:
    def __init__(self, stop_words_file):
        # load the stop_words file and initialize count
        self.stop_words = self.load_stop_words(stop_words_file)
        self.word_count = {}
    
    def load_stop_words(self, stop_words_file):
        # open and load the stop words file
        with open(stop_words_file, 'r') as file:
            stop_words = file.read().splitlines()
        return stop_words
    
    def process_text_file(self, file_path):
        with open(file_path, 'r') as file:
            for line in file:
                self.process_line(line)
    
    def process_line(self, line):
        # strip and set the texts to lowercase
        words = line.strip().split()
        for word in words:
            if word.isalpha():      #check if the word is alphabetic 
                if word.lower() not in self.stop_words:
                    self.word_count[word] = self.word_count.get(word, 0) + 1

    def get_top_words(self, k):
        # sort and get the top k words
        sorted_words = sorted(self.word_count.items(), key = lambda x: x[1], reverse=True)
        return sorted_words[:k]

In [12]:
# test 
analyzer = TextAnalyze("stop_words/NLTK's list of english stopwords")
analyzer.process_text_file('small_50MB_dataset.txt')
top_words = analyzer.get_top_words(10)

#print top words
for word, count in top_words:
    print(f'{word},{count}')

would,21795
US,15810
economic,14464
countries,13136
new,12460
political,12229
also,12018
one,11708
global,10935
European,10824


In [16]:
#analyze performance 
import time
import psutil

def analyze_performance(filepath):
    # Start measuring time
    start_time = time.time()

    # get top k words 
    analyzer = TextAnalyze("stop_words/NLTK's list of english stopwords")
    analyzer.process_text_file(filepath)
    top_words = analyzer.get_top_words(10)

    #print top words
    for word, count in top_words:
        print(f'{word},{count}')

    # Stop measuring time
    end_time = time.time()
    running_time = end_time - start_time
    print(f"Running Time: {running_time} seconds")

    # Get CPU utilization
    cpu_utilization = psutil.cpu_percent()
    print(f"CPU Utilization: {cpu_utilization}%")

    # Get memory usage
    memory_usage = psutil.Process().memory_info().rss / 1024 / 1024  # in MB
    print(f"Memory Usage: {memory_usage} MB")

In [14]:
# Call the performance analysis function
filepath = 'small_50MB_dataset.txt'
analyze_performance(filepath)

would,21795
US,15810
economic,14464
countries,13136
new,12460
political,12229
also,12018
one,11708
global,10935
European,10824
Running Time: 7.834611892700195 seconds
CPU Utilization: 11.5%
Memory Usage: 130.60546875 MB


In [17]:
# 300MB
filepath = 'data_300MB.txt'
analyze_performance(filepath)

European,316713
Mr,210158
would,178550
also,175427
must,153717
Commission,138001
Member,119742
like,107437
Parliament,85550
one,84992
Running Time: 56.86176419258118 seconds
CPU Utilization: 14.4%
Memory Usage: 44.8125 MB


In [18]:
# 2.5GB
filepath = 'data_2.5GB.txt'
analyze_performance(filepath)

said,1572125
would,903267
one,755766
also,704422
de,620092
last,573309
two,566546
first,557474
people,557166
new,546573
Running Time: 486.91321206092834 seconds
CPU Utilization: 26.1%
Memory Usage: 196.24609375 MB


In [19]:
# 16GB
filepath = 'data_16GB.txt'
analyze_performance(filepath)

said,10397763
would,5738120
one,4664992
also,4446636
two,3535784
last,3520997
first,3458376
people,3447034
new,3377098
could,3286270
Running Time: 3063.7685120105743 seconds
CPU Utilization: 33.4%
Memory Usage: 283.8359375 MB


In [25]:
import threading

class WordCounterThread(threading.Thread):
    def __init__(self, partition):
        threading.Thread.__init__(self)
        self.partition = partition
        self.word_count = {}

    def run(self):
        # Process the partition and count the words
        with open(self.partition, 'r') as file:
            for line in file:
                words = line.strip().split()
                for word in words:
                    self.word_count[word] = self.word_count.get(word, 0) + 1

    def get_word_count(self):
        return self.word_count

def create_partitions(file_path, num_partitions):
    partitions = []
    with open(file_path, 'r') as file:
        lines = file.readlines()
        total_lines = len(lines)
        lines_per_partition = total_lines // num_partitions

        start = 0
        for i in range(num_partitions - 1):
            end = start + lines_per_partition
            partition = lines[start:end]
            partitions.append(partition)
            start = end

        # Last partition may have remaining lines
        last_partition = lines[start:]
        partitions.append(last_partition)

    return partitions

def count_words_parallel(file_path, num_threads):
    # Divide the text file into partitions
    # Assuming the partitions are stored in a list called 'partitions'
    partitions = create_partitions(file_path, num_threads)

    # Create and start the threads
    threads = []
    for i in range(num_threads):
        thread = WordCounterThread(partitions[i])
        threads.append(thread)
        thread.start()

    # Wait for all threads to finish
    for thread in threads:
        thread.join()

    # Merge the word counts from each thread
    merged_word_count = {}
    for thread in threads:
        word_count = thread.get_word_count()
        for word, count in word_count.items():
            merged_word_count[word] = merged_word_count.get(word, 0) + count

    return merged_word_count

# Usage example
file_path = 'small_50MB_dataset.txt'
num_threads = 4
word_count = count_words_parallel(file_path, num_threads)
print(word_count)


Exception in thread Exception in thread Thread-12:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Thread-13:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/var/folders/5z/d3h21k595t736_39rxwtb4rw0000gn/T/ipykernel_42340/1396404682.py", line 11, in run
Exception in thread Thread-14:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Exception in thread Thread-15:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/var/folders/5z/d3h21k595t736_39rxwtb4rw0000gn/T/ipykernel_42340/1396404682.py", line 11, in run
    self.run()
  File "/var/folders/5z/d3h21k595t736_39rxwtb4rw0000gn/T/ipykernel_42340/1396404682.py", 

{}


In [23]:
# testing the run time in multiple threads
file_path = 'small_50MB_dataset.txt'
num_threads = 8
word_count = count_words_parallel(file_path, num_threads)
print(word_count)

Exception in thread Exception in thread Thread-4:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Thread-5:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Exception in thread Thread-6:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Exception in thread Thread-7:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Exception in thread Thread-8:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Exception in thread Thread-9:
Traceback (most recent call last):
  File "/Users/eviljimmy/opt/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Exception in thread Ex

{}


In [None]:
# using concurent hashtable 