In [13]:
%%prun
import sys, os, re, itertools
from collections import Counter


def preProcess(file):
    contents = file.read().lower()
    pattern = re.compile(r'[a-zàæèäöåÄÖÅ]+')
    words = pattern.findall(contents)
    return words


def countLetters(contents):
    contents = ' '.join(contents)
    letters_dic = {}
    for letter in set(contents): 
        letters_dic[letter] = contents.count(letter)
    letters_dic = sorted(letters_dic.items(), key=lambda x: x[1], reverse=True)[1:]
    return letters_dic


def countWords(contents):
    return len(contents)


def countUniqueWords(contents):
    unique_words = set(contents)
    return len(unique_words)


def getCommonWords(contents):
    common_words = Counter(contents)
    common_words = sorted(common_words.items(), key=lambda x: x[1], reverse=True)
    return common_words


def getFollowWords(common_words, contents):
    common_words = [a_tuple[0] for a_tuple in common_words[0:5]]
    
    copy1, copy2 = itertools.tee(contents)
    next(copy2, None)
    word_pairs = Counter((word1, word2) for word1, word2 in zip(copy1, copy2))
    
    follow_words = {}
    for word in common_words:
        word_dic = word_pairs.get((word,"king"))
        # word_dic = {pair:count for pair, count in word_pairs.items() if pair[0] == word}
        # word_dic = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)[0:3]
        # follow_words[word] = [word_dic[0][0][1], word_dic[0][1], word_dic[1][0][1], word_dic[1][1], word_dic[2][0][1], word_dic[2][1]]
    print(word_dic)
    return (follow_words)    


def output(contents):
    # common_words = getCommonWords(contents)
    print("Text Statististcs for Shakespeare.txt\n")
    sep = "*" * 40 + "\n\n"
    print(sep)

    print(f'Total number of words :{countWords(contents):,}\n')
    print(sep)

    print(f'\nNumber of unique words :{countUniqueWords(contents):,}\n')
    print(sep)

    print("\nCommon Words:\n")
    print(sep)
    common_words = getCommonWords(contents)[0:5]
    for word in common_words:
        print(word[0]+ '\t' + str(word[1]) + '\n')

    print("\n\nLetter frequency:\n")
    print(sep)
    letter_counts = countLetters(contents)
    for letter_count in letter_counts:
        print(letter_count[0]+ '\t' + str(letter_count[1]) + '\n')

    print("\n\nMost common following words:\n")
    print(sep)
    follow_words = getFollowWords(common_words, contents)
    for key, value in follow_words.items():
        print(f"{key} \n\t {value[0:2]} \n\t {value[2:4]} \n\t {value[4:6]}\n")


def writeToFile(contents):
    common_words = getCommonWords(contents)
    # with open(sys.argv[2], "a", encoding="utf-8") as out:
    with open("output.txt", "w", encoding="utf-8") as out:
        out.write("Text Statististcs for Shakespeare.txt\n")
        sep = "*" * 40 + "\n\n"
        out.write(sep)

        out.write(f'Total number of words :{countWords(contents):,}\n')
        out.write(sep)

        out.write(f'\nNumber of unique words :{countUniqueWords(contents):,}\n')
        out.write(sep)

        out.write("\nCommon Words:\n")
        out.write(sep)
        common_words = getCommonWords(contents)[0:5]
        for word in common_words:
            out.write(word[0]+ '\t' + str(word[1]) + '\n')

        out.write("\n\nLetter frequency:\n")
        out.write(sep)
        letter_counts = countLetters(contents)
        for letter_count in letter_counts:
            out.write(letter_count[0]+ '\t' + str(letter_count[1]) + '\n')

        out.write("\n\nMost common following words:\n")
        out.write(sep)
        follow_words = getFollowWords(common_words, contents)
        for key, value in follow_words.items():
            out.write(f"{key} \n\t {value[0:2]} \n\t {value[2:4]} \n\t {value[4:6]}\n")


def main():
    # if os.path.isfile(sys.argv[1]):   
    if True:    
        # with open(sys.argv[1], "r", encoding="utf-8") as file:
        with open("shakespeare.txt", "r", encoding="utf-8") as file:
            contents = preProcess(file) 
            output(contents)
        if len(sys.argv) > 2:
            writeToFile(contents)

    elif len(sys.argv) < 2:
        print("No read file provided!")
        sys.exit(1)    

    elif not os.path.isfile(sys.argv[1]):
        print(f"The file {sys.argv[1]} does not exist!")
        sys.exit(1)



if __name__ == '__main__':
    main()


Text Statististcs for Shakespeare.txt

****************************************


Total number of words :989,735

****************************************



Number of unique words :25,830

****************************************



Common Words:

****************************************


the	30125

and	28444

i	23874

to	21383

of	18839



Letter frequency:

****************************************


e	480970

t	354107

o	332717

a	309408

i	268911

s	266424

n	260133

h	255742

r	251863

l	180723

d	158715

u	137354

m	117435

y	99484

w	96315

c	91697

f	86140

g	72858

b	64067

p	61506

v	40149

k	37823

x	5331

j	4905

q	3961

z	1842

è	32

æ	13

à	1



Most common following words:

****************************************


31
31


         2058553 function calls in 2.143 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        5    0.985    0.197    1.249    0.250 {built-in method _collections._count_elements}
        1    0.301    0.301    0.301    0.301 {method 'findall' of 're.Pattern' objects}
  1979470    0.264    0.000    0.264    0.000 <string>:41(<genexpr>)
       60    0.197    0.003    0.197    0.003 {method 'count' of 'str' objects}
        2    0.100    0.050    0.334    0.167 <string>:12(countLetters)
        2    0.085    0.043    0.085    0.043 <string>:25(countUniqueWords)
        1    0.040    0.040    0.040    0.040 {method 'lower' of 'str' objects}
        2    0.037    0.019    0.037    0.019 {method 'join' of 'str' objects}
        5    0.034    0.007    0.040    0.008 {built-in method builtins.sorted}
        1    0.023    0.023    0.030    0.030 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.021    0.021    2.142 

In [55]:
# %%time #.. Time the execution of a single statement
# %%timeit #.. Time repeated execution of a single statement for more accuracy
# %%lprun #.. Run code with the line-by-line profiler
# %%memit #.. Measure the memory use of a single statement
# %%mprun #.. Run code with the line-by-line memory profiler
# %%prun #.. Run code with the profiler


In [59]:
%magic

:
    
    1) If an input line begins with '!!', then %sx is automatically
    invoked.  That is, while::
    
      !ls
    
    causes ipython to simply issue system('ls'), typing::
    
      !!ls
    
    is a shorthand equivalent to::
    
      %sx ls
    
    2) %sx differs from %sc in that %sx automatically splits into a list,
    like '%sc -l'.  The reason for this is to make it as easy as possible
    to process line-oriented shell output via further python commands.
    %sc is meant to provide much finer control, but requires more
    typing.
    
    3) Just like %sc -l, this is a list with special attributes:
    ::
    
      .l (or .list) : value as list.
      .n (or .nlstr): value as newline-separated string.
      .s (or .spstr): value as whitespace-separated string.
    
    This is very useful when trying to use such lists as arguments to
    system commands.
%%HTML:
    Alias for `%%html`.
%%SVG:
    Alias for `%%svg`.
%%bash:
    %%bash script magic
    
    Run c