In [13]:
#This code computes the h-point of a text.
#To find it, rank the words of the text by decreasing frequency of use in the text;
#the h-point is the rank at which a word's rank equals its frequency.
#Words ranked above the h-point (small rank numbers, i.e. the highest frequencies) are likely function words
#(i.e. they serve a grammatical purpose), while words ranked below the h-point are likely topical
#(i.e. they carry content/describable meaning, e.g. nouns).
#Therefore, if a lexical/topical word appears among the ranks above the h-point,
#we can deduce that it was "insisted on" in the text (it was purposefully spoken about).

import nltk
import os
import sys
import string
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer

def _ranked_lemma_counts(text):
    """Return ``(lemma, frequency)`` pairs for *text*, most frequent first.

    Tokens are whitespace-separated; ASCII punctuation is stripped from each
    token, tokens that are not purely alphabetic afterwards are discarded,
    and the survivors are lower-cased and lemmatized with WordNet.
    """
    lemmatizer = WordNetLemmatizer()

    # NOTE(review): only ASCII punctuation is stripped here, so a token that
    # contains e.g. a curly apostrophe fails isalpha() and is dropped whole.
    strip_ascii_punct = str.maketrans('', '', string.punctuation)

    lemmas = []
    for raw in text.split():
        token = raw.translate(strip_ascii_punct)
        if token.isalpha():
            lemmas.append(lemmatizer.lemmatize(token.lower()))

    ranked = FreqDist(lemmas).most_common()

    # Defensive second pass: drop (and echo) any entry that is made up of
    # punctuation only.  Raw string so the backslash is a literal character
    # rather than an invalid escape sequence.
    leftover_punct = set(r'''!()-–[]{};:'"“”\,,<>’../?@#$%^&*_~''')
    kept = []
    for entry in ranked:
        token = entry[0]
        if all(ch in leftover_punct for ch in token):
            print(token)
        else:
            # Build a new list instead of removing from the one being
            # iterated, which would skip the entry after each removal.
            kept.append(entry)
    return kept


def _locate_hpoint(ranked):
    """Return the h-point of a frequency-ranked list, or ``None`` if absent.

    *ranked* is a list of ``(word, frequency)`` pairs sorted by decreasing
    frequency; ranks are 1-based positions in that list.
    """
    total = len(ranked)
    for rank, (_, freq) in enumerate(ranked, start=1):
        # Clean case: the rank equals the frequency.  Every entry is tested,
        # including the last one, which has no successor to pair with.
        if rank == freq:
            return rank

        if rank < total:
            next_freq = ranked[rank][1]
            # The rank/frequency curves cross between this rank and the
            # next one: interpolate linearly between the two points.
            if rank < freq and rank + 1 > next_freq:
                return (freq * (rank + 1) - next_freq * rank) / (
                    ((rank + 1) - rank) + (freq - next_freq)
                )
    return None


def hpoint(text):
    """Print and return the h-point of *text*.

    The words of *text* are cleaned, lemmatized and ranked by decreasing
    frequency.  The h-point is the rank at which rank equals frequency; when
    no rank matches exactly, it is interpolated between the two neighbouring
    ranks where rank overtakes frequency.

    Prints ``hpoint: ``, the h-point value, and the ranked
    ``(word, frequency)`` pairs up to the (rounded) h-point.

    Args:
        text: The raw text to analyse, as a single string.

    Returns:
        The h-point as an ``int`` (exact match) or ``float`` (interpolated),
        or ``None`` when the text has no h-point (e.g. it contains no usable
        words); nothing is printed in that case.
    """
    ranked = _ranked_lemma_counts(text)
    h = _locate_hpoint(ranked)
    if h is None:
        return None

    print('hpoint: ')
    print(h)
    # Words up to the h-point; an interpolated h-point is rounded to a rank.
    print(ranked[:int(round(h))])
    return h


def main():
    """Print the h-point of every file in the ``data`` corpus directory.

    Downloads the WordNet data, then reads every file under ``data`` (path
    relative to the current working directory) and, for each one, prints its
    file id followed by the output of ``hpoint``.

    No command-line arguments are read, so the script also runs when invoked
    without any.
    """
    # Fetch the WordNet data (a no-op when it is already up to date).
    nltk.download('wordnet')

    # The pattern '.*' selects every file in the corpus root.
    corpus = PlaintextCorpusReader('data', '.*')

    for fileid in corpus.fileids():
        print(fileid)
        hpoint(corpus.raw(fileid))
        
# Run the corpus analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()







[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\annah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1952.txt
hpoint: 
10.5
[('and', 29), ('the', 28), ('to', 25), ('of', 23), ('i', 19), ('in', 15), ('a', 14), ('all', 14), ('my', 12), ('you', 12)]
1953.txt
hpoint: 
11.333333333333334
[('and', 47), ('the', 42), ('of', 35), ('i', 24), ('to', 24), ('a', 20), ('that', 16), ('in', 15), ('is', 14), ('it', 13), ('all', 12)]
1954.txt
hpoint: 
9.333333333333334
[('the', 40), ('and', 31), ('of', 25), ('in', 21), ('to', 19), ('a', 18), ('is', 13), ('we', 11), ('that', 10)]
1955.txt
hpoint: 
11
[('the', 48), ('of', 48), ('and', 25), ('to', 24), ('a', 17), ('we', 15), ('in', 15), ('our', 13), ('all', 12), ('for', 12), ('have', 11)]
1956.txt
hpoint: 
11
[('of', 41), ('the', 40), ('and', 31), ('to', 29), ('a', 21), ('that', 19), ('is', 15), ('all', 13), ('in', 13), ('it', 13), ('i', 11)]
1957.txt
hpoint: 
12
[('the', 39), ('and', 39), ('to', 28), ('i', 27), ('of', 26), ('a', 17), ('that', 17), ('we', 15), ('my', 14), ('you', 14), ('in', 14), ('is', 12)]
1958.txt
hpoint: 
11
[('and', 44), ('to', 38), 

hpoint: 
8
[('the', 42), ('of', 26), ('a', 22), ('and', 19), ('to', 16), ('in', 13), ('for', 10), ('we', 8)]
2014.txt
hpoint: 
8
[('the', 46), ('of', 34), ('and', 29), ('a', 28), ('in', 22), ('to', 14), ('is', 10), ('war', 8)]
2015.txt
hpoint: 
9.5
[('the', 40), ('of', 34), ('to', 27), ('a', 26), ('and', 16), ('that', 13), ('christmas', 11), ('in', 10), ('for', 10), ('year', 9)]
2016.txt
hpoint: 
7
[('of', 27), ('the', 26), ('and', 23), ('a', 17), ('to', 14), ('but', 8), ('they', 7)]
2017.txt
hpoint: 
8.5
[('the', 34), ('of', 32), ('a', 29), ('and', 23), ('to', 14), ('home', 10), ('i', 9), ('in', 9)]
2018.txt
hpoint: 
7.666666666666667
[('the', 35), ('a', 29), ('of', 25), ('and', 17), ('in', 13), ('to', 12), ('it', 9), ('with', 7)]
