In [1]:
def htable(nbuckets):
    """
    Create an empty hashtable.

    The table is a plain list holding `nbuckets` buckets, and every bucket
    is its own, initially empty, list.
    """
    buckets = []
    for _ in range(nbuckets):
        # A fresh list per bucket, so buckets never share storage.
        buckets.append([])
    return buckets

In [2]:
# Build a table with 10 buckets; every bucket starts out as an empty list.
htable(10)

[[], [], [], [], [], [], [], [], [], []]

In [3]:
def hashcode(o):
    """
    Return an integer hash code for an int or a str.

    An int hashes to itself. A str is folded left to right with
    h = h * 31 + ord(c), starting from 0, so the empty string hashes to 0.
    Any other type yields None; the check is on the exact type, so bool
    and subclasses of int or str also yield None.
    """
    kind = type(o)
    if kind == int:
        return o
    if kind == str:
        acc = 0
        # Base-31 polynomial hash over the code points (not a plain sum).
        for code_point in map(ord, o):
            acc = acc * 31 + code_point
        return acc
    # Unsupported key type: no hash code.
    return None

In [4]:
# An int key hashes to itself.
hashcode(42)

42

In [5]:
# A str key is hashed by folding its characters with h = h*31 + ord(c).
hashcode("Michael Ruddy")

63401902953007148893

In [6]:
def htable_put(table, key, value):
    """
    Store a key-value pair in the hashtable, much like `d[key] = value`
    on a dictionary. The value can be of any type.

    If the key is already present, its association is replaced in place,
    so the key keeps its original position in the bucket (the same rule
    a dict follows when a key is updated). Otherwise the new association
    is appended to the end of the bucket.

    The bucket is chosen with hashcode(key) % len(table); a key for which
    hashcode() returns None therefore raises TypeError here.
    """

    # Find the bucket this key belongs to.
    bucket = table[hashcode(key) % len(table)]

    for position, association in enumerate(bucket):
        if association[0] == key:
            # Overwrite in place; we return immediately, so mutating the
            # list while iterating over it is safe.
            bucket[position] = (key, value)
            return
    # Key not seen before: add it at the end to keep insertion order.
    bucket.append((key, value))

In [7]:
# Create a 5-bucket table and store three associations; the values can be
# of any type (two strings and a tuple here).
table = htable(5)
htable_put(table, "a", "123")
htable_put(table, "b", "4")
htable_put(table, "g", ("tuple", "tuple2"))

In [8]:
def htable_get(table, key):
    """
    Return the value associated with `key`, or None when the key is not
    present in the table.

    Only the bucket selected by hashcode(key) % len(table) is searched,
    and the first association whose key equals `key` wins.
    """
    slot = hashcode(key) % len(table)
    hits = (pair[1] for pair in table[slot] if pair[0] == key)
    # next() yields the first hit, or the None default if there is none.
    return next(hits, None)

In [9]:
# Look up the value stored under "a".
htable_get(table, "a")

'123'

In [10]:
# Look up the value stored under "b".
htable_get(table, "b")

'4'

In [11]:
# Look up the value stored under "g"; stored values keep their type (a tuple).
htable_get(table, "g")

('tuple', 'tuple2')

In [12]:
# Putting a value under a key that already exists replaces the old value.
# "a" is overwritten twice, so only the last value ("xyz") remains;
# "g" switches from a tuple to a list.
htable_put(table, "a", "apple")
htable_put(table, "a", "xyz")
htable_put(table, "g", ["list", "of", "words"])

In [13]:
# "a" now maps to the most recently stored value.
htable_get(table, "a")

'xyz'

In [14]:
# "b" was not overwritten, so its original value is still returned.
htable_get(table, "b")

'4'

In [15]:
# "g" now maps to the list that replaced the original tuple.
htable_get(table, "g")

['list', 'of', 'words']

In [16]:
def get_text(fileName):
    """
    Read the file at `fileName` and return its entire contents as a string.

    The file is opened in text mode and is always closed again, even if
    reading fails part-way through.
    """
    # NOTE(review): 'latin - 1' is a non-standard spelling; Python's codec
    # lookup appears to normalize it to latin-1 -- confirm before relying
    # on it, or spell it 'latin-1'.
    with open(fileName, encoding='latin - 1') as f:
        return f.read()

In [17]:
def words(text):
    """
    Split `text` into a list of lowercase words.

    Punctuation, the digits 0-9, carriage returns, tabs and newlines are
    treated as separators. Words of one or two characters (a, an, to,
    at, be, ...) are dropped. Order and duplicates are preserved.
    """
    # re.escape keeps characters such as ']' and '^' literal inside the
    # class. The digit range must be written '0-9' with no spaces: with
    # '0 - 9' the class only matches '0', ' ' and '9'.
    regex = re.compile('[' + re.escape(string.punctuation) +
                       '0-9\\r\\t\\n]')
    # Replace with a space rather than deleting, so that neighbouring
    # words are not clumped together.
    nopunct = regex.sub(" ", text)
    # Runs of spaces produce empty strings here; the length filter below
    # discards them along with the very short words.
    tokens = nopunct.split(" ")
    return [w.lower() for w in tokens if len(w) > 2]

In [18]:
def myhtable_create_index(files, nbuckets=4011):
    """
    Build an inverted index over the given files.

    `files` is a sequence of file names. Each file is read with
    get_text() and tokenized with words(). The returned hashtable maps
    every word to the set of positions in `files` of the files that
    contain that word.

    `nbuckets` is the number of buckets of the underlying hashtable
    (4011 by default).
    """

    index = htable(nbuckets)

    # enumerate() supplies the file's position directly, instead of
    # searching the list again with files.index() for every single word.
    for file_id, filename in enumerate(files):
        # A set, because each word needs to be recorded once per file.
        for word in set(words(get_text(filename))):
            postings = htable_get(index, word)
            if postings:
                # The stored set is mutated in place, so the table already
                # sees the change and no second put is needed.
                postings.add(file_id)
            else:
                # First time this word is seen: start a new set for it.
                htable_put(index, word, {file_id})
    return index

In [19]:
def myhtable_index_search(files, index, terms):
    """
    Return the names of the files that contain every one of `terms`.

    `index` is a hashtable mapping a word to the set of positions in
    `files` of the files containing it. Each term is looked up and the
    resulting sets are intersected. An empty list is returned when
    `terms` is empty, when any term is missing from the index, or when
    no file contains all of the terms.
    """

    # None means "no term processed yet". It must stay distinct from an
    # empty intersection, otherwise an empty result would be overwritten
    # by the next term's files.
    matches = None

    for term in terms:
        term_matches = htable_get(index, term)
        if not term_matches:
            # A term that occurs in no file makes the intersection empty.
            return []
        if matches is None:
            matches = term_matches
        else:
            matches = set(matches).intersection(set(term_matches))
        if not matches:
            # Nothing in common so far; later terms cannot add files back.
            return []

    # Only reached with matches still None when `terms` was empty.
    if not matches:
        return []

    # Translate the matching positions back into file names.
    return [files[match] for match in matches]

In [23]:
# `file` is a list of the paths of all files to index; it is not defined in
# the cells shown here, so presumably it comes from an earlier cell.
# NOTE(review): this assignment rebinds the global name `htable`, which was
# the hashtable constructor defined above. After this cell runs, any later
# call to htable(...), including the one inside myhtable_create_index, would
# fail because `htable` is now a list. Consider a different variable name.
htable = myhtable_create_index(file) 

In [32]:
# Search the index (stored in the variable `htable`) for both terms, then
# print the paths of the matching files.
searchresult = myhtable_index_search(file, htable, ['anger','considerable'])
print(searchresult)

['slate/1/Article247_4.txt', 'slate/18/Article247_4198.txt', 'slate/31/ArticleIP_1924.txt', 'slate/30/ArticleIP_1846.txt']


In [None]:
def htable_buckets_str(table):
    """
    Return a string representing the various buckets of this table.
    The output looks like:
        0000->
        0001->
        0002->
        0003->parrt:99
        0004->
    where parrt:99 indicates an association of (parrt,99) in bucket 3.

    Bucket numbers are zero-padded to four digits. Several associations
    in one bucket are separated by ', '. Every line, including the last,
    ends with a newline.
    """
    bucket_str_list = []
    for i, bucket in enumerate(table):
        bucket_key_values = []
        for association in bucket:
            # Skip only malformed (empty) entries. Falsy keys or values
            # such as 0 or '' are legitimate data and must be shown.
            if not association:
                continue
            bucket_key_values.append(str(association[0]) + ':' + str(association[1]))
        # zfill pads to a fixed width, so bucket 10 prints as 0010.
        bucket_str_list.append(str(i).zfill(4) + '->' + ', '.join(bucket_key_values))
    return '\n'.join(bucket_str_list) + '\n'

In [None]:
def htable_str(table):
    """
    Render the whole table in a dict-like form such as {parrt:99}.

    Associations are listed bucket by bucket, and within a bucket in the
    order they are stored. Each one is written as str(key):str(value),
    joined with ', ' and wrapped in curly braces. An empty table gives {}.
    """
    rendered = [
        str(pair[0]) + ':' + str(pair[1])
        for bucket in table
        if bucket  # empty buckets contribute nothing
        for pair in bucket
    ]
    return '{' + ', '.join(rendered) + '}'