In [1]:
import numpy as np
import itertools

In [2]:
def readdata(k, fname="data.txt", report=False, min_length=1):
    C_k = []
    b = 0
    
    with open(fname, "rt", encoding='latin1') as f:
        for line in f:
            # report progress
            # print every 1000th element to reduce clutter
            if report:
                if b % 1000 == 0:  
                    print('processing bin ', b)
                b += 1
            line = line.replace('\n', '')  # remove newline symbol
            C_k = list(filter(lambda x: len(x) > min_length, line.split(' ')))
            #to save time for sentences shorter than k words
            if len(C_k) >= k:
                for itemset in itertools.combinations(C_k, k):
                    #to eliminate duplicate words which would result in smaller tuples
                    _set = frozenset(itemset)
                    if (len(_set)) == k:
                        yield _set

In [3]:
def get_itemsets(k, N, hash_size, fname, previous_hash_set=None, previous_hash_size=None, return_item_set=False, min_length=1):
    if previous_hash_size is None:
        previous_hash_size = hash_size
        
    C = {}
    L = {}
    H = np.zeros((hash_size,), dtype=np.int)

    for key in readdata(k=k, fname=fname, report=False, min_length=min_length):
        #print("\nKey", key)
        frequent_items_count = 0
        if k > 1:
            for itemset in itertools.combinations(key, k-1):
                _set = frozenset(list(itemset))
                #print(_set, itemset)
                if (hash(_set) % previous_hash_size) in previous_hash_set:
                    #print("Occurence,", _key)
                    frequent_items_count += 1
        if frequent_items_count == k or k == 1:
            if return_item_set:
                if key not in C:
                    C[key] = 1
                else:
                    C[key] += 1
            else:
                hash_cell = hash(key) % hash_size
                H[hash_cell] += 1
    #filtering
    if return_item_set:
        for key, count in C.items():
            if count >= N:
                L[key] = count
        del C
        return L
    else:
        H_good = set(np.where(H >= N)[0])
        del H
        return H_good

# Most frequent 3-itemsets for *NSF_abstract_sentences*

## 1-itemsets

In [4]:
%%time
N = 3000  # frequency threshold
min_length = 1 #to skip words whose length is 1 or less
fname = 'NSF_abstract_sentences.txt'
TopN = 10
hash_size = 100000
k=1
H = get_itemsets(k=k, N=N, hash_size=hash_size, fname=fname, previous_hash_set=None, previous_hash_size=None, return_item_set=False, min_length=min_length)
print("{} items with more than {} occurrences".format(len(H), N))

1166 items with more than 3000 occurrences


## 2-itemsets

In [5]:
%%time
hash_size2 = 1000000
k=2
H2 = get_itemsets(k=k, N=N, hash_size=hash_size2, fname=fname, previous_hash_set=H, previous_hash_size=hash_size, return_item_set=False, min_length=min_length)
print("{} items with more than {} occurrences".format(len(H2), N))
del H

9169 items with more than 3000 occurrences


## 3-itemsets

In [6]:
%%time
hash_size3 = 1000000
k=3
C = get_itemsets(k=k, N=N, hash_size=hash_size3, fname=fname, previous_hash_set=H2, previous_hash_size=hash_size2, return_item_set=True, min_length=min_length)
print("{} items with more than {} occurrences".format(len(C), N))

30614 items with more than 3000 occurrences
Wall time: 4h 11min 33s


### Top 10 most frequent itemsets

In [7]:
#TopN items
most_frequent_TopN = sorted(C.items(), key=lambda k: -k[1])[:TopN]
for k, v in most_frequent_TopN:
    print("{} occurences for: {}".format(v, ', '.join(list(k))))

5705231 occurences for: of, and, the
3318151 occurences for: the, of, to
3035874 occurences for: in, the, of
2658111 occurences for: to, the, and
2327955 occurences for: in, the, and
1947722 occurences for: to, of, and
1787502 occurences for: in, of, and
1407020 occurences for: in, the, to
1321118 occurences for: of, for, the
1320281 occurences for: will, of, the


# Most frequent 3-itemsets for *NSF_abstract_sentences_nostopwords*
## 1-itemsets

In [8]:
%%time
N = 2000  # frequency threshold
fname = 'NSF_abstract_sentences_nostopwords.txt'
hash_size = 100000
k=1
H = get_itemsets(k=k, N=N, hash_size=hash_size, fname=fname, previous_hash_set=None, previous_hash_size=None, return_item_set=False)
print("{} items with more than {} occurrences".format(len(H), N))

1214 items with more than 2000 occurrences
Wall time: 18.2 s


## 2-itemsets

In [9]:
%%time
hash_size2 = 1000000
k=2
H2 = get_itemsets(k=k, N=N, hash_size=hash_size2, fname=fname, previous_hash_set=H, previous_hash_size=hash_size, return_item_set=False)
print("{} items with more than {} occurrences".format(len(H2), N))
del H

284 items with more than 2000 occurrences
Wall time: 4min 34s


## 3-itemsets

In [10]:
%%time
hash_size3 = 1000000
k=3
C = get_itemsets(k=k, N=N, hash_size=hash_size3, fname=fname, previous_hash_set=H2, previous_hash_size=hash_size2, return_item_set=True)
print("{} items with more than {} occurrences".format(len(C), N))

29 items with more than 2000 occurrences
Wall time: 26min 30s


### Top 10 most frequent itemsets

In [11]:
''#TopN items
most_frequent_TopN = sorted(C.items(), key=lambda k: -k[1])[:TopN]
for k, v in most_frequent_TopN:
    print("{} occurences for: {}".format(v, ', '.join(list(k))))

4410 occurences for: school, students, high
4354 occurences for: science, students, mathematics
4096 occurences for: science, students, engineering
3425 occurences for: science, computer, engineering
3370 occurences for: science, education, mathematics
3238 occurences for: science, students, program
3226 occurences for: science, teachers, mathematics
3113 occurences for: division, chemistry, program
3004 occurences for: differential, partial, equations
2993 occurences for: science, school, high


# Report
## Including Stop Words vs. Excluding Stop Words
The top 10 most frequent 3-itemset of the text file which includes stop words consists of only stop words suchs as _and, the, of, in, will, in_. Whereas the list of the top 10 most frequent 3-itemset of the file without the stop words consists of other common words that are expected to be used in scientific publications in English language.
Also by comparing the number of occurrences, it is obvious that the stop words are very common. For instance the most common itemset in the complete file (of, and, the) occurs 5705231 times. On the other hand the most frequent itemset _{school, students, high}_ of the non-stop word file only occurs 4410 times.
The results are:
- **With stop words**:
 - 5705231 occurences for: of, and, the
 - 3318151 occurences for: the, of, to
 - 3035874 occurences for: in, the, of
 - 2658111 occurences for: to, the, and
 - 2327955 occurences for: in, the, and
 - 1947722 occurences for: to, of, and
 - 1787502 occurences for: in, of, and
 - 1407020 occurences for: in, the, to
 - 1321118 occurences for: of, for, the
 - 1320281 occurences for: will, of, the
- **Without stop words**:
 - 4410 occurences for: school, students, high
 - 4354 occurences for: science, students, mathematics
 - 4096 occurences for: science, students, engineering
 - 3425 occurences for: science, computer, engineering
 - 3370 occurences for: science, education, mathematics
 - 3238 occurences for: science, students, program
 - 3226 occurences for: science, teachers, mathematics
 - 3113 occurences for: division, chemistry, program
 - 3004 occurences for: differential, partial, equations
 - 2993 occurences for: science, school, high

## Effect of considering *min_length* of a word
In the first attempt I included all 1 letter words that resulted in detecting a high frequency of vriables such as _x, y, n_, etc. In this report I excluded 1 letter words by applying a filter. 
The results of setting *min_length=0*:
- **With stop words**:
 - 9838559 occurences for: y, d, z
 - 8134492 occurences for: y, d, c
 - 7543647 occurences for: d, c, z
 - 7102632 occurences for: y, d, x
 - 6629631 occurences for: d, x, z
 - 5705231 occurences for: and, of, the
 - 5545656 occurences for: d, c, x
 - 3710611 occurences for: y, d, o
 - 3588119 occurences for: y, n, d
 - 3355157 occurences for: n, z, d
- **Without stop words**:
 - 766546 occurences for: n, c, x
 - 385234 occurences for: k, c, x
 - 324834 occurences for: c, x, g
 - 301788 occurences for: c, x, p
 - 259471 occurences for: c, h, x
 - 194641 occurences for: x, c, j
 - 182833 occurences for: k, x, p
 - 172291 occurences for: c, f, x
 - 167445 occurences for: n, c, g
 - 155797 occurences for: c, x, l