In [31]:
import nltk
from nltk import ngrams
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import string

In [6]:
# param type: string
# return type : list
def text_segmentation(text):
    sent_tokenized = sent_tokenize(text)
    return sent_tokenized

In [7]:
# param type: list
# reuturn type string
def text_tokenization(text):
    word_tokenized = []
    for sentence in text:
        punc_removal = remove_punctuation(word_tokenize(sentence))
        word_tokenized.append(punc_removal)
    return word_tokenized

In [8]:
# param type: string
# return type: string
def text_lowercase(text):
    return text.lower()

In [47]:
# functionality:flatten the list, converting it to one-dimension list
# param type:list[list]
# return type:list
def serialize(sentences):
    res = []
    if len(sentences) == 0:
        return []
    if len(sentences) == 1: # only one sentence
        return sentences
    for sent in sentences:
        for token in sent:
            res.append(token)
    return res

In [34]:
# functionality: generate n-grams list
# param type: list / int
# return list
def generate_ngrams(sentences,n):
    temp = serialize(sentences)
    print(temp)
    trigrams =  nltk.ngrams(temp, n)
    trigram_arr = []
    for gram in trigrams:
        trigram_arr.append(gram)
    return trigram_arr

In [11]:
# functionality: calculate the count of words
# param type: list
# return: FreqDist
from nltk import FreqDist
def word_counts(word_tokens):
    fdist = FreqDist(word_tokens)
    keys = fdist.most_common()
    return fdist

In [12]:
# functionality: remove stop-word
def stop_words_removal(token_list):
    stopwords = nltk.corpus.stopwords.words('english')
    stop_removal_words = [w for w in token_list if w not in stopwords]
    return stop_removal_words

In [13]:
# param: one sentence
# type: list
# return type: list
def remove_punctuation(sentence):
    result = []
    for token in sentence:
        if token not in string.punctuation:
            result.append(token)
    return result

In [14]:
def jaccard_similarity(x,y,technique="Ferret"):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = 1
    if technique == "Ferret":
        union_cardinality = len(set.union(*[set(x), set(y)])) #Ferret Comparison Technique(denominator is no of trigrams in two docs i.e. their union)
    else:
        union_cardinality = len(set(y)) #Containment Measure technique(denominator is no of trigrams in suspicious docs)
 
    return intersection_cardinality/float(union_cardinality)

In [15]:
def LCS(original_tokens,suspecious_tokens):
    length_o = len(original_tokens)
    length_s = len(suspecious_tokens)
    LCS = [[0 for _ in range(length_s + 1)] for _ in range(length_o + 1)]
    for i in range(1,length_o+1):
        for j in range(1,length_s+1):
            if original_tokens[i-1] == suspecious_tokens[j-1]:
                LCS[i][j] = LCS[i-1][j-1] + 1
            else:
                LCS[i][j] = max(LCS[i-1][j],LCS[i][j-1])
    return LCS[length_o][length_s]
    #if not length_o or not length_s:
        #return 0
    #elif original_tokens[length_o - 1] == suspecious_tokens[length_s - 1]:
        #return 1 + LCS(original_tokens,suspecious_tokens,length_o - 1,length_s - 1)
    #else:
        #return max(LCS(original_tokens,suspecious_tokens,length_o-1,length_s),LCS(original_tokens,suspecious_tokens,length_o,length_s-1))

X = ['aa','bc','dee']
Y = ['aa','dfa','dfa']
print(LCS(X,Y))

1


In [48]:
original_text = '''
                In ages which have no record these islands were the home of millions of happy birds. the resort of a hundred times more millions of fishes, of sea lions, and other creatures whose names are not so common; the marine residence, in fact, of innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer. and a store of quite another sort for an immaculate Republican government.
                '''
suspicious_text = '''
                 Long ago, when there was no written history, these islands were the home of millions of happy birds; the resort of a hundred times more millions of fishes, sea lions, and other creatures. Here lived innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer, and a store of quite another sort for an immaculate Republican government.
                 '''
origial_text_lower_case = text_lowercase(original_text)

original_sent_tokens = text_segmentation(origial_text_lower_case)
original_word_tokens = text_tokenization(original_sent_tokens) # remove punctuation, tokenize each sentence
original_trigrams =  generate_ngrams(original_word_tokens,3)

suspecious_text_lower_case = text_lowercase(suspicious_text)

suspecious_sent_tokens = text_segmentation(suspecious_text_lower_case)
suspecious_word_tokens = text_tokenization(suspecious_sent_tokens) # remove punctuation, tokenize each sentence
suspecious_trigrams =  generate_ngrams(suspecious_word_tokens,3)


ferret_trigram_similarity = jaccard_similarity(original_trigrams,suspecious_trigrams,"Ferret") #Document Similarity using Ferret Technique
containment_trigram_similarity = jaccard_similarity(original_trigrams,suspecious_trigrams,"Containment") #Document Similarity using Ferret Technique

print("Trigram similarity of documents using ferret technique is: ", ferret_trigram_similarity)
print("Trigram similarity of documents using containment technique is:", containment_trigram_similarity)



[['in', 'ages', 'which', 'have', 'no', 'record', 'these', 'islands', 'were', 'the', 'home', 'of', 'millions', 'of', 'happy', 'birds'], ['the', 'resort', 'of', 'a', 'hundred', 'times', 'more', 'millions', 'of', 'fishes', 'of', 'sea', 'lions', 'and', 'other', 'creatures', 'whose', 'names', 'are', 'not', 'so', 'common', 'the', 'marine', 'residence', 'in', 'fact', 'of', 'innumerable', 'creatures', 'predestined', 'from', 'the', 'creation', 'of', 'the', 'world', 'to', 'lay', 'up', 'a', 'store', 'of', 'wealth', 'for', 'the', 'british', 'farmer'], ['and', 'a', 'store', 'of', 'quite', 'another', 'sort', 'for', 'an', 'immaculate', 'republican', 'government']]
['in', 'ages', 'which', 'have', 'no', 'record', 'these', 'islands', 'were', 'the', 'home', 'of', 'millions', 'of', 'happy', 'birds', 'the', 'resort', 'of', 'a', 'hundred', 'times', 'more', 'millions', 'of', 'fishes', 'of', 'sea', 'lions', 'and', 'other', 'creatures', 'whose', 'names', 'are', 'not', 'so', 'common', 'the', 'marine', 'residenc

In [17]:
ori_word_counts = word_counts(serialize(original_word_tokens))
for word in ori_word_counts.most_common():
    print(word)

('in', 1)


In [49]:
ori_flatten_tokens = serialize(original_word_tokens)
print(ori_flatten_tokens)
sus_flatten_tokens = serialize(suspecious_word_tokens)
res = LCS(ori_flatten_tokens,sus_flatten_tokens)
print(res)

['in', 'ages', 'which', 'have', 'no', 'record', 'these', 'islands', 'were', 'the', 'home', 'of', 'millions', 'of', 'happy', 'birds', 'the', 'resort', 'of', 'a', 'hundred', 'times', 'more', 'millions', 'of', 'fishes', 'of', 'sea', 'lions', 'and', 'other', 'creatures', 'whose', 'names', 'are', 'not', 'so', 'common', 'the', 'marine', 'residence', 'in', 'fact', 'of', 'innumerable', 'creatures', 'predestined', 'from', 'the', 'creation', 'of', 'the', 'world', 'to', 'lay', 'up', 'a', 'store', 'of', 'wealth', 'for', 'the', 'british', 'farmer', 'and', 'a', 'store', 'of', 'quite', 'another', 'sort', 'for', 'an', 'immaculate', 'republican', 'government']
58


In [1]:
# Pre-Prcoessing Part of Speech Tag
def grab_files(directory):
    txtlist = []
    import os
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".txt"):
                 txtlist.append(os.path.join(root, file))
    return txtlist

directory = "C:\\Users\\turto\\Documents\\plagiarism-detection\\corpus\\" # change the directory based on your file struc.
filelist = grab_files(directory)

# Pre-Processing for POS Tagging
def tagger(file):
    import string
    import nltk
    try: #takes unicode Tags
        textfile = open(file, "r", encoding="utf8")
        data = textfile.read()
        textsplit = nltk.sent_tokenize(data)
        tokentext = [nltk.word_tokenize(sent) for sent in textsplit]
        standcorpus = [nltk.pos_tag(tokens) for tokens in tokentext]
        print(standcorpus[:1])
        textfile.close()
    except: # takes non-unicode tags 
        textfile = open(file, "r")
        data = textfile.read()
        textsplit = nltk.sent_tokenize(data)
        tokentext = [nltk.word_tokenize(sent) for sent in textsplit]
        standcorpus = [nltk.pos_tag(tokens) for tokens in tokentext]
        print(standcorpus[:1])
        textfile.close()
    return standcorpus

for a in filelist:
    print(tagger(a))

In [1]:
# Stop Words Corpus Tagged for Pre Processing
def stopwords():
    import nltk
    fstop = open('Smart.English(1).stop', 'r')
    stoptext = fstop.read()
    fstop.close()
    stopwordsf = nltk.word_tokenize(stoptext)
    stopwordsd = nltk.corpus.stopwords.words('english')
    return {'stopwordsf': stopwordsf, 'stopwordsd': stopwordsd} # d for 'default' and f for 'from file'


print(stopwords()['stopwordsf'][:20])
print(stopwords()['stopwordsd'][:20])

['â€™s', 'a', "a's", 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone']
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']
