# In [62]:
from nltk import ngrams
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Sample passages for the plagiarism check.
# Different plagiarism examples can be tried from: https://www.bowdoin.edu/dean-of-students/judicial-board/academic-honesty-and-plagiarism/examples.html#Direct
# Reference passage that the suspicious passage is compared against.
original_text = '''
                In ages which have no record these islands were the home of millions of happy birds, the resort of a hundred times more millions of fishes, of sea lions, and other creatures whose names are not so common; the marine residence, in fact, of innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer, and a store of quite another sort for an immaculate Republican government.
                '''
# Passage being checked for overlap with the reference passage.
suspicious_text = '''
                 Long ago, when there was no written history, these islands were the home of millions of happy birds; the resort of a hundred times more millions of fishes, sea lions, and other creatures. Here lived innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer, and a store of quite another sort for an immaculate Republican government.
                 '''
"""
This section contains the text preprocessing functions required later by the plagiarism detection technique.
"""
def text_segmentation(text):
    """Split ``text`` into sentences.

    Delegates to NLTK's ``sent_tokenize`` and returns its result unchanged.
    """
    return sent_tokenize(text)

def text_tokenization(text):
    """Word-tokenize every item of ``text`` and serialise the result.

    ``text`` is iterated item by item and each item is passed through
    ``word_tokenize``.

    Returns a single string: the ``str()`` form of each per-item token
    list, joined with commas.

    NOTE(review): the result is a string, not a list of tokens, so any
    n-grams built by iterating over it are character-level rather than
    word-level -- confirm this is intended.
    """
    token_lists = [word_tokenize(sentence) for sentence in text]
    return ','.join(map(str, token_lists))
    
def text_lowercase(text):
    """Return a lower-cased copy of ``text`` via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
    
#print(text_tokenization(text_segmentation(suspicious_text)))


'''
The following block implements all the functionality of the trigram similarity technique.
'''

'''
Utility Functions
'''

'''
This function generates n-grams from tokenized or non-tokenized text.
It accepts the text, the n-gram size n (n=3 for trigrams), and a boolean `tokenized` as parameters.
'''
def generate_ngrams(text, n, tokenized=True):
    """Return the n-grams of ``text`` as a list of tuples.

    Args:
        text: An iterable of tokens when ``tokenized`` is true; otherwise a
            string that is first split on whitespace.
        n: Size of each n-gram (3 for trigrams). Must be at least 1.
        tokenized: Whether ``text`` is already tokenized.

    Returns:
        A list of ``n``-tuples, one per sliding-window position, in input
        order. The list is empty when ``text`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    if not tokenized:
        text = text.split()  # Basic tokenization of text if not already tokenized
    items = list(text)  # materialise so one-shot iterators can be sliced too
    # Zipping n progressively shifted views yields an unpadded sliding window
    # of width n; zip stops at the shortest view, so no partial grams appear.
    return list(zip(*(items[offset:] for offset in range(n))))

'''
This function compares two sets to compute their Jaccard similarity score.
It accepts two sets and a technique ("Ferret" or "Containment") as parameters.
'''
def jaccard_similarity(x, y, technique="Ferret"):
    """Return the set-overlap similarity score of ``x`` and ``y``.

    Both inputs are converted to sets, so duplicates are ignored.

    Args:
        x: Iterable of hashable items (e.g. trigrams of the original text).
        y: Iterable of hashable items (e.g. trigrams of the suspicious text).
        technique: ``"Ferret"`` divides the number of shared items by the
            size of the union of both sets. Any other value selects the
            containment measure, which divides by the number of distinct
            items in ``y``.

    Returns:
        The score as a float. ``0.0`` is returned when the denominator set
        is empty, since there is nothing to compare against.
    """
    set_x, set_y = set(x), set(y)
    shared = len(set_x & set_y)

    if technique == "Ferret":
        # Ferret comparison: denominator is the distinct items across both docs.
        denominator = len(set_x | set_y)
    else:
        # Containment measure: denominator is the distinct items of y only.
        denominator = len(set_y)

    if denominator == 0:
        # Avoid ZeroDivisionError on empty input.
        return 0.0
    return shared / float(denominator)


# Preprocessing for the trigram similarity method: lowercasing, then sentence
# segmentation, then tokenization, as mentioned in the research paper.
# NOTE(review): text_tokenization returns a string, so the n-grams generated
# below are built over characters, not words -- confirm this is intended.
original_text = text_lowercase(original_text)
original_text = text_segmentation(original_text)
original_text = text_tokenization(original_text)

suspicious_text = text_lowercase(suspicious_text)
suspicious_text = text_segmentation(suspicious_text)
suspicious_text = text_tokenization(suspicious_text)

# Trigram generation for both documents
n = 3
trigrams_original_text = generate_ngrams(original_text, n, tokenized=True)
trigrams_suspicious_text = generate_ngrams(suspicious_text, n, tokenized=True)

# Trigram similarity calculation
# Document similarity using the Ferret technique
ferret_trigram_similarity = jaccard_similarity(
    trigrams_original_text,
    trigrams_suspicious_text,
    technique="Ferret",
)
# Document similarity using the Containment technique
containment_trigram_similarity = jaccard_similarity(
    trigrams_original_text,
    trigrams_suspicious_text,
    technique="Containment",
)

print(
    "Trigram similarity of documents using ferret technique is: ",
    ferret_trigram_similarity,
)
print(
    "Trigram similarity of documents using containment technique is:",
    containment_trigram_similarity,
)



# Output:
# Trigram similarity of documents using ferret technique is:  0.7425742574257426
# Trigram similarity of documents using containment technique is: 0.8754863813229572
