In [1]:
from nltk import ngrams
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import string

In [2]:
# param type: string
# return type : list
def text_segmentation(text):
    sent_tokenized = sent_tokenize(text)
    return sent_tokenized

In [4]:
# param type: string
# return type: string
def text_lowercase(text):
    return text.lower()

In [5]:
# functionality:flatten the list, converting it to one-dimension list
# param type:list[list]
# return type:list
def serialize(sentences):
    res = []
    if len(sentences) == 0:
        return []
    if len(sentences[0]) > 0: # mutiple sentences
        for sent in sentences:
            for token in sent:
                res.append(token)
    else:
        for sent in sentences:
            res.append(token)
    return res

In [6]:
# functionality: generate n-grams list
# param type: list / int
# return list
def generate_ngrams(sentences,n):
    temp = serialize(sentences)
    trigrams =  ngrams(temp, n)
    trigram_arr = []
    for gram in trigrams:
        trigram_arr.append(gram)
    return trigram_arr

In [7]:
# functionality: calculate the count of words
# param type: list
# return: FreqDist
from nltk import FreqDist
def word_counts(word_tokens):
    fdist = FreqDist(word_tokens)
    print(type(fdist))
    keys = fdist.most_common()
    return fdist

In [8]:
# functionality: remove stop-word
def stop_words_removal(token_list):
    stopwords = nltk.corpus.stopwords.words('english')
    stop_removal_words = [w for w in token_list if w not in stopwords]
    return stop_removal_words

In [9]:
# param: one sentence
# type: list
# return type: list
def remove_punctuation(sentence):
    result = []
    for token in sentence:
        if token not in string.punctuation:
            result.append(token)
    return result

In [10]:
def jaccard_similarity(x,y,technique="Ferret"):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = 1
    if technique == "Ferret":
        union_cardinality = len(set.union(*[set(x), set(y)])) #Ferret Comparison Technique(denominator is no of trigrams in two docs i.e. their union)
    else:
        union_cardinality = len(set(y)) #Containment Measure technique(denominator is no of trigrams in suspicious docs)
 
    return intersection_cardinality/float(union_cardinality)

In [11]:
original_text = '''
                In ages which have no record these islands were the home of millions of happy birds. the resort of a hundred times more millions of fishes, of sea lions, and other creatures whose names are not so common; the marine residence, in fact, of innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer. and a store of quite another sort for an immaculate Republican government.
                '''
suspicious_text = '''
                 Long ago, when there was no written history, these islands were the home of millions of happy birds; the resort of a hundred times more millions of fishes, sea lions, and other creatures. Here lived innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer, and a store of quite another sort for an immaculate Republican government.
                 '''
origial_text_lower_case = text_lowercase(original_text)

original_sent_tokens = text_segmentation(origial_text_lower_case)
original_word_tokens = text_tokenization(original_sent_tokens) # remove punctuation, tokenize each sentence
original_trigrams =  generate_ngrams(original_word_tokens,3)

suspecious_text_lower_case = text_lowercase(suspicious_text)

suspecious_sent_tokens = text_segmentation(suspecious_text_lower_case)
suspecious_word_tokens = text_tokenization(suspecious_sent_tokens) # remove punctuation, tokenize each sentence
suspecious_trigrams =  generate_ngrams(suspecious_word_tokens,3)


ferret_trigram_similarity = jaccard_similarity(original_trigrams,suspecious_trigrams,"Ferret") #Document Similarity using Ferret Technique
containment_trigram_similarity = jaccard_similarity(original_trigrams,suspecious_trigrams,"Containment") #Document Similarity using Ferret Technique

print("Trigram similarity of documents using ferret technique is: ", ferret_trigram_similarity)
print("Trigram similarity of documents using containment technique is:", containment_trigram_similarity)



Trigram similarity of documents using ferret technique is:  0.5747126436781609
Trigram similarity of documents using containment technique is: 0.78125


In [12]:
ori_word_counts = word_counts(original_word_tokens)

TypeError: unhashable type: 'list'

In [13]:
#Preprocessing for trigram similarity method
original_text_revised = text_tokenization(text_segmentation(text_lowercase(original_text))) #Apply lowercasing,Segmentation,Tokenization as mention in research paper
suspicious_text_revised = text_tokenization(text_segmentation(text_lowercase(suspicious_text))) #Apply lowercasing,Segmentation,Tokenization as mention in research paper
print(original_text_revised)
print(original_text)
#trigram generation for both documents
n = 3
trigrams_original_text =  generate_ngrams(original_text,n,True)
trigrams_suspicious_text =  generate_ngrams(suspicious_text,n,True)

#Trigram Similarity Calculation
ferret_trigram_similarity = jaccard_similarity(trigrams_original_text,trigrams_suspicious_text,"Ferret") #Document Similarity using Ferret Technique
containment_trigram_similarity = jaccard_similarity(trigrams_original_text,trigrams_suspicious_text,"Containment") #Document Similarity using Ferret Technique

print("Trigram similarity of documents using ferret technique is: ", ferret_trigram_similarity)
print("Trigram similarity of documents using containment technique is:", containment_trigram_similarity)

[['in', 'ages', 'which', 'have', 'no', 'record', 'these', 'islands', 'were', 'the', 'home', 'of', 'millions', 'of', 'happy', 'birds'], ['the', 'resort', 'of', 'a', 'hundred', 'times', 'more', 'millions', 'of', 'fishes', 'of', 'sea', 'lions', 'and', 'other', 'creatures', 'whose', 'names', 'are', 'not', 'so', 'common', 'the', 'marine', 'residence', 'in', 'fact', 'of', 'innumerable', 'creatures', 'predestined', 'from', 'the', 'creation', 'of', 'the', 'world', 'to', 'lay', 'up', 'a', 'store', 'of', 'wealth', 'for', 'the', 'british', 'farmer'], ['and', 'a', 'store', 'of', 'quite', 'another', 'sort', 'for', 'an', 'immaculate', 'republican', 'government']]

                In ages which have no record these islands were the home of millions of happy birds. the resort of a hundred times more millions of fishes, of sea lions, and other creatures whose names are not so common; the marine residence, in fact, of innumerable creatures predestined from the creation of the world to lay up a store of 

TypeError: generate_ngrams() takes 2 positional arguments but 3 were given

In [None]:
# POS Tagger Function
def grab_files(directory):
    txtlist = []
    import os
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".txt"):
                 txtlist.append(os.path.join(root, file))
    return txtlist

directory = "C:\\Users\\turto\\Documents\\plagiarism-detection\\corpus\\" # change the directory based on your file struc.
filelist = grab_files(directory)

# Pre-Processing for POS Tagging
def tagger(file):
    import string
    import nltk
    try: #takes unicode Tags
        textfile = open(file, "r", encoding="utf8")
        data = textfile.read()
        textsplit = nltk.sent_tokenize(data)
        tokentext = [nltk.word_tokenize(sent) for sent in textsplit]
        standcorpus = [nltk.pos_tag(tokens) for tokens in tokentext]
        print(standcorpus[:1])
        textfile.close()
    except: # takes non-unicode tags 
        textfile = open(file, "r")
        data = textfile.read()
        textsplit = nltk.sent_tokenize(data)
        tokentext = [nltk.word_tokenize(sent) for sent in textsplit]
        standcorpus = [nltk.pos_tag(tokens) for tokens in tokentext]
        print(standcorpus[:1])
        textfile.close()
    return standcorpus

for a in filelist:
    print(tagger(a))