In [1]:
import nltk
from nltk import ngrams
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import string

In [2]:
# param type: string
# return type : list
def text_segmentation(text):
    """Split a text into sentences.

    Args:
        text: the raw text (a string) to segment.

    Returns:
        The result of NLTK's ``sent_tokenize(text)`` (a list of sentences).
    """
    return sent_tokenize(text)

In [3]:
# param type: list
# return type: list[list] (one list of tokens per sentence)
def text_tokenization(text):
    """Tokenize every sentence into words and strip punctuation tokens.

    Args:
        text: an iterable of sentences (strings).

    Returns:
        list[list]: for each sentence, in order, the output of
        ``remove_punctuation(word_tokenize(sentence))``.
    """
    return [remove_punctuation(word_tokenize(sentence)) for sentence in text]

In [4]:
# param type: string
# return type: string
def text_lowercase(text):
    """Return a lower-cased copy of ``text`` (delegates to ``text.lower()``)."""
    lowered = text.lower()
    return lowered

In [138]:
# functionality: flatten the nested list into a one-dimensional list
# param type:list[list]
# return type:list
def serialize(sentences):
    """Flatten a list of token lists into one flat list of tokens.

    Args:
        sentences: list[list]; each inner list holds the tokens of one sentence.

    Returns:
        list: every token of every sentence, in order. An empty input gives [].

    A single-sentence input is flattened like any other. Returning the nested
    list unchanged in that case would make the n-gram and word-count steps
    operate on a list that contains one list instead of on tokens.
    """
    return [token for sent in sentences for token in sent]

In [6]:
# functionality: generate n-grams list
# param type: list / int
# return list
def generate_ngrams(sentences,n):
    """Build the list of n-grams over all tokens of a tokenized document.

    Args:
        sentences: list[list] of tokens; flattened with ``serialize`` first,
            so n-grams may span sentence boundaries.
        n: size of each n-gram.

    Returns:
        list: the items yielded by ``nltk.ngrams`` over the flattened tokens.
    """
    flat_tokens = serialize(sentences)
    return list(nltk.ngrams(flat_tokens, n))

In [7]:
# functionality: calculate the count of words
# param type: list
# return: FreqDist
from nltk import FreqDist
def word_counts(word_tokens):
    """Count how often each token occurs.

    Args:
        word_tokens: flat list of tokens.

    Returns:
        FreqDist: NLTK frequency distribution built from ``word_tokens``.
        Callers that need a sorted view can call ``.most_common()`` on it.
    """
    # No sorting is done here: a sorted copy that is never used would only
    # cost time on every call.
    return FreqDist(word_tokens)

In [8]:
# functionality: remove stop-word
def stop_words_removal(token_list):
    """Return the tokens that are not English stop words.

    Args:
        token_list: flat list of tokens (hashable values such as strings).

    Returns:
        list: tokens of ``token_list``, in order, that are absent from NLTK's
        English stop-word list. The comparison is exact, so the caller decides
        about lower-casing.
    """
    # The corpus reader returns a list; a set makes each membership test
    # constant-time instead of a scan over the whole stop-word list.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    return [w for w in token_list if w not in stopword_set]

In [9]:
# param: one sentence
# type: list
# return type: list
def remove_punctuation(sentence):
    """Drop the tokens that consist solely of punctuation characters.

    Args:
        sentence: list of string tokens for one sentence.

    Returns:
        list: the tokens, in order, that contain at least one character
        outside ``string.punctuation``. Empty tokens are dropped as well.

    The check is done per character. Testing ``token in string.punctuation``
    would be a substring test against the punctuation string and would keep
    multi-character tokens such as '...', '--', '``' or "''".
    """
    punctuation_chars = set(string.punctuation)
    return [
        token for token in sentence
        if not all(ch in punctuation_chars for ch in token)
    ]

In [182]:
# param: list[list],list[list]
def LCS_document(ori_list,sus_list,lcs_func=None):
    """Compute LCS-based similarity features between two tokenized documents.

    Every suspicious sentence is scored against every original sentence.

    Args:
        ori_list: list[list], tokenized sentences of the original document.
        sus_list: list[list], tokenized sentences of the suspicious document.
        lcs_func: optional callable ``(original_tokens, suspicious_tokens)``
            returning a number for one sentence pair. Defaults to
            ``LCS_sentence``.

    Returns:
        dict with three entries:
        - 'longest matching sequence': the largest pair score (0 if there are
          no sentence pairs).
        - 'sum of longest matching sequence normalized by the number of
          suspecious sentence': the best score of each suspicious sentence,
          summed and divided by the number of suspicious sentences, rounded to
          4 decimals.
        - 'average of matching sequence': mean of all pair scores, rounded to
          4 decimals.
        All three are 0 when either document has no sentences.
    """
    if lcs_func is None:
        # Resolved at call time so the notebook cell order does not matter.
        lcs_func = LCS_sentence

    pair_scores = []    # score of every (suspicious, original) sentence pair
    best_per_sus = []   # best score found for each suspicious sentence
    for sus_sent in sus_list:
        row = [lcs_func(ori_sent, sus_sent) for ori_sent in ori_list]
        pair_scores.extend(row)
        # A single original sentence still yields a valid best score.
        if row:
            best_per_sus.append(max(row))

    n_pairs = len(sus_list) * len(ori_list)
    max_len = max(pair_scores) if pair_scores else 0
    # Guard both divisions so empty documents give 0 instead of raising.
    sus_sum_nol = sum(best_per_sus) / len(sus_list) if best_per_sus else 0
    average = sum(pair_scores) / n_pairs if n_pairs else 0

    feature = {}
    feature['longest matching sequence'] = max_len
    feature['sum of longest matching sequence normalized by the number of suspecious sentence'] = round(sus_sum_nol,4)
    feature['average of matching sequence'] = round(average,4)
    return feature

In [144]:
def LCS_sentence(original_tokens,suspecious_tokens):
    """Return the length of the longest common subsequence of two token lists.

    Tokens are compared with ``==``. Matching tokens must appear in the same
    order in both lists but do not have to be adjacent.

    Args:
        original_tokens: tokens of the original sentence.
        suspecious_tokens: tokens of the suspicious sentence.

    Returns:
        int: length of the longest common subsequence (0 if either is empty).
    """
    width = len(suspecious_tokens) + 1
    # Each table row depends only on the row above it, so keep just that one.
    prev_row = [0] * width
    for ori_tok in original_tokens:
        cur_row = [0] * width
        for col in range(1, width):
            if ori_tok == suspecious_tokens[col - 1]:
                cur_row[col] = prev_row[col - 1] + 1
            else:
                cur_row[col] = max(prev_row[col], cur_row[col - 1])
        prev_row = cur_row
    return prev_row[-1]

# Sanity check: the longest common subsequence of X and Y is ['aa', 'dee'],
# so this prints 2.
X = ['aa','bc','dee']
Y = ['aa','dfa','dfa','dee']
print(LCS_sentence(X,Y))

2


In [10]:
def jaccard_similarity(x,y,technique="Ferret"):
    """Set-overlap similarity between two collections of (hashable) items.

    Args:
        x: items of the original document (e.g. its trigrams).
        y: items of the suspicious document.
        technique: "Ferret" divides the number of shared items by the size of
            the union of both sets. Any other value selects the containment
            measure, which divides by the number of distinct items in ``y``.

    Returns:
        float: shared items / denominator, or 0.0 when the denominator is
        empty (both inputs empty, or ``y`` empty for containment).
    """
    set_x, set_y = set(x), set(y)
    shared = len(set_x & set_y)
    if technique == "Ferret":
        denominator = len(set_x | set_y)  # distinct items across both documents
    else:
        denominator = len(set_y)          # distinct items of the suspicious document
    if denominator == 0:
        # Nothing to compare against: report no similarity instead of dividing by zero.
        return 0.0
    return shared / float(denominator)

In [72]:
# functionality: the number of sentences in a passage
# param: list[list]
# return int
def sentence_length(sentences):
    """Return how many sentences the passage contains (``len(sentences)``)."""
    n_sentences = len(sentences)
    return n_sentences

In [70]:
# test code: a sample original passage and a reworded (suspicious) version of
# it, used by the following statements to try out the functions defined above.
original_text = '''
                In ages which have no record these islands were the home of millions of happy birds. the resort of a hundred times more millions of fishes, of sea lions, and other creatures whose names are not so common; the marine residence, in fact, of innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer. and a store of quite another sort for an immaculate Republican government.
                '''
suspicious_text = '''
                 Long ago, when there was no written history, these islands were the home of millions of happy birds; the resort of a hundred times more millions of fishes, sea lions, and other creatures. Here lived innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer, and a store of quite another sort for an immaculate Republican government.
                 '''
# Preprocess the original passage: lowercase -> sentences -> word tokens -> trigrams.
origial_text_lower_case = text_lowercase(original_text)

original_sent_tokens = text_segmentation(origial_text_lower_case)
original_word_tokens = text_tokenization(original_sent_tokens) # tokenize each sentence, then remove punctuation tokens
original_trigrams =  generate_ngrams(original_word_tokens,3)

# Same steps for the suspicious passage.
suspecious_text_lower_case = text_lowercase(suspicious_text)

suspecious_sent_tokens = text_segmentation(suspecious_text_lower_case)
suspecious_word_tokens = text_tokenization(suspecious_sent_tokens) # tokenize each sentence, then remove punctuation tokens
suspecious_trigrams =  generate_ngrams(suspecious_word_tokens,3)
# Number of sentences in the suspicious passage.
sent_length = sentence_length(suspecious_sent_tokens)

ferret_trigram_similarity = jaccard_similarity(original_trigrams,suspecious_trigrams,"Ferret") # document similarity using the Ferret technique (shared trigrams / union)
containment_trigram_similarity = jaccard_similarity(original_trigrams,suspecious_trigrams,"Containment") # document similarity using the containment measure (shared trigrams / suspicious trigrams)

print("Trigram similarity of documents using ferret technique is: ", ferret_trigram_similarity)
print("Trigram similarity of documents using containment technique is:", containment_trigram_similarity)



Trigram similarity of documents using ferret technique is:  0.5747126436781609
Trigram similarity of documents using containment technique is: 0.78125


In [55]:
# Word frequencies of the original passage: flatten the per-sentence tokens,
# count them, and print each (word, count) pair in most_common() order.
ori_word_counts = word_counts(serialize(original_word_tokens))
for word in ori_word_counts.most_common():
    print(word)

('of', 9)
('the', 6)
('a', 3)
('in', 2)
('millions', 2)
('and', 2)
('creatures', 2)
('store', 2)
('for', 2)
('ages', 1)
('which', 1)
('have', 1)
('no', 1)
('record', 1)
('these', 1)
('islands', 1)
('were', 1)
('home', 1)
('happy', 1)
('birds', 1)
('resort', 1)
('hundred', 1)
('times', 1)
('more', 1)
('fishes', 1)
('sea', 1)
('lions', 1)
('other', 1)
('whose', 1)
('names', 1)
('are', 1)
('not', 1)
('so', 1)
('common', 1)
('marine', 1)
('residence', 1)
('fact', 1)
('innumerable', 1)
('predestined', 1)
('from', 1)
('creation', 1)
('world', 1)
('to', 1)
('lay', 1)
('up', 1)
('wealth', 1)
('british', 1)
('farmer', 1)
('quite', 1)
('another', 1)
('sort', 1)
('an', 1)
('immaculate', 1)
('republican', 1)
('government', 1)


In [73]:
# Document-level LCS: treat each passage as one flat token list, measure the
# longest common subsequence, and normalise it by the number of suspicious
# sentences.
ori_flatten_tokens = serialize(original_word_tokens)
print(ori_flatten_tokens)
sus_flatten_tokens = serialize(suspecious_word_tokens)
# LCS_sentence is the LCS helper defined in this notebook; it returns only the
# length (an int), so there is nothing to unpack.
num = LCS_sentence(ori_flatten_tokens,sus_flatten_tokens)
print(sent_length)
normallization_LCS = num / sent_length
print(normallization_LCS)

['in', 'ages', 'which', 'have', 'no', 'record', 'these', 'islands', 'were', 'the', 'home', 'of', 'millions', 'of', 'happy', 'birds', 'the', 'resort', 'of', 'a', 'hundred', 'times', 'more', 'millions', 'of', 'fishes', 'of', 'sea', 'lions', 'and', 'other', 'creatures', 'whose', 'names', 'are', 'not', 'so', 'common', 'the', 'marine', 'residence', 'in', 'fact', 'of', 'innumerable', 'creatures', 'predestined', 'from', 'the', 'creation', 'of', 'the', 'world', 'to', 'lay', 'up', 'a', 'store', 'of', 'wealth', 'for', 'the', 'british', 'farmer', 'and', 'a', 'store', 'of', 'quite', 'another', 'sort', 'for', 'an', 'immaculate', 'republican', 'government']
2
29.0


In [105]:
# functionality: read single file from corpus
# param: file name
# return type : str
import codecs
def readFile(filename, corpus_dir='C:\\File\\CIS-668\\Project\\corpus-final09\\'):
    """Read one corpus file and return its text.

    Args:
        filename: name of the file inside ``corpus_dir``.
        corpus_dir: directory that holds the corpus files. Defaults to the
            location this notebook was written against; pass another
            directory to run it elsewhere.

    Returns:
        str: the file content decoded as UTF-8. Bytes that are not valid
        UTF-8 are dropped (``errors='ignore'``). ``codecs.open`` performs no
        newline translation, so line endings come back as stored.

    Raises:
        OSError: if the file cannot be opened.
    """
    import os  # local import so this cell does not depend on another cell's imports

    path = os.path.join(corpus_dir, filename)
    with codecs.open(path, 'r', encoding='utf-8', errors='ignore') as f:
        # Named `content` rather than `str` to avoid shadowing the builtin type.
        content = f.read()
    return content
#readFile('orig_taska.txt')

'In object-oriented programming, inheritance is a way to form new classes (instances of which are called objects) using classes that have already been defined. The inheritance concept was invented in 1967 for Simula.\r\n\r\nThe new classes, known as derived classes, take over (or inherit) attributes and behavior of the pre-existing classes, which are referred to as base classes (or ancestor classes). It is intended to help reuse existing code with little or no modification.\r\n\r\nInheritance provides the support for representation by categorization in computer languages. Categorization is a powerful mechanism number of information processing, crucial to human learning by means of generalization (what is known about specific entities is applied to a wider group given a belongs relation can be established) and cognitive economy (less information needs to be stored about each specific entity, only its particularities).\r\n\r\nInheritance is also sometimes called generalization, because t

In [117]:
## read original files from folder
from collections import defaultdict
def read_ori_files():
    """Load the five original answer files, orig_taska.txt to orig_taske.txt.

    Each file is read with ``readFile``; a file whose content is empty is
    left out.

    Returns:
        defaultdict(str): file name -> file text. Because of the default
        factory, looking up a name that was not loaded yields ''.
    """
    originals = defaultdict(str)   ##original files
    for task_letter in 'abcde':
        name = 'orig_task' + task_letter + '.txt'
        content = readFile(name)
        #content = content.decode('utf8')
        if content:
            originals[name] = content
    return originals
#read_ori_files()

In [169]:
# functionality: read the excel sheet and save the data in a Pandas object
# return type: dataframe
import pandas as pd
def readXLS():
    """Load the corpus spreadsheet into a DataFrame.

    Reads the sheet at position 1 (``sheet_name=1``, zero-based) of the
    workbook and drops the columns that the rest of the notebook does not use.

    Returns:
        pandas.DataFrame: the sheet without the 'Group', 'Person',
        'Native English', 'Knowledge' and 'Difficulty' columns.
    """
    workbook_path = r'C:\\File\\CIS-668\\Project\\corpus-final09.xls'
    unused_columns = ['Group','Person','Native English','Knowledge','Difficulty']
    return pd.read_excel(workbook_path, sheet_name=1).drop(unused_columns, axis=1)
# Quick check of the spreadsheet: load it and print the 'File' value of every
# row whose 'Task' is 'b'.
df = readXLS()
a = df[df['Task']=='b']
for index,row in a.iterrows():
    print(row['File'])
#print(type(df.iloc[0]))

g0pA_taskb.txt
g0pB_taskb.txt
g0pC_taskb.txt
g0pD_taskb.txt
g0pE_taskb.txt
g1pA_taskb.txt
g1pB_taskb.txt
g1pD_taskb.txt
g2pA_taskb.txt
g2pB_taskb.txt
g2pC_taskb.txt
g2pE_taskb.txt
g3pA_taskb.txt
g3pB_taskb.txt
g3pC_taskb.txt
g4pB_taskb.txt
g4pC_taskb.txt
g4pD_taskb.txt
g4pE_taskb.txt


In [118]:
# functionality: read suspicious file
# return type: dict, key is file name, value is the text of this file
from collections import defaultdict

def read_sus_files():
    """Read every suspicious file listed in the corpus spreadsheet.

    The file names are taken from the first column of the DataFrame returned
    by ``readXLS`` and each one is loaded with ``readFile``.

    Returns:
        defaultdict(str): file name -> file text. Because of the default
        factory, looking up a name that was not loaded yields ''.
    """
    df = readXLS()
    sus_dict = defaultdict(str)   # suspicious files
    # Select the first column by position in one step. Indexing a row Series
    # with the integer 0 relies on a positional fallback for label-indexed
    # Series that pandas has deprecated.
    for filename in df.iloc[:, 0]:
        sus_dict[filename] = readFile(filename)
    return sus_dict
#read_sus_files()

In [122]:
# functionality: preprocess the raw text (lowercase, split into sentences, tokenize)
# param: str
# return type: list
def pre_processing(text):
    """Turn raw text into per-sentence token lists.

    Pipeline: ``text_lowercase`` -> ``text_segmentation`` ->
    ``text_tokenization``.

    Args:
        text: the raw document text.

    Returns:
        The output of ``text_tokenization``: one token list per sentence.
    """
    sentences = text_segmentation(text_lowercase(text))
    return text_tokenization(sentences)

In [183]:
# Main program: for each of the five tasks (a-e), preprocess the original
# answer and every suspicious file listed for that task, compute the LCS
# features of the pair, and collect one (feature dict, category) tuple per
# suspicious file. feature_set is therefore ordered by task, then by the row
# order of the spreadsheet.
import string

ori_dict = read_ori_files()
sus_dict = read_sus_files()
sus_df = readXLS()
feature_set = []
for idx1 in range(5):
    # chr(97 + idx1) walks through the task letters 'a'..'e'.
    token_ori = pre_processing(ori_dict['orig_task'+chr(97+idx1)+'.txt'])
    print('orig_task'+chr(97+idx1)+'.txt')
    # Rows of the spreadsheet that belong to the current task.
    temp_df = sus_df[sus_df['Task'] == chr(97+idx1)]
    for index, row in temp_df.iterrows():
        text_name = row['File']
        cat = row['Category']
        token_sus = pre_processing(sus_dict[text_name])
        feature = LCS_document(token_ori,token_sus) # get the feature set for one file
        feature_set.append((feature,cat))


orig_taska.txt
orig_taskb.txt
orig_taskc.txt
orig_taskd.txt
orig_taske.txt


In [188]:
# Inspect one entry of feature_set: a (feature dict, category) tuple.
feature_set[93]

({'longest matching sequence': 25,
  'sum of longest matching sequence normalized by the number of suspecious sentence': 12.0833,
  'average of matching sequence': 2.4167},
 'heavy')

In [189]:
# Split by position: the first 80 (feature dict, category) tuples form the
# training set and the remaining ones the test set.
# NOTE(review): feature_set is appended task by task, so this split is neither
# shuffled nor stratified - confirm that this is intended.
trainset = feature_set[:80]
testset = feature_set[80:]

In [190]:
# Machine learning: train NLTK's Naive Bayes classifier on the
# (feature dict, category) tuples of the training set.
# NOTE(review): the informative-features listing shows exact numeric values
# (e.g. "= 7"), which suggests each distinct number is handled as its own
# category - consider binning the features; TODO confirm.
classifier = nltk.NaiveBayesClassifier.train(trainset)


In [191]:
# Accuracy of the trained classifier on the held-out test set.
print(nltk.classify.accuracy(classifier,testset))

0.8


In [192]:
# Print up to 20 of the classifier's most informative feature values.
classifier.show_most_informative_features(20)

Most Informative Features
longest matching sequence = 7                 non : heavy  =      2.9 : 1.0
longest matching sequence = 6                 non : cut    =      2.7 : 1.0
longest matching sequence = 46                cut : heavy  =      1.8 : 1.0
longest matching sequence = 28                cut : heavy  =      1.8 : 1.0
longest matching sequence = 16              heavy : light  =      1.6 : 1.0
longest matching sequence = 22              light : cut    =      1.6 : 1.0
longest matching sequence = 34              light : cut    =      1.6 : 1.0
longest matching sequence = 13              heavy : non    =      1.5 : 1.0
longest matching sequence = 12              heavy : non    =      1.5 : 1.0
sum of longest matching sequence normalized by the number of suspecious sentence = 8.5             heavy : non    =      1.3 : 1.0
sum of longest matching sequence normalized by the number of suspecious sentence = 7.0             heavy : non    =      1.3 : 1.0
sum of longest matching sequ