In [1]:
import ast
import html
import math
import os,shutil

import astunparse
import Levenshtein
import nbformat
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Collect vectors

class VectorCollector(ast.NodeVisitor):
    """Count how often selected AST node types occur in a syntax tree.

    ``vector`` holds one counter per entry of ``FEATURE_NODES``:
    ``vector[i]`` is the number of visited nodes whose class is named
    ``FEATURE_NODES[i]``.  Nodes of any other type are traversed but not
    counted.

    Node classes are matched by name rather than through ``ast.Str`` /
    ``ast.Num`` attribute access, because those aliases are deprecated and
    no longer exist on recent Python versions.  String and numeric literals
    that the parser reports as ``ast.Constant`` are counted in the ``Str``
    and ``Num`` slots respectively, which is what the deprecated aliases
    matched.
    """

    # Feature order is part of the contract: vector[i] counts FEATURE_NODES[i].
    FEATURE_NODES = (
        'arguments', 'Str', 'Param', 'Tuple', 'Compare', 'For', 'Add', 'Expr',
        'Assign', 'Store', 'Import', 'Eq', 'Num', 'Return', 'keyword', 'While',
        'List', 'Attribute', 'Pass', 'Name', 'If', 'In', 'Index', 'ClassDef',
        'Module', 'ImportFrom', 'FunctionDef', 'Try', 'Subscript', 'Call',
        'Load', 'alias',
    )
    _SLOT = {name: slot for slot, name in enumerate(FEATURE_NODES)}

    def __init__(self):
        self.vector = [0] * len(self.FEATURE_NODES)

    def _slot_for(self, node):
        """Return the vector index for ``node``, or None if it is not counted."""
        kind = type(node).__name__
        if kind == 'Constant':
            value = getattr(node, 'value', None)
            if isinstance(value, str):
                kind = 'Str'
            # bool is a subclass of int but was never matched by ast.Num.
            elif isinstance(value, (int, float, complex)) and not isinstance(value, bool):
                kind = 'Num'
        return self._SLOT.get(kind)

    def generic_visit(self, node):
        """Count ``node`` if it is a tracked type, then visit its children."""
        slot = self._slot_for(node)
        if slot is not None:
            self.vector[slot] += 1

        ast.NodeVisitor.generic_visit(self, node)

In [3]:
def parse_code(code):
    """Turn a piece of Python source into its node-count feature vector.

    The source is parsed with ``ast.parse`` and walked by a
    ``VectorCollector``; the collector's ``vector`` (a list of 32 counters)
    is returned.  If anything goes wrong, the exception is printed and
    ``None`` is returned instead.
    """
    try:
        tree = ast.parse(code)
        collector = VectorCollector()
        collector.generic_visit(tree)
    except Exception as err:
        # Unparsable submissions are reported and signalled with None.
        print(err)
        return None
    else:
        return collector.vector


In [4]:
# Filter code
def isComment(line):
    """Return True when the first non-whitespace character of ``line`` is '#'."""
    return line.strip().startswith('#')


def isMagic(line):
    """Return True when the first non-whitespace character of ``line`` is '%'."""
    return line.strip().startswith('%')


def sanitize(txt):
    """Drop blank lines, comment lines and magic lines from ``txt``.

    The text is split on newlines; every line that is empty after stripping,
    or that ``isComment`` / ``isMagic`` flags, is discarded.  The remaining
    lines are joined back with newlines in their original order.
    """
    kept = []
    for line in txt.split('\n'):
        if not line.strip():
            continue
        if isComment(line) or isMagic(line):
            continue
        kept.append(line)
    return '\n'.join(kept)


def getSourceWithoutTests(filename):
    """Concatenate the sanitized source of a notebook's non-test code cells.

    The notebook is read through ``nbformat.read(filename, 4)``.  Only cells
    whose ``cell_type`` is ``'code'`` are considered.  A code cell is treated
    as a test cell, and skipped, when its metadata has an ``'nbgrader'``
    entry, its ``'editable'`` flag equals False and the nbgrader ``'locked'``
    flag equals True.  Every remaining cell contributes a newline followed by
    ``sanitize(source)``.
    """
    notebook = nbformat.read(filename, 4)
    pieces = []
    for cell in notebook.cells:
        if cell['cell_type'] != 'code':
            continue

        meta = cell['metadata']
        is_locked_test = (
            'nbgrader' in meta.keys()
            and meta.get('editable', True) == False
            and meta['nbgrader'].get('locked', False) == True
        )
        if is_locked_test:
            # this is a test cell, remove
            continue

        pieces.append('\n' + sanitize(cell['source']))

    return ''.join(pieces)


In [5]:
# calculate cosine similarity
def cosine_similarity(x, y, norm=False):
    """Return a similarity score in [0, 1] based on the cosine of x and y.

    Parameters
    ----------
    x, y : sequences of numbers with the same length (1-D vectors).
    norm : unused; accepted only so existing callers keep working.

    Returns
    -------
    float
        ``cos`` itself when the cosine is positive, ``0.5 * cos + 0.5`` when
        it lies in [-1, 0], and ``0.0`` when either vector has zero length
        (the cosine is undefined there; the unguarded division would give
        NaN).

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same shape.
    """
    a = np.asarray(x, dtype=float)
    b = np.asarray(y, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            'x and y must have the same shape, got {0} and {1}'.format(a.shape, b.shape)
        )

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0

    # Rounding can push the ratio marginally outside [-1, 1]; without the
    # clip, a value just below -1 would skip the rescaling branch below.
    cos = float(np.clip(np.dot(a, b) / denominator, -1.0, 1.0))

    # Normalized to the [0, 1] interval.
    # NOTE(review): only non-positive cosines are rescaled, so the mapping is
    # not monotonic (cos = 0 gives 0.5 while cos = 0.1 gives 0.1).  Kept as-is
    # because callers compare the result against fixed thresholds.
    if cos <= 0:
        return 0.5 * cos + 0.5
    return cos


In [6]:
def generate_str(ref, candidate, percent, error=False):
    """Build one ``<li>`` entry of the HTML result file.

    Parameters
    ----------
    ref : name of the first file (or of the invalid file when ``error``).
    candidate : name of the second file; ignored when ``error`` is True.
    percent : similarity in percent; it selects the colour (>= 70 red,
        >= 30 blue, otherwise black) and is shown rounded to two decimals.
    error : when exactly ``True``, an "invalid file" entry for ``ref`` is
        returned instead of a comparison entry.

    File names are HTML-escaped so characters such as ``<`` or ``&`` cannot
    break the generated markup.
    """
    if error is True:
        # The opening <span> is closed here; it was left open originally.
        li_template = '''<li style="color:green;"><span>invalid file : {0}</span></li></br>'''
        return li_template.format(html.escape(str(ref)))

    li_template = '''<li style="color:{0};"><span>File 1 : {1}&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span>File 2 : {2}&nbsp;</span><span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;{3}%</span></li></br>'''
    if percent >= 70:
        color = 'red'
    elif percent >= 30:
        color = 'blue'
    else:
        color = 'black'
    return li_template.format(
        color,
        html.escape(str(ref)),
        html.escape(str(candidate)),
        np.round(percent, 2),
    )

In [7]:
if __name__ == '__main__':
    # The template used to generate the result file
    html_template = '''
        <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Plagiarism Detection-AST</title>
    </head>
    <body>
    <h1 style="text-align: center;">Plagiarism Detection-AST</h1>
    <ul>
      {0}
    </ul>
    </body>
    
    </html>
    '''

    # Input and output locations.
    path = './data/Assignment 4/Assignment 4/last/'
    high_dir = path + 'high percentage'
    result_path = './result/Assignment 4/'
    ori_file = './original/Assignment 4/Assignment 4.ipynb'

    # A submission is reported as invalid when its node counts exceed those
    # of the original notebook by this many nodes or fewer (summed over all
    # features), or when it cannot be parsed at all.
    MIN_ADDED_NODES = 30
    # Pairs scoring at least this much are moved into high_dir.
    HIGH_SIMILARITY = 0.9

    # Create the output directories.  mkdir (not makedirs) is used for
    # high_dir so that a missing submissions directory still fails loudly.
    if not os.path.exists(high_dir):
        os.mkdir(high_dir)
    os.makedirs(result_path, exist_ok=True)

    # Feature vector of the original notebook; it is subtracted from every
    # submission below.
    f_ori = getSourceWithoutTests(ori_file)
    vector_ori = parse_code(f_ori)
    if vector_ori is None:
        raise ValueError('cannot parse the original notebook: {0}'.format(ori_file))
    baseline = np.array(vector_ori)

    # Collect the submitted notebooks in a deterministic order.
    filenames = sorted(
        file_path
        for file_path in (os.path.join(path, entry) for entry in os.listdir(path))
        if file_path.endswith('ipynb')
    )

    invalid_files = []
    valid_files = []
    vectors = []
    codes = []  # only needed by the disabled edit-distance comparison

    for filename in filenames:
        code = getSourceWithoutTests(filename)
        vector = parse_code(code)
        added = None if vector is None else np.array(vector) - baseline
        if added is None or added.sum() <= MIN_ADDED_NODES:
            invalid_files.append(filename)
        else:
            valid_files.append(filename)
            vectors.append(added)

    print('invalid', invalid_files)

    # Normalize the feature vectors.  The scaler rejects an empty input, so
    # it is skipped when no submission is valid.
    if vectors:
        stdScaler = StandardScaler()
        vectors = stdScaler.fit_transform(vectors)

    # Calculate the similarity of every unordered pair of valid files.
    results = []
    for i in range(len(valid_files)):
        source_file = valid_files[i]
        for j in range(i + 1, len(valid_files)):
            target_file = valid_files[j]
            cos_dis = cosine_similarity(vectors[i], vectors[j])
            results.append([source_file, target_file, cos_dis])

    # Move every file that takes part in a high-similarity pair, once.
    # Tracking what was already handled guarantees that the second file of a
    # pair is still moved when the first one was moved for an earlier pair.
    moved = set()
    for result in results:
        if result[2] < HIGH_SIMILARITY:
            continue
        for flagged in (result[0], result[1]):
            if flagged in moved:
                continue
            moved.add(flagged)
            try:
                shutil.move(flagged, high_dir)
            except shutil.Error as err:
                # e.g. a file of the same name is already in high_dir
                print('could not move', flagged, ':', err)

    # Sort by similarity, highest first.
    sorted_results = sorted(results, key=lambda x: x[-1], reverse=True)

    # Generate the HTML content: invalid files first, then the ranked pairs.
    li_items = [generate_str(file, None, 0, True) for file in invalid_files]
    li_items.extend(
        generate_str(result[0], result[1], result[2] * 100)
        for result in sorted_results
    )
    li_str = ''.join(li_items)

    # Write results into HTML.  The template declares UTF-8, so the file is
    # written with that encoding regardless of the platform default.
    report_file = os.path.join(result_path, 'Results of Comparison-AST.html')
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(html_template.format(li_str))
    

unexpected indent (<unknown>, line 59)
unexpected indent (<unknown>, line 17)
invalid syntax (<unknown>, line 41)
invalid syntax (<unknown>, line 58)
invalid syntax (<unknown>, line 83)
unexpected indent (<unknown>, line 24)
unexpected indent (<unknown>, line 44)
expected an indented block (<unknown>, line 64)
invalid ['./data/Assignment 4/Assignment 4/last/10f591.ipynb', './data/Assignment 4/Assignment 4/last/13503f.ipynb', './data/Assignment 4/Assignment 4/last/1e5a7c.ipynb', './data/Assignment 4/Assignment 4/last/a8a21b.ipynb', './data/Assignment 4/Assignment 4/last/b541d0.ipynb', './data/Assignment 4/Assignment 4/last/b71b61.ipynb', './data/Assignment 4/Assignment 4/last/c708ed.ipynb', './data/Assignment 4/Assignment 4/last/f632e4.ipynb', './data/Assignment 4/Assignment 4/last/f6c959.ipynb']


In [8]:
# Summary statistics over the pairwise comparison results.
# Each entry of `results` is [file 1, file 2, similarity]; the similarity is
# compared against 0.7 and 0.9 below.
per = [entry[2] for entry in results]
total = len(results)
high7 = sum(1 for score in per if score >= 0.7)
high9 = sum(1 for score in per if score >= 0.9)


def _share(count):
    """Percentage of all pairs, or 0.0 when there are no pairs at all."""
    return round(count / total * 100, 2) if total else 0.0


# With no pairs there is nothing to average; report 0.0 instead of dividing
# by zero or averaging an empty list.
avg = np.mean(per) if per else 0.0

print(high7)
print(high9)
print(_share(high7), '%')
print(_share(high9), '%')
print(total)
print(len(invalid_files))
print(round((avg) * 100, 2), '%')

1193
98
7.41 %
0.61 %
16110
9
36.79 %
