In [1]:
import ast
import os
import nbformat
import Levenshtein
import numpy as np
import math
from statistics import mean

In [2]:
class VectorCollector(ast.NodeVisitor):
    def __init__(self):
        self.vector = [0 for i in range(32)]

    def generic_visit(self, node):
        if isinstance(node, (ast.arguments)):
            self.vector[0] += 1
        elif isinstance(node, (ast.Str)):
            self.vector[1] += 1
        elif isinstance(node, (ast.Param)):
            self.vector[2] += 1
        elif isinstance(node, (ast.Tuple)):
            self.vector[3] += 1
        elif isinstance(node, (ast.Compare)):
            self.vector[4] += 1
        elif isinstance(node, (ast.For)):
            self.vector[5] += 1
        elif isinstance(node, (ast.Add)):
            self.vector[6] += 1
        elif isinstance(node, (ast.Expr)):
            self.vector[7] += 1
        elif isinstance(node, (ast.Assign)):
            self.vector[8] += 1
        elif isinstance(node, (ast.Store)):
            self.vector[9] += 1
        elif isinstance(node, (ast.Import)):
            self.vector[10] += 1
        elif isinstance(node, (ast.Eq)):
            self.vector[11] += 1
        elif isinstance(node, (ast.Num)):
            self.vector[12] += 1
        elif isinstance(node, (ast.Return)):
            self.vector[13] += 1
        elif isinstance(node, (ast.Call)):
            self.vector[14] += 1
        elif isinstance(node, (ast.keyword)):
            self.vector[15] += 1
        elif isinstance(node, (ast.While)):
            self.vector[16] += 1
        elif isinstance(node, (ast.List)):
            self.vector[17] += 1
        elif isinstance(node, (ast.Attribute)):
            self.vector[18] += 1
        elif isinstance(node, (ast.Pass)):
            self.vector[19] += 1
        elif isinstance(node, (ast.Name)):
            self.vector[20] += 1
        elif isinstance(node, (ast.Subscript)):
            self.vector[21] += 1
        elif isinstance(node, (ast.Load)):
            self.vector[22] += 1
        elif isinstance(node, (ast.If)):
            self.vector[23] += 1
        elif isinstance(node, (ast.In)):
            self.vector[24] += 1
        elif isinstance(node, (ast.Index)):
            self.vector[25] += 1
        elif isinstance(node, (ast.ClassDef)):
            self.vector[26] += 1
        elif isinstance(node, (ast.Module)):
            self.vector[27] += 1
        elif isinstance(node, (ast.ImportFrom)):
            self.vector[28] += 1
        elif isinstance(node, (ast.FunctionDef)):
            self.vector[29] += 1
        elif isinstance(node, (ast.alias)):
            self.vector[30] += 1
        elif isinstance(node, (ast.Try)):
            self.vector[31] += 1

        ast.NodeVisitor.generic_visit(self, node)

In [3]:
def parse_code(code):
    '''
    将一段代码，解析为一个1*32的向量，如果不能解析，则返回None
    :param code:
    :return:
    '''
    try:
        result = ast.parse(code)
        visitor = VectorCollector()
        visitor.generic_visit(result)
        return visitor.vector

    except Exception as e:
        print(e)
        return None


In [4]:
def isComment(line):
    stripped = line.strip()
    if stripped and stripped[0] == '#':
        return True
    return False


def isMagic(line):
    stripped = line.strip()
    if stripped and stripped[0] == '%':
        return True
    return False


def sanitize(txt):
    lines = txt.split('\n')
    okLines = [l for l in lines if not isComment(l)]
    okLines = [l for l in okLines if not isMagic(l)]
    return '\n'.join(okLines)


def getSourceWithoutTests(filename):
    nb = nbformat.read(filename, 4)
    allCode = ''
    for c in nb.cells:
        if c['cell_type'] == 'code':
            if 'nbgrader' in c['metadata'].keys():
                if c['metadata'].get('editable', True) == False:
                    if c['metadata']['nbgrader'].get('locked', False) == True:
                        # this is a test cell, remove
                        continue

            # print(c)
            allCode += '\n'
            allCode += sanitize(c['source'])

    return allCode

In [5]:
# def bit_product_sum(x, y):
#     return sum([item[0] * item[1] for item in zip(x, y)])


def cosine_similarity(x, y, norm=False):
#     """ 计算两个向量x和y的余弦相似度 """
    assert len(x) == len(y), "len(x) != len(y)"
    zero_list = [0] * len(x)
    if x == zero_list or y == zero_list:
        return float(1) if x == y else float(0)

#     # method 1
    
    res = np.array([[x[i] * y[i], x[i] * x[i], y[i] * y[i]] for i in range(len(x))])
    cos = sum(res[:, 0]) / (np.sqrt(sum(res[:, 1])) * np.sqrt(sum(res[:, 2])))

    # method 2
#     cos = bit_product_sum(x, y) / (np.sqrt(bit_product_sum(x, x)) * np.sqrt(bit_product_sum(y, y)))

    # method 3
#     dot_product, square_sum_x, square_sum_y = 0, 0, 0
#     for i in range(len(x)):
#         dot_product += x[i] * y[i]
#         square_sum_x += x[i] * x[i]
#         square_sum_y += y[i] * y[i]
#     cos = dot_product / (np.sqrt(square_sum_x) * np.sqrt(square_sum_y))
#    method 4
#     AB = 0
#     A2 = 0
#     B2 = 0
#     for a,b in zip(x,y):
#         AB +=a*b
#         A2 +=a**2
#         B2 +=b**2
#     cos = AB/(math.sqrt(A2)*math.sqrt(B2))
#     print (cos)
#     return cos
    if -1<= cos <= 0:
        return (0.5 * cos + 0.5)
    else:
        return cos
#     return 0.5 * cos + 0.5 if norm else cos  # 归一化到[0, 1]区间内

In [6]:
# Levenshtein distance
def edit_distance(code1, code2):
    return 1 - (Levenshtein.distance(code1, code2)) / (max(len(code1), len(code2)))

In [7]:
def generate_str(ref, candidate, percent, error=False):
    '''
    通过模板，生成HTML结果文件中的内容
    :param ref:
    :param candidate:
    :param percent:
    :return:
    '''
    if error is True:
        li_template = '''<li style="color:green;"><span>invalid file : {0}</li></br>'''
        return li_template.format(ref)

    li_template = '''<li style="color:{0};"><span>File 1 : {1}&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span>File 2 : {2}&nbsp;</span><span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;{3}%</span></li></br>'''
    if percent >= 70:
        color = 'red'
    elif percent >= 30:
        color = 'blue'
    else:
        color = 'black'
    return li_template.format(color, ref, candidate, round(percent, 2))

In [8]:
if __name__ == '__main__':
    # 用来生成结果文件的模板
    html_template = '''
        <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>抄袭检测</title>
    </head>
    <body>
    <h1 style="text-align: center;">抄袭检测</h1>
    <ul>
      {0}
    </ul>
    </body>
    
    </html>
    '''
    
    # 读取所有的文件目录
    path = './data/Assignment 8/RandomWalk/last/'

    filenames = []
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        if file_path.endswith('ipynb'):
            filenames.append(file_path)
    invalid_files = []
    valid_files = []
    vectors = []
    codes = []

    for filename in filenames:
        code = getSourceWithoutTests(filename)
        vector = parse_code(code)
        if vector is None :
            invalid_files.append(filename)
        else:
            valid_files.append(filename)
            vectors.append(vector)
            codes.append(code)
    # 计算平均值
    avg_vector = [0 for i in range(32)]
    for index in range(len(vectors)):
        cur_vector = vectors[index]
        for i in range(32):
            avg_vector[i] += cur_vector[i]
    avg_vector = [i/len(vectors) for i in avg_vector]
    for index in range(len(vectors)):
        cur_vector = vectors[index]
        for i in range(32):
            cur_vector[i] -= avg_vector[i]
#     print(avg_vector)

    li_str = ''
#     print('valid',valid_files)
    print('invalid', invalid_files)
    # write invalied files
    for file in invalid_files:
        print(file)
        li_str += generate_str(file, None, 0, True)
    results = []
    for i in range(len(valid_files)):
        source_file = valid_files[i]
        for j in range(i + 1, len(valid_files)):
            target_file = valid_files[j]
            cos_dis = cosine_similarity(vectors[i], vectors[j])
            # edit_dis = edit_distance(codes[i], codes[j])
#             print(i, j, cos_dis)
            results.append([source_file, target_file, cos_dis])
    # 排序
    sorted_results = reversed(sorted(results, key=lambda x: x[-1]))
    # 生成html内容
    for result in sorted_results:
        li_str += generate_str(result[0], result[1], result[2]*100)
    # 将结果写入html

    with open('比较结果.html', 'w') as f:
        f.write(html_template.format(li_str))


expected an indented block (<unknown>, line 43)
invalid syntax (<unknown>, line 25)
expected an indented block (<unknown>, line 28)
unexpected indent (<unknown>, line 68)
invalid syntax (<unknown>, line 56)
invalid syntax (<unknown>, line 64)
invalid syntax (<unknown>, line 61)
invalid ['./data/Assignment 8/RandomWalk/last/07bd49.ipynb', './data/Assignment 8/RandomWalk/last/13503f.ipynb', './data/Assignment 8/RandomWalk/last/1bf4cc.ipynb', './data/Assignment 8/RandomWalk/last/1e5a7c.ipynb', './data/Assignment 8/RandomWalk/last/5c1549.ipynb', './data/Assignment 8/RandomWalk/last/99e233.ipynb', './data/Assignment 8/RandomWalk/last/e0a742.ipynb']
./data/Assignment 8/RandomWalk/last/07bd49.ipynb
./data/Assignment 8/RandomWalk/last/13503f.ipynb
./data/Assignment 8/RandomWalk/last/1bf4cc.ipynb
./data/Assignment 8/RandomWalk/last/1e5a7c.ipynb
./data/Assignment 8/RandomWalk/last/5c1549.ipynb
./data/Assignment 8/RandomWalk/last/99e233.ipynb
./data/Assignment 8/RandomWalk/last/e0a742.ipynb


In [59]:
file=('./data/Assignment 8/RandomWalk/last/test/0bca19.ipynb')

f_code = getSourceWithoutTests(file)
result = ast.parse(f_code)
visitor = VectorCollector()
visitor.generic_visit(result)
print(visitor.vector)


[3, 17, 0, 9, 0, 8, 11, 18, 41, 51, 1, 0, 49, 2, 61, 25, 0, 2, 57, 0, 169, 11, 197, 0, 0, 10, 1, 1, 1, 3, 2, 0]


In [31]:
file2=('./data/Assignment 8/RandomWalk/last/test/0c8ad5.ipynb')

f_code2 = getSourceWithoutTests(file2)
result2 = ast.parse(f_code2)
visitor2 = VectorCollector()
visitor2.generic_visit(result2)
print(visitor2.vector)

[3, 13, 0, 9, 0, 9, 10, 27, 56, 70, 1, 0, 50, 2, 76, 10, 0, 6, 69, 0, 228, 15, 257, 0, 0, 15, 1, 1, 1, 3, 2, 0]


In [33]:
file3=('./data/Assignment 8/RandomWalk/last/test/0d5192.ipynb')

f_code3 = getSourceWithoutTests(file3)
result3 = ast.parse(f_code3)
visitor3 = VectorCollector()
visitor3.generic_visit(result3)
print(visitor3.vector)

[3, 18, 0, 6, 0, 10, 17, 23, 41, 57, 1, 0, 51, 2, 62, 11, 0, 5, 54, 0, 193, 16, 217, 0, 0, 16, 1, 1, 1, 3, 2, 0]


In [34]:
print(list(np.array(visitor2.vector)-np.array(avg_vector)))

[0.0, -3.0, 0.0, 1.0, 0.0, 0.0, -2.666666666666666, 4.333333333333332, 10.0, 10.666666666666664, 0.0, 0.0, 0.0, 0.0, 9.666666666666671, -5.333333333333334, 0.0, 1.666666666666667, 9.0, 0.0, 31.333333333333343, 1.0, 33.33333333333334, 0.0, 0.0, 1.333333333333334, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [35]:
print(avg_vector)

[3.0, 16.0, 0.0, 8.0, 0.0, 9.0, 12.666666666666666, 22.666666666666668, 46.0, 59.333333333333336, 1.0, 0.0, 50.0, 2.0, 66.33333333333333, 15.333333333333334, 0.0, 4.333333333333333, 60.0, 0.0, 196.66666666666666, 14.0, 223.66666666666666, 0.0, 0.0, 13.666666666666666, 1.0, 1.0, 1.0, 3.0, 2.0, 0.0]


In [37]:

cos_dis = cosine_similarity(list(np.array(visitor2.vector)-np.array(avg_vector)),list(np.array(visitor3.vector)-np.array(avg_vector)))

# print(0.5*cos_dis+0.5)
print (cos_dis)

0.15290994481588788
