In [1]:
import ast
import html
import math
import os
import re
import shutil

import Levenshtein
import nbformat
import numpy as np
import pandas as pd

In [2]:
def isComment(line):
    """Return True when the first non-blank character of `line` is '#'.

    Blank or whitespace-only lines are not comments and give False.
    """
    return line.lstrip().startswith('#')


def isMagic(line):
    """Return True when the first non-blank character of `line` is '%'.

    This covers both line magics (``%name``) and the header line of cell
    magics (``%%name``). Blank lines give False.
    """
    first_char = line.strip()[:1]
    return first_char == '%'


def sanitize(txt):
    """Drop blank lines, comment lines and magic lines from a cell's source.

    The text is split on '\\n'; every remaining line is kept unchanged
    (including its indentation) and the result is joined with '\\n' again.
    """
    kept = []
    for row in txt.split('\n'):
        if not row.strip():
            # blank or whitespace-only line
            continue
        if isComment(row) or isMagic(row):
            continue
        kept.append(row)
    return '\n'.join(kept)


def getSourceWithoutTests(filename):
    """Return the sanitized source of a notebook's code cells as one string.

    The notebook is read with ``nbformat`` as version 4. A code cell is
    treated as a test cell and skipped when its metadata has an 'nbgrader'
    entry, its 'editable' flag is False and the nbgrader 'locked' flag is
    True. Every remaining code cell contributes a newline followed by its
    source after ``sanitize``.
    """
    notebook = nbformat.read(filename, 4)
    pieces = []
    for cell in notebook.cells:
        if cell['cell_type'] != 'code':
            continue

        meta = cell['metadata']
        is_test_cell = (
            'nbgrader' in meta.keys()
            and meta.get('editable', True) == False
            and meta['nbgrader'].get('locked', False) == True
        )
        if is_test_cell:
            continue

        pieces.append('\n' + sanitize(cell['source']))

    return ''.join(pieces)

In [3]:
#Collect Variable Names

class VarnameCollector(ast.NodeVisitor):
    """AST visitor that records the identifier of every ``Name`` node it meets.

    The identifiers are appended, in visiting order and with repetitions,
    to the list stored under ``stats["variable"]``.
    """

    def __init__(self):
        self.stats = dict(variable=[])

    def visit_Name(self, node):
        """Record the identifier of `node`, then continue into its children."""
        collected = self.stats["variable"]
        collected.append(node.id)
        self.generic_visit(node)

    def report(self):
        """Return the ``stats`` dictionary gathered so far."""
        return self.stats
    
def get_varname(code):
    """Parse `code` and return ``str()`` of the collected-names dictionary.

    The result is the textual form of ``{"variable": [...]}`` where the list
    holds the identifier of every ``Name`` node found in the parsed source.
    ``ast.parse`` raises ``SyntaxError`` when `code` is not valid Python.
    """
    tree = ast.parse(code)
    collector = VarnameCollector()
    # The tree root is not a Name node, so walking its children is enough.
    collector.generic_visit(tree)
    stats = collector.report()
    return str(stats)


In [4]:
# Levenshtein distance
def edit_distance(code1, code2):
    """Return the normalised Levenshtein similarity of two strings.

    Despite its name the function returns a similarity, not a distance:
    ``1 - distance / max(len(code1), len(code2))``. Identical strings give
    1.0 and strings with nothing in common give 0.0.

    Two empty strings are treated as identical (1.0) instead of raising
    ``ZeroDivisionError``.
    """
    longest = max(len(code1), len(code2))
    if longest == 0:
        # Both inputs are empty: nothing differs, and 0/0 must be avoided.
        return 1.0
    return 1 - Levenshtein.distance(code1, code2) / longest

In [5]:
def generate_str(ref, candidate, percent, error=False):
    """Build one ``<li>`` entry of the HTML result file.

    Parameters:
        ref: name of the first file (the only one shown for an error entry).
        candidate: name of the second file; ignored when `error` is True.
        percent: similarity in percent; shown rounded to two decimals.
        error: when exactly True, an "invalid file" entry for `ref` is
            returned instead of a comparison entry.

    Returns:
        The HTML fragment. Comparison entries are red from 70 %, blue from
        30 % and black below that; error entries are green.

    File names are HTML-escaped because they come from the file system and
    may contain '<', '>' or '&'.
    """
    if error is True:
        # The <span> is closed here; the previous template left it open.
        li_template = '''<li style="color:green;"><span>invalid file : {0}</span></li></br>'''
        return li_template.format(html.escape(str(ref), quote=False))

    li_template = '''<li style="color:{0};"><span>File 1 : {1}&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span>File 2 : {2}&nbsp;</span><span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;{3}%</span></li></br>'''
    if percent >= 70:
        color = 'red'
    elif percent >= 30:
        color = 'blue'
    else:
        color = 'black'
    return li_template.format(
        color,
        html.escape(str(ref), quote=False),
        html.escape(str(candidate), quote=False),
        round(percent, 2),
    )

In [6]:
def _unique_names(var_repr):
    """Return the distinct identifiers encoded in a ``get_varname`` result.

    ``get_varname`` returns ``str({"variable": [...]})``. Parsing that text
    back recovers the bare names. Splitting it on commas/whitespace instead
    leaves quotes and brackets glued to the tokens (``"['numpy'"``,
    ``"'plt']}"``), so the first and last names of a notebook never match
    the same names in another notebook.
    """
    return frozenset(ast.literal_eval(var_repr)["variable"])


if __name__ == "__main__":
    # The template used to generate the result file
    html_template = '''
        <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Plagiarism Detection-Edit Distance</title>
    </head>
    <body>
    <h1 style="text-align: center;">Plagiarism Detection-Edit Distance</h1>
    <ul>
      {0}
    </ul>
    </body>
    
    </html>
    '''
    # Locations and the similarity from which a pair is moved aside
    path = './data/Assignment 4/Assignment 4/last/high percentage/'
    result_file = './result/Assignment 4/Results of Comparison-Edit Distance.html'
    move_threshold = 0.9
    li_str = ''
    high_edit = path+'plagiarism'

    # Names used by the original (teacher) notebook; these are ignored when
    # comparing submissions because every student inherits them.
    ori_file = './original/Assignment 4/Assignment 4.ipynb'
    f_ori = getSourceWithoutTests(ori_file)
    var_ori = get_varname(f_ori)
    # Token view of var_ori; not used below but read by later exploratory cells.
    split_ori = frozenset(re.split(r'[;,\s]\s*',var_ori))
    names_ori = _unique_names(var_ori)

    # One call is enough; no need to re-check for every directory entry.
    os.makedirs(high_edit, exist_ok=True)

    # Read files (sorted so that runs are reproducible)
    filenames = sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.endswith('ipynb')
    )

    # For every parsable notebook, build the text that gets compared: the
    # sorted list of names that do not occur in the original notebook.
    # Sorting makes the text independent of set iteration order.
    compared_files = []
    var_names = []
    fingerprints = []
    for filename in filenames:
        try:
            var_repr = get_varname(getSourceWithoutTests(filename))
        except SyntaxError:
            # Code that does not parse cannot be compared; list it in the
            # report instead of aborting the whole run.
            li_str += generate_str(filename, '', 0, error=True)
            continue
        compared_files.append(filename)
        var_names.append(var_repr)
        fingerprints.append(str(sorted(_unique_names(var_repr) - names_ori)))

    # Similarity of every unordered pair of notebooks
    results = []
    for i, source_file in enumerate(compared_files):
        for j in range(i + 1, len(compared_files)):
            edit_dis = edit_distance(fingerprints[i], fingerprints[j])
            results.append([source_file, compared_files[j], edit_dis])

    # Sort, most similar first, and generate the HTML content
    sorted_results = sorted(results, key=lambda x: x[-1], reverse=True)
    for source_file, target_file, edit_dis in sorted_results:
        li_str += generate_str(source_file, target_file, edit_dis * 100)

    # Write results into HTML before touching any file, so the report
    # exists even if moving fails.
    os.makedirs(os.path.dirname(result_file), exist_ok=True)
    with open(result_file, 'w', encoding='utf-8') as f:
        f.write(html_template.format(li_str))

    # Move high percentage files to a new directory. A notebook can appear
    # in several flagged pairs, so collect each one once; moving a file that
    # was already moved would raise FileNotFoundError.
    flagged = []
    for source_file, target_file, edit_dis in results:
        if edit_dis >= move_threshold:
            for flagged_file in (target_file, source_file):
                if flagged_file not in flagged:
                    flagged.append(flagged_file)

    for flagged_file in flagged:
        try:
            shutil.move(flagged_file, high_edit)
        except shutil.Error as exc:
            # Raised when the destination already holds a file of that
            # name; leave the file where it is and keep going.
            print('not moved: {0} ({1})'.format(flagged_file, exc))


    
        

    
            

In [19]:
# Exploratory check: list the name tokens of two submissions that do not
# occur in the original notebook.
# NOTE: this cell reads `split_ori`, which is assigned in the main cell; run
# that cell first. The main cell builds it from the Assignment 4 original
# while the files below belong to Assignment 1, so `split_ori` must be
# rebuilt from the matching original for the differences to be meaningful.

# The paths carry no trailing blank: a literal ending in ' ' names a
# different file on filesystems that do not ignore trailing spaces.
test_file = './data/Assignment 1/Assignment 1/last/high percentage/dd3c0f.ipynb'
f_code = getSourceWithoutTests(test_file)
var_name = get_varname(f_code)
split_var = frozenset(re.split(r'[;,\s]\s*',var_name))

test_file2 = './data/Assignment 1/Assignment 1/last/high percentage/f49257.ipynb'
f_code2 = getSourceWithoutTests(test_file2)
var_name2 = get_varname(f_code2)
split_var2 = frozenset(re.split(r'[;,\s]\s*',var_name2))

# Tokens of each submission that are absent from the original notebook
var1 = str(frozenset.difference(split_var,split_ori))
var2 = str(frozenset.difference(split_var2,split_ori))

print(var1)
print(var2)
print(split_ori)
print(var_name2)
print(var_name)

frozenset({"'x'", "['numpy'", "'df_dx_3'", "'plt']}", "'dx'"})
frozenset({"'x'", "['numpy'", "'df_dx_3'", "'plt']}", "'dx'"})
frozenset({"'df_dx_analytical']}", "'df_dx_2'", "['xs'", "'xs'", "'numpy'", "'df_dx_analytical'", "'f'", "'df_dx_1'", "'plt'", "'df_analytic'", "'backward_difference'", "{'variable':"})
{'variable': ['numpy', 'x', 'numpy', 'x', 'f', 'x', 'f', 'x', 'dx', 'dx', 'xs', 'numpy', 'numpy', 'numpy', 'df_dx_1', 'backward_difference', 'f', 'xs', 'df_dx_2', 'backward_difference', 'f', 'xs', 'df_dx_3', 'backward_difference', 'f', 'xs', 'df_dx_analytical', 'df_analytic', 'xs', 'plt', 'plt', 'plt', 'plt', 'plt', 'xs', 'df_dx_1', 'df_dx_analytical', 'plt', 'xs', 'df_dx_2', 'df_dx_analytical', 'plt', 'xs', 'df_dx_3', 'df_dx_analytical', 'plt']}
{'variable': ['numpy', 'x', 'numpy', 'x', 'f', 'x', 'f', 'x', 'dx', 'dx', 'xs', 'numpy', 'numpy', 'numpy', 'df_dx_1', 'backward_difference', 'f', 'xs', 'df_dx_2', 'backward_difference', 'f', 'xs', 'df_dx_3', 'backward_difference', 'f', '

In [13]:
# Show the raw name string collected from the original notebook.
# `var_ori` is assigned in the main cell, so that cell must have run first.
print(var_ori)

{'variable': ['area_cb', 'get_area', 'r_cb', 'mass_cb', 'get_mass', 'r_cb', 'x', 'y', 'vx', 'vy', 'r', 'dx_t', 'dy_dt', 'dvx_dt', 'dvy_dt', 'numpy', 'dx_dt', 'dy_dt', 'dvx_dt', 'dvy_dt', 'history', 'numpy', 'n_steps', 'history', 'state_initial', 'history', 'y1', 'y2', 'print', 'all_xs', 'history', 'all_ys', 'history', 'negatives', 'numpy', 'all_ys', 'len', 'negatives', 'print', 'all_xs', 'index', 'negatives', 'y1', 'y2', 'all_ys', 'index', 'all_ys', 'index', 'x1', 'x2', 'all_xs', 'index', 'all_xs', 'index', 'find_zero_linear', 'x1', 'x2', 'y1', 'y2', 'r_test', 'h', 'solve_euler', 'r_test', 'numpy', 'find_range', 'h', 'n_steps', 'thetas', 'range', 'initial_conditions', 'values_euler', 'solve_euler', 'initial_conditions', 'n_steps', 'xs_euler', 'ys_euler', 'values_euler', 'values_euler', 'plt', 'xs_euler', 'ys_euler', 'plt', 'plt', 'n_steps', 'max_time', 'v0s', 'numpy', 'ranges', 'ranges_noresistance', 'theta', 'numpy']}
