In [1]:
import os
import glob
import re
import csv
import shutil

import pandas as pd

# Display settings: never truncate the columns or rows of a rendered frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Sub-folders of "Puzzles/" that are scanned for log files.
input_files = ['0', 'SelectionF15', 'SelectionS16']
# Folder that receives the generated CSV file.
output = "Puzzles_CSV"

# Initial state of the flag that controls whether log lines are collected.
extract = False


In [2]:
# Labelling rules, checked in order: [label, keyword, keyword, ...].
# Every keyword must be written in lower case, because labeling() searches
# for it in the lower-cased log line ("input.nextLine()" could never match).
label_dict = [
    ["P", "{", "}"],
    ["O", "system.out.print", "system.out.println", "system.out.printf", "cout"],
    ["I", "stdin", "scanner", "input.nextline()", "cin"],
    ["V", "short", "int", "char", "unsigned", "double", "float", "long"],
    ["F", "if"],
    ["E", "else"],
    ["S", "submit"],
]

# Prefixes of the header / summary lines of a log file.
substr_tuple = ('Since you quit', 'You took', 'Time spent', 'The Problem:', 'Here is a summary',  "Using Template")

# Markers of the log lines that describe a student action.
substr_tuple_2 = ('Moved from', 'Submit', 'Reordered')

# The six possible move directions between problem, solution and trash.
substr_tuple_3 = ('from solution to trash',
                  'from problem to trash',
                  'from solution to problem',
                  'from trash to problem',
                  'from problem to solution',
                  'from trash to solution')

In [3]:
def obtain_steps(l):
    """Read the step count and the solution length from a summary line.

    Args:
        l: Text containing both ``You took <n> steps`` and
            ``containing <m> lines of code``.

    Returns:
        The tuple ``(n, m)`` as integers.

    Raises:
        AttributeError: If either phrase is missing from ``l``.
        ValueError: If a captured value is not an integer.
    """
    step_match = re.search('You took (.+?) steps', l)
    code_match = re.search('containing (.+?) lines of code', l)

    steps_taken = int(step_match.group(1))
    lines_of_code = int(code_match.group(1))
    return steps_taken, lines_of_code
        
def obtain_time(l):
    """Read the number of seconds from a ``Time spent on this problem`` line.

    Args:
        l: Text containing ``Time spent on this problem: <n> seconds``.

    Returns:
        ``n`` as an integer.

    Raises:
        AttributeError: If the phrase is missing from ``l``.
        ValueError: If the captured value is not an integer.
    """
    match = re.search('Time spent on this problem: (.+?) seconds', l)
    seconds = match.group(1)
    return int(seconds)

In [4]:
def find_using_os_walk(path, path_list, name_list, puzzle_list):
    """Collect every non-hidden file found below ``path``.

    For each file, three values are appended to the lists passed in: its
    full path, its file name, and the folder that contains it expressed
    relative to ``path`` (the puzzle name).

    Args:
        path: Root folder to walk.
        path_list: List that receives the file paths.
        name_list: List that receives the file names.
        puzzle_list: List that receives the relative folder of each file;
            an empty string for files located directly in ``path``.

    Returns:
        The tuple ``(path_list, name_list, puzzle_list)``, i.e. the same
        list objects that were passed in.
    """
    for root, _, files in os.walk(path):
        # relpath works for files sitting directly in ``path``, for a
        # ``path`` with a trailing separator and for any OS separator,
        # all of which made the former string split raise IndexError.
        puzzle = os.path.relpath(root, path)
        if puzzle == os.curdir:
            puzzle = ''

        for file_name in files:
            # ignore hidden files like .DS_store
            if file_name.startswith('.'):
                continue

            puzzle_list.append(puzzle)
            name_list.append(file_name)
            path_list.append(os.path.join(root, file_name))

    return path_list, name_list, puzzle_list



In [5]:
def labeling(l, labels=None):
    """Return the label of the first rule that has a keyword occurring in ``l``.

    The comparison is case-insensitive on both sides, so a keyword written
    with capital letters (e.g. ``input.nextLine()``) still matches.

    Args:
        l: The text to classify.
        labels: Optional list of rules, each shaped
            ``[label, keyword, keyword, ...]`` and checked in order.
            Defaults to the module-level ``label_dict``.

    Returns:
        The label of the first matching rule, or ``"Z"`` when no rule matches.
    """
    if labels is None:
        labels = label_dict

    # Lower-case the text once instead of once per keyword.
    lowered = l.lower()

    for label, *keywords in labels:
        if any(keyword.lower() in lowered for keyword in keywords):
            return label

    return "Z"

def substring(l):
    """Return the part of ``l`` between its first ``:`` and its first newline.

    Args:
        l: A single log line, with or without a trailing newline.

    Returns:
        The text after the first colon (the whole line when there is no
        colon), cut at the first newline.  Leading whitespace is kept.
    """
    start = l.find(":") + len(":")
    end = l.find("\n")
    if end == -1:
        # No newline (e.g. the last line of a file): keep everything.
        # Slicing with -1 would silently drop the final character.
        end = len(l)

    return l[start:end]

def action_clean(action_list):
    """Keep only the entries that contain one of the ``substr_tuple_2`` markers.

    Args:
        action_list: Iterable of log lines.

    Returns:
        A new list with the matching lines, in their original order.
    """
    def _is_action(entry):
        for marker in substr_tuple_2:
            if marker in entry:
                return True
        return False

    return [entry for entry in action_list if _is_action(entry)]

In [6]:
def form_sequence(a):
    """Encode a list of action log lines as a string with one label per line.

    Each line starts with the label returned by ``labeling``.  The label is
    then overridden according to the action described before the first
    colon: ``'R'`` for a reorder or for a move that undoes an earlier one,
    ``'T'`` for a move to the trash.  Finally, every remaining label whose
    code line is still in the trash at the end (lines containing a brace
    excepted) is lower-cased to mark it as a distractor.

    Args:
        a: List of log lines, as produced by ``action_clean``.

    Returns:
        The concatenated labels, one character per entry of ``a``.
    """
    # Plain lists are used instead of a DataFrame: values written to the
    # rows yielded by DataFrame.iterrows() are not guaranteed to reach the
    # frame, so the overrides below could be silently lost.
    actions = [line.partition(':')[0] for line in a]
    code_lines = [substring(line) for line in a]
    values = [labeling(line) for line in a]

    trash_list = []
    from_solution = []

    for pos, (action, code) in enumerate(zip(actions, code_lines)):
        if substr_tuple_2[2] in action:
            values[pos] = 'R'

        # from solution to trash
        elif substr_tuple_3[0] in action:
            from_solution.append(code)
            trash_list.append(code)
            values[pos] = 'T'

        # from problem to trash
        elif substr_tuple_3[1] in action:
            trash_list.append(code)
            values[pos] = 'T'

        # from solution to problem
        elif substr_tuple_3[2] in action:
            from_solution.append(code)
            values[pos] = 'R'

        # from trash to problem
        elif substr_tuple_3[3] in action:
            # Guarded like the branches below: an unknown line must not
            # abort the whole file with a ValueError.
            if code in trash_list:
                trash_list.remove(code)
            values[pos] = 'R'

        # from problem to solution
        elif substr_tuple_3[4] in action:
            if code in from_solution:
                from_solution.remove(code)
                values[pos] = 'R'

        # from trash to solution
        elif substr_tuple_3[5] in action:
            if code in trash_list:
                trash_list.remove(code)

                if code in from_solution:
                    from_solution.remove(code)
                    values[pos] = 'R'

    # Eliminate '{}' from trash list to avoid confusing the distractors
    distractors = {code for code in trash_list
                   if '{' not in code and '}' not in code}

    # Determine 'Distractors'
    return ''.join(
        value.lower() if code in distractors and value not in ('T', 'R') else value
        for code, value in zip(code_lines, values)
    )

In [7]:
columns_names = ['File Path', 'Puzzle', 'Student']

# Gather the log files of every semester into shared lists and build the
# frame once.  The former per-semester DataFrame.append discarded the
# first semester (its result was never assigned) and no longer exists in
# pandas 2.
path_list, name_list, puzzle_list = [], [], []

for semester in input_files:
    find_using_os_walk('Puzzles/%s'% (semester), path_list, name_list, puzzle_list)

df = pd.DataFrame(
    {'File Path': path_list, 'Student': name_list, 'Puzzle': puzzle_list},
    columns=columns_names,
)

# One entry per file, in the same order as the rows of df.
steps_column = []
lines_column = []
time_column = []
sequence_column = []

for path in df['File Path']:
    incomplete_flag = 0

    # Reset for every file so that a log without its summary lines cannot
    # inherit the values parsed from the previous file.
    step, total_line, seconds = 0, 0, 0
    actions = []
    extract = True

    with open(path, "rt") as in_file:
        for line in in_file:
            # Check if the puzzle was completed.
            if line.startswith(substr_tuple[0]):
                incomplete_flag = 1

            if line.startswith(substr_tuple[1]):
                step, total_line = obtain_steps(line)

            if line.startswith(substr_tuple[2]):
                seconds = obtain_time(line)

            # Lines are collected until the summary section starts.
            if line.startswith(substr_tuple[4]):
                extract = False

            elif extract:
                actions.append(line)

    if incomplete_flag == 0:
        # Extract only solution and problem
        seq = form_sequence(action_clean(actions))
    else:
        # No sequence: the row is removed below.
        seq = None

    steps_column.append(step)
    lines_column.append(total_line)
    time_column.append(seconds)
    sequence_column.append(seq)

df = df.assign(
    steps=steps_column,
    lines=lines_column,
    time=time_column,
    sequence=sequence_column,
)

# Clear those rows with no sequence.
df = df[df["sequence"].notna()].reset_index(drop=True)

In [8]:
# Rows that belong to puzzle_1.
df_2 = df.loc[df['Puzzle'] == 'puzzle_1']

# Start from an empty output folder on every run.
if os.path.exists(output):
    shutil.rmtree(output)
os.makedirs(output, exist_ok=True)

df_2.to_csv("%s/Puzzle_1.csv" % output, index=False)

In [9]:
# def processedString(str1, removedChar):
#     transStr1 = str1.maketrans('','', removedChar)
#     str1 = str1.translate(transStr1)

#     return str1

In [10]:
# df_2 = df
# index = 0
# new_column = ["T", "R", "Distractor", "Offset", "New_seq", "New_steps"]
# df_2 = df_2.reindex(columns=[*df_2.columns.tolist(), *new_column], fill_value = 0)


# for st in df['sequence']:
#     num_T = 0
#     num_R = 0
#     num_D = 0
#     num_offset = 0
    
#     for ch in st:
#         if ch == 'T':
#             num_T += 1
#         elif ch == 'R':
#             num_R += 1
#         elif ch.islower():
#             num_D += 1
        
#     num_offset = 16+num_R
    
#     df_2.loc[index, 'T'] = num_T
#     df_2.loc[index, 'R'] = num_R
#     df_2.loc[index, 'Distractor'] = num_D
#     df_2.loc[index, 'Offset'] = num_offset
    
#     new_seq = processedString(st, 'SRMTP')
#     new_steps = len(new_seq)
    
#     df_2.loc[index, 'New_seq'] = new_seq
#     df_2.loc[index, 'New_steps'] = new_steps
        
#     index = index + 1
    
            
        

In [11]:
# df_2[df_2['Puzzle'] == 'puzzle_1']

In [8]:
n = 5
l = [1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5]

# Number of occurrences in ``l`` of each value from 1 to n.
test = [l.count(value) for value in range(1, n + 1)]


test

[1, 2, 3, 2, 3]

In [24]:
import pandas as pd

data = {
    'Language': ['Python', 'Python', 'Javascript', 'C#', 'PHP'],
    'University': ['LiU', 'LiU', 'UmU', 'GU', 'UmU'],
    'Age': [1, 1, 3, 2, 1],
}

df3 = pd.DataFrame(data)

# Number of rows per distinct age, most frequent first.
list(df3['Age'].value_counts())

[3, 1, 1]