# Using an LLM (CodeLlama-7b-Instruct-hf) to find errors in student code, give a completion score, and trace the students' thought process
## Reasons to use CodeLlama-7b-Instruct-hf:
## •Of the 7b, 13b, 34b and 70b models, the 70b model would ideally give the most accurate results, while the 34b model gives the best mix of speed and accuracy
## •We use the 7b model due to **computational resource limits** (20 GB GPU VRAM limit)

In [None]:
import warnings
warnings.filterwarnings('ignore')
from zipfile import ZipFile
import os
from pathlib import Path
import io
import transformers
import torch
import pandas as pd
import re
import random
from transformers import pipeline

# Initializing Model and Pipeline
## Initializing Text-Generation Pipeline 

In [None]:
# Hugging Face Hub identifier of the instruction-tuned 7B CodeLlama checkpoint
model = "codellama/CodeLlama-7b-Instruct-hf"

# Build the text-generation pipeline once for the whole notebook.
# float16 weights and device_map="auto" are requested to fit the GPU budget.
pipe = pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Making a function to convert each text file with timestamped code to a DataFrame with Relevant Details.
## •Aids in easier prompting.
## •Each Row is one Timestamp.
## •If a code has more than 5 snapshots, only 5 are used for the report (the first, the last, and 3 randomly sampled in between), due to the time limit and the large number of reports to be made.

In [None]:
def parse_file_with_details_to_df(name, content):
    """Parse a timestamped activecode log into a DataFrame, one row per snapshot.

    The text is split on the ``.. activecode::`` marker. Every block that
    contains both a ``:timestamp:`` field and a ``:code:`` marker becomes one
    row; blocks missing either of them are skipped.

    Args:
        name: Path/identifier of the source file. Not used by the parser; kept
            so existing callers keep working.
        content: Full text of the file.

    Returns:
        A DataFrame with the columns ``Timestamp`` (parsed with
        ``pd.to_datetime``), ``Language`` (``"Unknown"`` when absent),
        ``ActiveCode`` (``"N/A"`` when absent) and ``Code``. When more than 5
        snapshots are found, only the first, the last and 3 randomly sampled
        snapshots in between are kept, in their original order and with their
        original index labels.
    """
    code_marker = ':code:'
    # Compiled once, outside the per-block loop
    timestamp_re = re.compile(r':timestamp: ([\d-]+\s[\d:]+)')
    language_re = re.compile(r':language: (\w+)')
    activecode_re = re.compile(r'(\w+)\n')

    timestamps, codes, languages, activecodes = [], [], [], []

    # The first split element is whatever precedes the first activecode block
    for block in content.split('.. activecode::')[1:]:
        timestamp_match = timestamp_re.search(block)
        marker_pos = block.find(code_marker)

        # str.find returns -1 when the marker is missing; test for that
        # directly so a marker at offset 0 is not mistaken for "not found".
        if timestamp_match is None or marker_pos == -1:
            continue

        language_match = language_re.search(block)
        activecode_match = activecode_re.search(block)

        timestamps.append(timestamp_match.group(1))
        # Everything after the marker is the code, minus surrounding whitespace
        codes.append(block[marker_pos + len(code_marker):].strip())
        languages.append(language_match.group(1) if language_match else "Unknown")
        activecodes.append(activecode_match.group(1) if activecode_match else "N/A")

    # Creating a DataFrame with the extracted details
    df = pd.DataFrame({
        'Timestamp': pd.to_datetime(timestamps),
        'Language': languages,
        'ActiveCode': activecodes,
        'Code': codes
    })

    if len(df) > 5:
        last = len(df) - 1
        # Keep first and last snapshot plus 3 random ones strictly in between
        middle = sorted(random.sample(range(1, last), k=3))
        df = df.iloc[[0, *middle, last]]

    return df

# **NEXT FUNCTION PERFORMS THE FOLLOWING :** 

# THEME/QUESTION GUESSING (to give LLM idea on how to evaluate student codes)
## •Since the questions corresponding to the solutions are not available, I am using the model to guess the question from the solution and then using that guess for evaluation
## •If the Questions were made available (since they are already present in a database), the report would be more robust

## Initializing 'History' : Passes last few inference outputs to the LLM as part of prompt to give idea about sequential codes
## Initializing 'flow' : Will be converted to a Report in .txt format

# ERROR FINDING ON THE BASIS OF THE QUESTION
## •We evaluate the code and students performance
## •For now, we allow the model to give report in its own way since no report format is mentioned
## •Ideally, if a given format is decided on, we can fine-tune the LLM or use multi shot prompting to give outputs in desired format

# Making an automated Report & Saving in a Reports Folder

In [None]:
def _report_path_for(name):
    """Map a source file path to the path of its report inside ``Reports``."""
    # Drop the first "/"-separated component of the source path (for an
    # absolute path that is the empty string before the leading slash).
    relative = "/".join(name.split("/")[1:]).lstrip("/")
    if not relative:
        # Bare file name without any directory part
        relative = os.path.basename(name)
    # os.path.join inserts the separator that plain concatenation left out
    return os.path.join("Reports", relative)


def process_reports(name, content):
    """Generate and save an LLM review report for one student submission log.

    The log is parsed with ``parse_file_with_details_to_df``. The module-level
    ``pipe`` is first asked to guess the question from the first snapshot, then
    asked to review every snapshot in turn. From the second snapshot on, the
    reviews of the previous two snapshots are passed along as history. The
    collected text is written to a file under the ``Reports`` folder.

    Args:
        name: Path of the source file using "/" separators; determines where
            the report is saved.
        content: Full text of the source file.

    Returns:
        None. If the file contains no parsable snapshot, no report is written.
    """
    # Using function to convert .txt file to a dataframe for easier processing
    df = parse_file_with_details_to_df(name, content)

    # Without a single snapshot there is nothing to guess a question from
    if df.empty:
        print(f"No snapshots found, skipping report for: {name}")
        return

    first_code = df['Code'].iloc[0]

    # Prompting for Question Generation
    prompt = '''<s>[INST]
    {{ '''+ first_code + '''
       find the question this code is trying to solve:

    }} [/INST]'''

    # NOTE(review): max_length appears to count the prompt tokens as well, so
    # very long snapshots may leave little room for the answer; consider
    # max_new_tokens - confirm against the transformers generation docs.
    result = pipe(prompt, max_length=1000, return_full_text=False)

    # Saving the Question generated as theme of the problem
    theme = result[0]['generated_text']

    history_list = []

    # Documenting the whole coding process of the student in 'flow_parts'
    flow_parts = [
        "CODE TOPIC : " + str(df['ActiveCode'].iloc[0]) + "\nQUESTION : " + theme + "\n \n"
    ]

    # Looping through the different Timestamps in the DataFrame to find and
    # document progress of Student
    for position, (index, row) in enumerate(df.iterrows()):
        timestamp = row['Timestamp']
        language = row['Language']
        code = row['Code']
        # The index label is the snapshot's position in the original file
        timestep = str(index)

        # Only the two most recent reviews are fed back to the model
        history = ' '.join(history_list[-2:])

        prompt_head = '''<s><<SYS>>Help find any issues with the students thinking process \n<</SYS>>[INST] Given Question :''' + theme +'''Language:'''+ language + '''Timestep :''' + timestep
        prompt_tail = code + '''explain errors in the Student's new code trying to solve the given question and give him an overall score out of 100. I dont need corrected code:}} [/INST]'''

        if position == 0:
            # First snapshot: there is no earlier review to refer to
            prompt = prompt_head + '''\n''' + prompt_tail
        else:
            prompt = prompt_head + ''' here is a list of errors the student was previously facing at different given timestamps: }} [/INST] {{''' + history + '''}} </s><s>[INST] {{Does this new code resolve any of the previous issues :''' + prompt_tail

        # Generating text/code based on the solution prompt
        result = pipe(prompt, max_length=5000, return_full_text=False, temperature = 0.7, top_p = 0.95, top_k = 250, do_sample = True)

        generated = result[0]['generated_text']

        # Extracting the generated Message and putting it into a meaningful form
        history_list.append("Timestep :" + timestep + "\nIssues : \n" + generated)
        flow_parts.append(
            "\n" + "Timestamp :" + str(timestamp) + "\nCode : \n" + code + "\nIssues : \n" + generated
        )

    # Defining Path for saving Reports
    report_path = _report_path_for(name)

    # Creating necessary directories for saving reports
    os.makedirs(os.path.dirname(report_path), exist_ok=True)

    # Writing the processed report; UTF-8 to match how the input is read
    with open(report_path, 'w', encoding='utf-8') as file:
        file.write("".join(flow_parts))

In [None]:
# Root directory that is walked recursively for the .txt submission logs
# (the /kaggle/input prefix suggests a Kaggle input dataset)
directory_path = '/kaggle/input/psychic-invention/psychic-invention-main'


#Function to initiate processing of all .txt files Given
def process_txt_files_in_directory(directory_path):
    """Generate a report for every ``.txt`` file found under *directory_path*.

    The directory tree is walked with ``os.walk``. Each file whose name ends
    in ``.txt`` is read as UTF-8 and handed to ``process_reports`` together
    with its POSIX-style path; a confirmation line is printed afterwards.

    Args:
        directory_path: Root directory to search.
    """
    for current_dir, _subdirs, filenames in os.walk(directory_path):
        for filename in filenames:
            # Anything that is not a .txt file is ignored
            if not filename.endswith('.txt'):
                continue

            txt_path = Path(current_dir) / filename

            with open(txt_path, 'r', encoding='utf-8') as handle:
                text = handle.read()
                # Hand the extracted text over for report generation
                process_reports(txt_path.as_posix(), text)
                print(f"Processed file: {txt_path}")

# Run the whole job: every .txt file found under directory_path is passed to
# process_reports, one file at a time
process_txt_files_in_directory(directory_path)

In [None]:
# Shell command: recursively archive everything under /kaggle/working into REPORTS.zip
!zip -r REPORTS.zip /kaggle/working

In [None]:

from IPython.display import FileLink
# Show a link to REPORTS.zip in the cell output so the archive can be downloaded
FileLink(r'REPORTS.zip')