<a href="https://colab.research.google.com/github/abhijeetraj22/OMK-Project/blob/main/OMK_Marksheet10_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import pandas as pd

# Input / output locations (Colab upload folder)
file_path = '/content/RAW DATA-12.TXT'
output_file = '/content/CBSE_Result_Extracted.xlsx'

# Define regex for header/skip lines
skip_patterns = [
    r'^DATE:-', r'^ROLL\s+F', r'^NO\s+L', r'^-+$',
    r'^SCHOOL\s+:\s+-', r'^TOTAL CANDIDATES', r'^\f'
]


def parse_class12_records(lines, patterns=None, max_subjects=7):
    """Convert raw result-file lines into one dict per candidate.

    A candidate occupies two consecutive lines: a header line that starts
    with an 8-digit roll number, followed by a line of whitespace-separated
    ``mark grade`` tokens.

    Args:
        lines: Raw text lines (with or without trailing newlines).
        patterns: Regexes identifying lines to ignore; defaults to the
            module-level ``skip_patterns``.
        max_subjects: Number of ``SubN Marks`` / ``SubN Grade`` columns to
            emit. Missing subjects are padded with empty strings; extra
            subjects are ignored.

    Returns:
        A list of dicts keyed by "Roll No", "Gender", "Stream", "Name",
        "Result", then "Sub1 Marks".."SubN Marks" and
        "Sub1 Grade".."SubN Grade" (in that order).
    """
    if patterns is None:
        patterns = skip_patterns

    records = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Skip blank lines and page/header lines
        if line == '' or any(re.match(pat, line) for pat in patterns):
            i += 1
            continue

        # Anything that does not start with a roll number is not a record
        if not re.match(r'^\d{8}', line):
            i += 1
            continue

        # Columns are separated by runs of two or more spaces; the first
        # chunk holds roll/gender/stream/name, the last chunk the result.
        parts = re.split(r'\s{2,}', line)
        result = parts[-1]

        # NOTE(review): the third group is labelled "Stream" but simply
        # captures the first upper-case word after the gender flag - confirm
        # the raw file really carries a stream column before relying on it.
        header = re.match(r'^(\d{8})\s+([MF])\s+([A-Z]*)\s+(.*)$', parts[0].strip())
        if header is None:
            # Unparseable header: drop it together with its marks line
            i += 2
            continue

        roll_no, gender, stream, name = header.groups()
        name = name.strip()

        # Next line: marks and grades. A header on the very last line of the
        # file has no marks line; treat that as "no marks" instead of
        # indexing past the end of the list.
        marks_line = lines[i + 1].strip() if i + 1 < len(lines) else ''
        tokens = marks_line.split()

        # Pair tokens up as (mark, grade); a dangling mark gets an empty grade
        subjects = [
            (tokens[j], tokens[j + 1] if j + 1 < len(tokens) else '')
            for j in range(0, len(tokens), 2)
        ]

        # Pad so every record exposes the same set of columns
        subjects.extend([('', '')] * (max_subjects - len(subjects)))

        record = {
            "Roll No": roll_no,
            "Gender": gender,
            "Stream": stream,
            "Name": name,
            "Result": result,
        }
        record.update({f"Sub{k + 1} Marks": subjects[k][0] for k in range(max_subjects)})
        record.update({f"Sub{k + 1} Grade": subjects[k][1] for k in range(max_subjects)})
        records.append(record)
        i += 2

    return records


# Guarded so the parser above can be imported and exercised without the
# Colab input file; inside a notebook kernel __name__ is "__main__", so the
# cell still loads, parses and exports exactly as before.
if __name__ == "__main__":
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    records = parse_class12_records(lines)

    # Create DataFrame and export to Excel
    df = pd.DataFrame(records)
    df.to_excel(output_file, index=False)

    print(f"✅ Data extracted and saved to: {output_file}")


✅ Data extracted and saved to: /content/CBSE_Result_Extracted.xlsx


In [3]:
import pandas as pd
import re

# Class-10 raw result text (Colab upload location)
file_path = '/content/RAW DATA-10.TXT'
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

# Lines matching any of these are page furniture, not candidate data
skip_lines = [
    r'^DATE:-', r'^ROLL\s+F', r'^NO\s+L', r'^-+$',
    r'^SCHOOL\s+:\s+-', r'^TOTAL CANDIDATES', r'^\f', r'^$', r'^REGION:'
]

# Walk the file looking at each line together with its successor; a valid
# candidate is a roll-number line followed by a marks/grades line.
data_rows = []
cursor = 0
last_pair_start = len(lines) - 1
while cursor < last_pair_start:
    first = lines[cursor].strip()
    second = lines[cursor + 1].strip()

    # Default: slide the window by one line
    step = 1

    touches_header = any(
        re.match(p, text) for text in (first, second) for p in skip_lines
    )
    if (
        not touches_header
        and re.match(r'^\d{8}', first)
        and re.search(r'\d{2,3} [A-E1-9]{1,2}', second)
    ):
        # Keep both lines verbatim in a single cell and consume the pair
        data_rows.append([first + "\n" + second])
        step = 2

    cursor += step

# One-column frame, one candidate per row
df = pd.DataFrame(data_rows, columns=["Student Result"])

# Write the workbook
output_file = "/content/Formatted_CBSE_Output.xlsx"
df.to_excel(output_file, index=False)

print(f"✅ Output saved to: {output_file}")


✅ Output saved to: /content/Formatted_CBSE_Output.xlsx


In [4]:
import re
import pandas as pd

# Input / output locations (Colab upload folder)
file_path = '/content/RAW DATA-10.TXT'
output_file = '/content/CBSE_Result_No_Subject_Codes.xlsx'

# Skip these lines
skip_patterns = [
    r'^DATE:-', r'^ROLL\s+F', r'^NO\s+L', r'^-+$',
    r'^SCHOOL\s+:\s+-', r'^TOTAL CANDIDATES', r'^\f', r'^$', r'^REGION:'
]

# Column headers. Each row is assembled as mark1, grade1, mark2, grade2, ...
# so the headers must be interleaved the same way; listing all Marks columns
# before all Grade columns would put grades under "MarksN" headings.
columns = (
    ['Roll No', 'Gender', 'Name', 'Result'] +
    [label for n in range(1, 7) for label in (f'Marks{n}', f'Grade{n}')]
)


def parse_class10_rows(lines, patterns=None):
    """Extract one flat row per candidate from raw class-10 result lines.

    A candidate is a line starting with an 8-digit roll number that is
    immediately followed by a line containing ``mark grade`` pairs.
    Candidates whose result is ``ABST`` or who have fewer than six
    mark/grade pairs are omitted.

    Args:
        lines: Raw text lines (with or without trailing newlines).
        patterns: Regexes identifying lines to ignore; defaults to the
            module-level ``skip_patterns``.

    Returns:
        A list of rows laid out as
        ``[roll_no, gender, name, result, mark1, grade1, ..., mark6, grade6]``,
        i.e. in the same order as ``columns``.
    """
    if patterns is None:
        patterns = skip_patterns

    rows = []
    i = 0
    # Stop one short of the end: every step inspects lines[i] and lines[i + 1]
    while i < len(lines) - 1:
        line1 = lines[i].strip()
        line2 = lines[i + 1].strip()

        # If either line of the window is page furniture, slide by one
        if any(re.match(p, line1) for p in patterns) or any(re.match(p, line2) for p in patterns):
            i += 1
            continue

        if not (re.match(r'^\d{8}', line1) and re.search(r'\d{2,3} [A-E1-9]{1,2}', line2)):
            i += 1
            continue

        # From here on the pair is treated as one candidate and consumed whole
        i += 2

        match = re.match(r'^(\d{8})\s+([MF])\s+(.*?)\s{2,}', line1)
        if not match:
            continue

        roll_no = match.group(1)
        gender = match.group(2)
        name = match.group(3).strip()

        result_match = re.search(r'(PASS|FAIL|COMP|ABST)', line1)
        result = result_match.group(1) if result_match else ""

        if result == 'ABST':
            continue

        # Pair up at most the first six (mark, grade) tokens
        mg = line2.split()
        subjects = [(mg[j], mg[j + 1]) for j in range(0, min(len(mg) - 1, 11), 2)]
        if len(subjects) < 6:
            continue

        row = [roll_no, gender, name, result]
        for mark, grade in subjects[:6]:
            row += [mark, grade]
        rows.append(row)

    return rows


# Guarded so the parser above can be imported and exercised without the
# Colab input file; inside a notebook kernel __name__ is "__main__", so the
# cell still loads, parses and exports as before.
if __name__ == "__main__":
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    data = parse_class10_rows(lines)

    # Save to Excel
    df = pd.DataFrame(data, columns=columns)
    df.to_excel(output_file, index=False)

    print("✅ File saved as:", output_file)


✅ File saved as: /content/CBSE_Result_No_Subject_Codes.xlsx


In [5]:
import re
import pandas as pd

# Class-10 raw result text (Colab upload location)
file_path = '/content/RAW DATA-10.TXT'
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

# Page furniture: dates, column headings, rules, school/region banners, blanks
skip_patterns = [
    r'^DATE:-', r'^ROLL\s+F', r'^NO\s+L', r'^-+$',
    r'^SCHOOL\s+:\s+-', r'^TOTAL CANDIDATES', r'^\f', r'^$', r'^REGION:'
]

data = []
cursor = 0

# Each step looks at a line and its successor, hence the "- 1" bound
while cursor < len(lines) - 1:
    head = lines[cursor].strip()
    marks_line = lines[cursor + 1].strip()

    # Slide by one when either line of the window is page furniture
    if any(re.match(p, text) for text in (head, marks_line) for p in skip_patterns):
        cursor += 1
        continue

    # Slide by one when the window is not a roll-number line + marks line
    if not (re.match(r'^\d{8}', head) and re.search(r'\d{2,3} [A-E1-9]{1,2}', marks_line)):
        cursor += 1
        continue

    # Every outcome below consumes both lines of the pair
    cursor += 2

    parsed = re.match(r'^(\d{8})\s+([MF])\s+(.*?)\s{2,}', head)
    if parsed is None:
        continue

    roll_no = parsed.group(1)
    gender = parsed.group(2)
    name = parsed.group(3).strip()

    verdict = re.search(r'(PASS|FAIL|COMP|ABST)', head)
    result = verdict.group(1) if verdict else ""

    # Absent candidates are left out of the sheet
    if result == 'ABST':
        continue

    # Pair tokens as (mark, grade); the bound of 11 caps this at six pairs
    tokens = marks_line.split()
    subjects = [(tokens[j], tokens[j + 1]) for j in range(0, min(len(tokens) - 1, 11), 2)]

    # Candidates with fewer than six subjects are left out as well
    if len(subjects) < 6:
        continue

    row = [roll_no, gender, name]
    for pair in subjects[:6]:
        row.extend(pair)
    row.append(result)

    data.append(row)

# Headers follow the row layout: identity, interleaved marks/grades, result
columns = (
    ['Roll No', 'Gender', 'Name'] +
    [label for n in range(1, 7) for label in (f'Marks{n}', f'Grade{n}')] +
    ['Result']
)

# Build the frame and write the workbook
df = pd.DataFrame(data, columns=columns)
output_path = '/content/CBSE_Formatted_Result_Final.xlsx'
df.to_excel(output_path, index=False)

print("✅ File saved as:", output_path)


✅ File saved as: /content/CBSE_Formatted_Result_Final.xlsx


In [6]:
import re
import pandas as pd

# Input / output locations (Colab upload folder)
file_path = '/content/RAW DATA-10.TXT'
output_path = '/content/CBSE_Final_With_SubjectCodes.xlsx'

# Skip patterns
skip_patterns = [
    r'^DATE:-', r'^ROLL\s+F', r'^NO\s+L', r'^-+$',
    r'^SCHOOL\s+:\s+-', r'^TOTAL CANDIDATES', r'^\f', r'^$', r'^REGION:'
]

# Column headers. Rows carry the six subject codes first and then
# mark1, grade1, mark2, grade2, ... so the Marks/Grade headers must be
# interleaved; grouping all Marks before all Grades would mislabel the data.
columns = (
    ['Roll No', 'Gender', 'Name'] +
    [f'Sub{n}' for n in range(1, 7)] +
    [label for n in range(1, 7) for label in (f'Marks{n}', f'Grade{n}')] +
    ['Result']
)


def parse_class10_rows_with_codes(lines, patterns=None):
    """Extract one flat row per candidate, including subject codes.

    A candidate is a line starting with an 8-digit roll number that is
    immediately followed by a line containing ``mark grade`` pairs.
    Candidates are omitted when their result is ``ABST``, when fewer than
    six 2-3 digit subject codes are found on the header line, or when fewer
    than six mark/grade pairs are found on the marks line.

    Args:
        lines: Raw text lines (with or without trailing newlines).
        patterns: Regexes identifying lines to ignore; defaults to the
            module-level ``skip_patterns``.

    Returns:
        A list of rows laid out as ``[roll_no, gender, name, code1..code6,
        mark1, grade1, ..., mark6, grade6, result]``, i.e. in the same order
        as ``columns``.
    """
    if patterns is None:
        patterns = skip_patterns

    rows = []
    i = 0
    # Stop one short of the end: every step inspects lines[i] and lines[i + 1]
    while i < len(lines) - 1:
        line1 = lines[i].strip()
        line2 = lines[i + 1].strip()

        # If either line of the window is page furniture, slide by one
        if any(re.match(p, line1) for p in patterns) or any(re.match(p, line2) for p in patterns):
            i += 1
            continue

        if not (re.match(r'^\d{8}', line1) and re.search(r'\d{2,3} [A-E1-9]{1,2}', line2)):
            i += 1
            continue

        # From here on the pair is treated as one candidate and consumed whole
        i += 2

        match = re.match(r'^(\d{8})\s+([MF])\s+(.*?)\s{2,}', line1)
        if not match:
            continue

        roll_no = match.group(1)
        gender = match.group(2)
        name = match.group(3).strip()

        # Subject codes from line1: stand-alone 2-3 digit numbers (the
        # 8-digit roll number cannot match because of the word boundaries)
        subject_codes = re.findall(r'\b\d{2,3}\b', line1)
        result_match = re.search(r'(PASS|FAIL|COMP|ABST)', line1)
        result = result_match.group(1) if result_match else ""

        if result == 'ABST' or len(subject_codes) < 6:
            continue

        # Marks & grades: pair up at most the first six (mark, grade) tokens
        mg = line2.split()
        subjects = [(mg[j], mg[j + 1]) for j in range(0, min(len(mg) - 1, 11), 2)]
        if len(subjects) < 6:
            continue

        row = [roll_no, gender, name] + subject_codes[:6]
        for mark, grade in subjects[:6]:
            row += [mark, grade]
        row.append(result)
        rows.append(row)

    return rows


# Guarded so the parser above can be imported and exercised without the
# Colab input file; inside a notebook kernel __name__ is "__main__", so the
# cell still loads, parses and exports as before.
if __name__ == "__main__":
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    data = parse_class10_rows_with_codes(lines)

    # Save to Excel
    df = pd.DataFrame(data, columns=columns)
    df.to_excel(output_path, index=False)

    print("✅ File saved:", output_path)


✅ File saved: /content/CBSE_Final_With_SubjectCodes.xlsx


In [10]:
import re
import pandas as pd

# Class-10 raw result text (Colab upload location)
file_path = '/content/RAW DATA-10.TXT'
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

# Page furniture: dates, column headings, rules, school/region banners, blanks
skip_patterns = [
    r'^DATE:-', r'^ROLL\s+F', r'^NO\s+L', r'^-+$',
    r'^SCHOOL\s+:\s+-', r'^TOTAL CANDIDATES', r'^\f', r'^$', r'^REGION:'
]

data = []
cursor = 0

# Each step looks at a line and its successor, hence the "- 1" bound
while cursor < len(lines) - 1:
    head = lines[cursor].strip()
    marks_line = lines[cursor + 1].strip()

    # Slide by one when either line of the window is page furniture
    if any(re.match(p, text) for text in (head, marks_line) for p in skip_patterns):
        cursor += 1
        continue

    # Slide by one when the window is not a roll-number line + marks line
    if not (re.match(r'^\d{8}', head) and re.search(r'\d{2,3} [A-E1-9]{1,2}', marks_line)):
        cursor += 1
        continue

    # Every outcome below consumes both lines of the pair
    cursor += 2

    parsed = re.match(r'^(\d{8})\s+([MF])\s+(.*?)\s{2,}', head)
    if parsed is None:
        continue

    roll_no = parsed.group(1)
    gender = parsed.group(2)
    name = parsed.group(3).strip()

    # Stand-alone 2-3 digit numbers on the header line are the subject codes
    subject_codes = re.findall(r'\b\d{2,3}\b', head)
    verdict = re.search(r'(PASS|FAIL|COMP|ABST)', head)
    result = verdict.group(1) if verdict else ""

    # Leave out absentees and candidates with fewer than six subject codes
    if result == 'ABST' or len(subject_codes) < 6:
        continue

    # Pair tokens as (mark, grade); the bound of 11 caps this at six pairs
    tokens = marks_line.split()
    subjects = [(tokens[j], tokens[j + 1]) for j in range(0, min(len(tokens) - 1, 11), 2)]
    if len(subjects) < 6:
        continue

    row = [roll_no, gender, name]
    row.extend(subject_codes[:6])
    for pair in subjects[:6]:
        row.extend(pair)
    row.append(result)

    data.append(row)

# Headers follow the row layout: identity, codes, interleaved marks/grades, result
columns = (
    ['Roll No', 'Gender', 'Name'] +
    [f'Sub{n}' for n in range(1, 7)] +
    [label for n in range(1, 7) for label in (f'Marks{n}', f'Grade{n}')] +
    ['Result']
)

# Build the frame and write the workbook
df = pd.DataFrame(data, columns=columns)
output_path = '/content/CBSE_Final_Output_With_Subjects_Marks_Grades.xlsx'
df.to_excel(output_path, index=False)

print("✅ File saved:", output_path)



✅ File saved: /content/CBSE_Final_12_Grades.xlsx


In [14]:
import re
import pandas as pd

# Class-12 raw result text (Colab upload location)
file_path = '/content/RAW DATA-12.TXT'
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

# Page furniture: dates, column headings, rules, school/region banners, blanks
skip_patterns = [
    r'^DATE:-', r'^ROLL\s+F', r'^NO\s+L', r'^-+$',
    r'^SCHOOL\s+:\s+-', r'^TOTAL CANDIDATES', r'^\f', r'^$', r'^REGION:'
]

data = []
cursor = 0

# Each step reads a line and its successor, hence the "- 1" bound
while cursor < len(lines) - 1:
    head = lines[cursor].strip()
    marks_line = lines[cursor + 1].strip()

    # Only lines that start with an 8-digit roll number open a record
    is_furniture = any(re.match(p, head) for p in skip_patterns)
    if is_furniture or not re.match(r'^\d{8}', head):
        cursor += 1
        continue

    # Every outcome below consumes the header line and the line after it
    cursor += 2

    # Roll No, Gender, Name (name ends where the first 3-digit code starts)
    parsed = re.match(r'^(\d{8})\s+([MF])\s+(.*?)\s+(\d{3})', head)
    if parsed is None:
        continue

    roll_no = parsed.group(1)
    gender = parsed.group(2)
    name = parsed.group(3).strip()

    # First five stand-alone 3-digit numbers are taken as subject codes
    subject_codes = re.findall(r'\b\d{3}\b', head)[:5]

    # Result keyword, if any
    verdict = re.search(r'(PASS|FAIL|COMP|ABST)', head)
    result = verdict.group(1) if verdict else ""

    # NOTE(review): only grades A1-D2 are recognised here. A subject carrying
    # any other grade token is not captured, so later pairs shift left
    # relative to subject_codes (or the candidate is dropped) - confirm
    # against the raw file.
    marks_grades = re.findall(r'(\d{2,3})\s+([A-D][1-2])', marks_line)
    if len(subject_codes) < 5 or len(marks_grades) < 5:
        continue

    row = [roll_no, gender, name]
    row.extend(subject_codes)
    for pair in marks_grades[:5]:
        row.extend(pair)
    row.append(result)

    data.append(row)

# Headers follow the row layout: identity, codes, interleaved marks/grades, result
columns = (
    ['Roll No', 'Gender', 'Name'] +
    [f'Sub{n}' for n in range(1, 6)] +
    [label for n in range(1, 6) for label in (f'Marks{n}', f'Grade{n}')] +
    ['Result']
)

# Build the frame and write the workbook
df = pd.DataFrame(data, columns=columns)
output_file = '/content/CBSE_Extracted_Correct_Format.xlsx'
df.to_excel(output_file, index=False)

print("✅ File saved to:", output_file)


✅ File saved to: /content/CBSE_Extracted_Correct_Format.xlsx


In [15]:
import re
import pandas as pd

# Class-12 raw result text (Colab upload location)
file_path = '/content/RAW DATA-12.TXT'
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

# Page furniture: dates, column headings, rules, school/region banners, blanks
skip_patterns = [
    r'^DATE:-', r'^ROLL\s+F', r'^NO\s+L', r'^-+$',
    r'^SCHOOL\s+:\s+-', r'^TOTAL CANDIDATES', r'^\f', r'^$', r'^REGION:'
]

data = []
cursor = 0

# Each step reads a line and its successor, hence the "- 1" bound
while cursor < len(lines) - 1:
    head = lines[cursor].strip()
    marks_line = lines[cursor + 1].strip()

    # Only lines that start with an 8-digit roll number open a record
    is_furniture = any(re.match(p, head) for p in skip_patterns)
    if is_furniture or not re.match(r'^\d{8}', head):
        cursor += 1
        continue

    # Every outcome below consumes the header line and the line after it
    cursor += 2

    # Roll No, Gender, Name (name ends where the first 3-digit code starts)
    parsed = re.match(r'^(\d{8})\s+([MF])\s+(.*?)\s+(\d{3})', head)
    if parsed is None:
        continue

    roll_no = parsed.group(1)
    gender = parsed.group(2)
    name = parsed.group(3).strip()

    # First five stand-alone 3-digit numbers are taken as subject codes
    subject_codes = re.findall(r'\b\d{3}\b', head)[:5]

    # Three consecutive grades directly before the result keyword; records
    # without that exact tail are skipped.
    tail = re.search(r'(A[1-2]|B[1-2]|C[1-2]|D[1-2])(?:\s+)(A[1-2]|B[1-2]|C[1-2]|D[1-2])(?:\s+)(A[1-2]|B[1-2]|C[1-2]|D[1-2])\s+(PASS|FAIL|COMP|ABST)', head)
    if tail is None:
        continue
    isg1, isg2, isg3, result = tail.groups()

    # NOTE(review): only grades A1-D2 are recognised here. A subject carrying
    # any other grade token is not captured, so later pairs shift left
    # relative to subject_codes (or the candidate is dropped) - confirm
    # against the raw file.
    marks_grades = re.findall(r'(\d{2,3})\s+([A-D][1-2])', marks_line)
    if len(subject_codes) < 5 or len(marks_grades) < 5:
        continue

    # Assemble: identity, codes, the three extra grades, marks/grades, result
    row = [roll_no, gender, name]
    row.extend(subject_codes)
    row.extend((isg1, isg2, isg3))
    for pair in marks_grades[:5]:
        row.extend(pair)
    row.append(result)

    data.append(row)

# Headers follow the row layout built above
columns = (
    ['Roll No', 'Gender', 'Name'] +
    [f'Sub{n}' for n in range(1, 6)] +
    [f'ISG{n}' for n in range(1, 4)] +
    [label for n in range(1, 6) for label in (f'Marks{n}', f'Grade{n}')] +
    ['Result']
)

# Build the frame and write the workbook
df = pd.DataFrame(data, columns=columns)
output_path = '/content/CBSE_Final_With_INT_SBJ_GRD.xlsx'
df.to_excel(output_path, index=False)

print("✅ File saved:", output_path)


✅ File saved: /content/CBSE_Final_With_INT_SBJ_GRD.xlsx
