In [15]:
import csv

# Relative paths to the two inputs of this notebook: the STR database (a CSV
# file) and the DNA sequence (a plain-text file).
database_file = "databases/small.csv"
sequence_file = "sequences/1.txt"

In [13]:
# Read the STR database. The first CSV column is skipped when collecting the
# STR pattern names; every row is kept as a dict of column name -> string value.
with open(database_file, newline="") as db_file:
    db_reader = csv.DictReader(db_file)
    str_names = db_reader.fieldnames[1:]
    people = [record for record in db_reader]

print("STR names:", str_names)
print("Database rows:")
for entry in people:
    print(entry)


STR names: ['AGATC', 'AATG', 'TATC']
Database rows:
{'name': 'Alice', 'AGATC': '2', 'AATG': '8', 'TATC': '3'}
{'name': 'Bob', 'AGATC': '4', 'AATG': '1', 'TATC': '5'}
{'name': 'Charlie', 'AGATC': '3', 'AATG': '2', 'TATC': '5'}


In [18]:
# Load the DNA sequence and drop leading/trailing whitespace (e.g. a final newline).
with open(sequence_file, "r") as seq_file:
    raw_text = seq_file.read()
sequence = raw_text.strip()

print("sequence length:", len(sequence))
print(sequence)


sequence length: 170
AAGGTAAGTTTAGAATATAAAAGGTGAGTTAAATAGAATAGGTTAAAATTAAAGGAGATCAGATCAGATCAGATCTATCTATCTATCTATCTATCAGAAAAGAGTAAATAGTTAAAGAGTAAGATATTGAATTAATGGAAAATATTGTTGGGGAAAGGAGGGATAGAAGG


In [19]:
def find_pattern_indices(sequence: str, pattern: str):
    """Return every start index at which `pattern` occurs in `sequence`.

    All occurrences are reported in ascending order, including ones that
    overlap each other. If the pattern is longer than the sequence the result
    is an empty list.
    """
    width = len(pattern)
    last_start = len(sequence) - width

    # Slide a window of the pattern's width across the sequence and keep the
    # offsets whose window is equal to the pattern.
    return [
        start
        for start in range(last_start + 1)
        if sequence[start:start + width] == pattern
    ]


In [31]:
# Locate every occurrence of the STR "AGATC" in the loaded sequence.
indices = find_pattern_indices(sequence, "AGATC")
print(indices[:10])   # show at most the first ten start indices
print("Total found:", len(indices))



[55, 60, 65, 70]
Total found: 4


In [32]:
def max_consecutive_from_indices(indices, pattern_len: int) -> int:
    """
    Return the largest number of back-to-back repeats of a pattern.

    `indices` are the start positions at which the pattern occurs and
    `pattern_len` is the pattern's length. Two occurrences are back-to-back
    when their start positions differ by exactly `pattern_len`.

    Example: indices = [40, 45, 50, 80, 85], pattern_len = 5
      -> first chain:  40 -> 45 -> 50 = 3 repeats
      -> second chain: 80 -> 85       = 2 repeats
      -> returns 3

    Chains are followed by looking up `start - pattern_len` directly instead
    of comparing neighbouring entries of `indices`, so an overlapping
    occurrence sitting between two links does not break the chain
    (e.g. "AA" in "AAAA" yields indices [0, 1, 2] and a longest run of 2).

    Returns 0 when `indices` is empty.
    """
    if not indices:   # the pattern never occurs
        return 0

    # run_ending_at[start] = number of back-to-back repeats whose last repeat
    # begins at `start`.
    run_ending_at = {}
    longest = 0

    # Ascending order guarantees that the predecessor of a start position
    # (start - pattern_len) has already been processed when it exists.
    for start in sorted(set(indices)):
        run = run_ending_at.get(start - pattern_len, 0) + 1
        run_ending_at[start] = run
        if run > longest:
            longest = run

    return longest


In [33]:
def longest_str_run(sequence: str, pattern: str) -> int:
    """Return the longest consecutive-repeat count of `pattern` in `sequence`.

    The start indices of the pattern are collected with find_pattern_indices
    and handed, together with the pattern's length, to
    max_consecutive_from_indices, whose result is returned.
    """
    hits = find_pattern_indices(sequence, pattern)
    # Tip: temporarily print the indices to see what is going on:
    # print("indices for", pattern, ":", hits)
    run_length = max_consecutive_from_indices(hits, len(pattern))
    return run_length


In [35]:
# Longest consecutive run of every database STR inside the loaded sequence,
# keyed by STR pattern (same order as str_names).
str_counts = {name: longest_str_run(sequence, name) for name in str_names}
print(str_counts)


{'AGATC': 4, 'AATG': 1, 'TATC': 5}


In [37]:
# Report the first person whose every STR count equals the count measured in
# the sequence; the default label is kept when nobody matches.
match_name = "No match"

for person in people:   # people holds the rows read from the CSV database
    # CSV values are strings, so each one goes through int() before the
    # comparison. all() stops at the first STR that differs.
    if all(int(person[name]) == str_counts[name] for name in str_names):
        match_name = person["name"]
        break
print("Match result:", match_name)


Match result: Bob


In [38]:
def compute_str_counts(sequence, str_names):
    """Map each STR pattern in `str_names` to its longest run in `sequence`.

    The run length of every pattern is obtained from longest_str_run; the
    returned dict preserves the order of `str_names`.
    """
    return {name: longest_str_run(sequence, name) for name in str_names}


def find_match(people, str_names, str_counts, default="No match"):
    """Return the name of the first person whose STR counts all match.

    `people` is an iterable of row dicts (string values, including a "name"
    key), `str_names` the STR patterns to compare and `str_counts` the counts
    measured in the DNA sequence. Row values are converted with int() before
    comparing. `default` is returned when no row matches.
    """
    for person in people:
        # all() stops at the first STR whose count differs.
        if all(int(person[name]) == str_counts[name] for name in str_names):
            return person["name"]
    return default


# Compute the STR counts
str_counts = compute_str_counts(sequence, str_names)

print("STR counts found in DNA sequence:")
print(str_counts)

# Compare against the CSV database
match_name = find_match(people, str_names, str_counts)

print("Match result:", match_name)


STR counts found in DNA sequence:
{'AGATC': 4, 'AATG': 1, 'TATC': 5}
Match result: Bob


In [40]:
# Side-by-side view: for every person in the database, print each STR count
# stored in the CSV next to the count measured in the DNA sequence.
for person in people:
    print("----")
    print("Person:", person["name"])

    for s in str_names:
        # CSV values are strings; int() converts them before they are shown
        print(f"  {s}: CSV={int(person[s])}, DNA={str_counts[s]}")


----
Person: Alice
  AGATC: CSV=2, DNA=4
  AATG: CSV=8, DNA=1
  TATC: CSV=3, DNA=5
----
Person: Bob
  AGATC: CSV=4, DNA=4
  AATG: CSV=1, DNA=1
  TATC: CSV=5, DNA=5
----
Person: Charlie
  AGATC: CSV=3, DNA=4
  AATG: CSV=2, DNA=1
  TATC: CSV=5, DNA=5
