In [15]:
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction

# 1. Motif Search
def find_motifs(sequence, motif="TAT"):
    """Return the 1-based start positions of every occurrence of ``motif``.

    The search is case-insensitive and reports overlapping occurrences
    (e.g. ``"TAT"`` occurs twice in ``"TATAT"``).

    Args:
        sequence: Nucleotide sequence. Must provide ``upper()``; the result
            is converted with ``str()`` before searching, so a plain string
            works (a Biopython ``Seq`` is expected to work the same way).
        motif: Pattern to look for. Defaults to ``"TAT"``.

    Returns:
        A list of 1-based start positions, or the string ``"None"`` when the
        motif does not occur or is empty.
    """
    haystack = str(sequence.upper())
    needle = str(motif.upper())
    if not needle:
        # An empty motif would "match" at every offset, which is meaningless.
        return "None"

    positions = []
    hit = haystack.find(needle)
    while hit != -1:
        positions.append(hit + 1)
        # Resume one character later so overlapping matches are still found.
        hit = haystack.find(needle, hit + 1)
    return positions if positions else "None"

# 2. GC Content
def calculate_gc_content(sequence):
    """Return the GC content of ``sequence`` as a percentage.

    The fraction reported by ``gc_fraction`` is scaled to a percentage and
    rounded to two decimal places.
    """
    fraction = gc_fraction(sequence)
    percentage = fraction * 100
    return round(percentage, 2)

# 3. Identify Coding Regions (ORFs)
def identify_coding_regions(sequence):
    """Find open reading frames (ATG ... stop codon) on the forward strand.

    Each of the three forward reading frames is scanned codon by codon. A
    region opens at the first ``ATG`` seen while no region is open and closes
    at the next in-frame stop codon (``TAA``, ``TAG`` or ``TGA``). Further
    ``ATG`` codons inside an open region do not start a new one. An ``ATG``
    with no downstream in-frame stop codon yields nothing.

    Args:
        sequence: Nucleotide sequence. Must provide ``upper()``; the result
            is converted with ``str()`` before scanning, so a plain string
            works (a Biopython ``Seq`` is expected to work the same way).

    Returns:
        A list of ``(start, end)`` tuples using 1-based inclusive coordinates
        (``end`` is the last base of the stop codon), ordered by frame and
        then by position, or the string ``"None"`` when no region is found.
    """
    # Work on a plain string: codon slices are then ordinary ``str`` objects,
    # so set membership does not depend on how a sequence object hashes.
    dna = str(sequence.upper())
    stop_codons = {"TAA", "TAG", "TGA"}
    coding_regions = []
    last_codon_start = len(dna) - 2  # exclusive bound for a full codon

    for frame in range(3):
        # Single pass per frame: ``open_start`` is the 0-based index of the
        # ATG of the region currently being extended, or None if none is open.
        open_start = None
        for pos in range(frame, last_codon_start, 3):
            codon = dna[pos:pos + 3]
            if open_start is None:
                if codon == "ATG":
                    open_start = pos
            elif codon in stop_codons:
                coding_regions.append((open_start + 1, pos + 3))
                open_start = None

    return coding_regions if coding_regions else "None"

# 4. Main Analysis
def main(path=r"C:\Users\Ayaan\Downloads\archive\human.txt"):
    """Annotate a tab-separated sequence table and print a summary.

    The table is read with pandas and must contain a ``sequence`` column
    (used to compute the annotations) and a ``class`` column (printed next
    to them). Three columns are added: GC content in percent, the positions
    of the ``TAT`` motif, and the detected coding regions.

    Args:
        path: Location of the tab-separated dataset. Defaults to the
            original hard-coded location so existing ``main()`` calls keep
            working; pass another path to analyse a different file.
    """
    # Load dataset
    df = pd.read_csv(path, sep="\t")

    # Derive one annotation column per analysis function.
    sequences = df["sequence"]
    df["GC_Content(%)"] = sequences.apply(calculate_gc_content)
    df["Motif_Positions(TAT)"] = sequences.apply(find_motifs)
    df["Coding_Regions"] = sequences.apply(identify_coding_regions)

    # Final results
    print(df[["class", "GC_Content(%)", "Motif_Positions(TAT)", "Coding_Regions"]])

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


      class  GC_Content(%)                               Motif_Positions(TAT)  \
0         4          39.61                                  [23, 59, 83, 139]   
1         4          44.20  [74, 86, 106, 210, 218, 234, 293, 297, 323, 34...   
2         3          43.12  [63, 199, 205, 313, 345, 418, 470, 553, 610, 6...   
3         3          41.79  [63, 199, 205, 313, 345, 418, 470, 553, 610, 6...   
4         3          42.73  [64, 96, 169, 221, 304, 361, 397, 521, 592, 66...   
...     ...            ...                                                ...   
4375      0          29.82                                       [23, 43, 49]   
4376      6          52.10  [352, 388, 454, 703, 778, 814, 850, 874, 888, ...   
4377      6          51.88  [352, 388, 454, 703, 778, 814, 850, 874, 888, ...   
4378      6          55.11                                 [76, 88, 477, 520]   
4379      6          57.30                                    [352, 388, 454]   

                           