In [5]:
import pandas as pd
from gtfparse import read_gtf

def parse_attr(attr):
    items = attr.split('; ')
    res={}
    for it in items:
        its = it.split(' ')
        its = [it.replace('"','').replace(';','') for it in its]
        res[its[0]] = its[1]
    return res
    
# 1. 读取 GTF 文件
gtf_file = "gencode.v36.annotation.gtf.gz"
gtf_data = pd.read_csv(gtf_file,sep='\t',header=None,comment='#',
                       names=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'])

# 提取基因信息
genes = gtf_data[gtf_data['feature'] == 'gene']
genes_attr = genes.attribute.apply(parse_attr).apply(pd.Series)
genes['length'] = genes['end'] - genes['start'] + 1
dfg = genes[['seqname', 'source', 'score', 'strand', 'frame', 'length']].join(genes_attr[['gene_id', 'gene_type', 'gene_name']])
dfg = dfg.set_index('gene_id')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genes['length'] = genes['end'] - genes['start'] + 1


In [7]:
dfg.to_csv('./gene_info.csv')

In [10]:
import pandas as pd
from collections import defaultdict

# 1. Load the GTF file as a DataFrame
gtf = pd.read_csv("gencode.v36.annotation.gtf.gz", sep='\t', comment='#', header=None,
                  names=["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"])

# 2. Filter for exons only
exons = gtf[gtf["feature"] == "exon"].copy()

# 3. Extract gene_name from the attribute column
import re

def extract_gene_name(attr_str):
    match = re.search(r'gene_name "([^"]+)"', attr_str)
    return match.group(1) if match else None

exons["gene_name"] = exons["attribute"].apply(extract_gene_name)

# 4. Group exons by gene and merge overlapping intervals
def merge_intervals(intervals):
    if not intervals:
        return []
    # Sort intervals by start
    intervals = sorted(intervals, key=lambda x: x[0])
    merged = [intervals[0]]
    for current in intervals[1:]:
        prev = merged[-1]
        if current[0] <= prev[1]:
            merged[-1] = (prev[0], max(prev[1], current[1]))
        else:
            merged.append(current)
    return merged

# 5. Compute gene lengths
gene_intervals = defaultdict(list)
for _, row in exons.iterrows():
    gene = row["gene_name"]
    start = row["start"]
    end = row["end"]
    gene_intervals[gene].append((start, end))

gene_lengths = {}
for gene, intervals in gene_intervals.items():
    merged = merge_intervals(intervals)
    total_length = sum(end - start + 1 for start, end in merged)
    gene_lengths[gene] = total_length

# 6. Convert to pandas Series
gene_lengths = pd.Series(gene_lengths)


In [12]:
gene_lengths.to_pickle('./gene_length.pkl')