In [1]:
!pip install pymatgen

Collecting pymatgen
  Downloading pymatgen-2025.6.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting bibtexparser>=1.4.0 (from pymatgen)
  Downloading bibtexparser-1.4.3.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting monty>=2025.1.9 (from pymatgen)
  Downloading monty-2025.3.3-py3-none-any.whl.metadata (3.6 kB)
Collecting palettable>=3.3.3 (from pymatgen)
  Downloading palettable-3.3.3-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting ruamel.yaml>=0.17.0 (from pymatgen)
  Downloading ruamel.yaml-0.18.15-py3-none-any.whl.metadata (25 kB)
Collecting spglib>=2.5 (from pymatgen)
  Downloading spglib-2.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting uncertainties>=3.1.4 (from pymatgen)
  Downloading uncertainties-3.2.3-py3-none-any.whl.metadata (7.0 kB)


In [2]:
from pymatgen.core import Structure
from pymatgen.ext.cod import COD

Часть 1: Материалы

In [3]:
cod = COD()
results = cod.get_structure_by_formula("MnFe2O4")

entry = results[0]
structure = entry["structure"]

In [4]:
print("Параметры решётки (a,b,c):", structure.lattice.a, structure.lattice.b, structure.lattice.c)
print("Углы (α,β,γ):", structure.lattice.alpha, structure.lattice.beta, structure.lattice.gamma)
print("Объём ячейки:", structure.volume, "³")
print("Плотность:", structure.density, "г/см³")
print("Состав (атомы):", structure.composition.get_el_amt_dict())
print("Молярная масса:", structure.composition.weight, "г/моль")

Параметры решётки (a,b,c): 8.515 8.515 8.515
Углы (α,β,γ): 90.0 90.0 90.0
Объём ячейки: 617.3819908750002 Å³
Плотность: 4.96241094787505 g cm^-3 г/см³
Состав (атомы): {'Mn': 8.0, 'Fe': 16.0, 'O': 32.0}
Молярная масса: 1845.00516 amu г/моль


Часть 2: Последовательности ДНК/РНК

In [6]:
!pip install Bio

Collecting Bio
  Downloading bio-1.8.0-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.8.0-py3-none-any.whl (321 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.1/321.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gprofiler_official-1.0.0-py3-none-any.whl (9.3

In [9]:
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction

sequences = {
    "gene1": "ATGCGTACTGAGC",
    "gene2": "TTATGGATGACTAGGCTA",
}

for name, seq_str in sequences.items():
    seq = Seq(seq_str)

    gc_content = gc_fraction(seq) * 100

    rev_comp = seq.reverse_complement()

    rna_seq = seq.transcribe()

    print(f"--- {name} ---")
    print(f"DNA:              {seq}")
    print(f"GC-content:       {gc_content:.2f}%")
    print(f"Reverse complement: {rev_comp}")
    print(f"RNA transcript:   {rna_seq}\n")

--- gene1 ---
DNA:              ATGCGTACTGAGC
GC-content:       53.85%
Reverse complement: GCTCAGTACGCAT
RNA transcript:   AUGCGUACUGAGC

--- gene2 ---
DNA:              TTATGGATGACTAGGCTA
GC-content:       38.89%
Reverse complement: TAGCCTAGTCATCCATAA
RNA transcript:   UUAUGGAUGACUAGGCUA



Часть 3: Белки

In [20]:
import requests
from io import StringIO
from Bio import SeqIO
import pandas as pd
from collections import Counter

In [16]:
import requests
from io import StringIO
from Bio import SeqIO

def fetch_uniprot_fasta(uniprot_id: str) -> str:
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    resp = requests.get(url)
    if resp.status_code == 200 and resp.text.startswith(">"):
        return resp.text
    else:
        raise ValueError(f"Не удалось получить FASTA для {uniprot_id}, статус {resp.status_code}")

uniprot_id = "P0DUB6"  # AMY1A human

fasta = fetch_uniprot_fasta(uniprot_id)
seq_record = SeqIO.read(StringIO(fasta), "fasta")

print("UniProt ID:", uniprot_id)
print("Название:", seq_record.description)
print("Длина (аминокислот):", len(seq_record.seq))
print("Первые 100 аминокислот:")
print(seq_record.seq)

UniProt ID: P0DUB6
Название: sp|P0DUB6|AMY1A_HUMAN Alpha-amylase 1A OS=Homo sapiens OX=9606 GN=AMY1A PE=1 SV=1
Длина (аминокислот): 511
Первые 100 аминокислот:
MKLFWLLFTIGFCWAQYSSNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIHNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLSGLLDLALGKDYVRSKIAEYMNHLIDIGVAGFRIDASKHMWPGDIKAILDKLHNLNSNWFPEGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWGEGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSSYRWPRYFENGKDVNDWVGPPNDNGVTKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNVVDGQPFTNWYDNGSNQVAFGRGNRGFIVFNNDDWTFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL


In [17]:
def fetch_uniprot_fasta(uniprot_id: str) -> str:
    """
    Загружает FASTA последовательность по UniProt ID.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    resp = requests.get(url)
    if resp.status_code == 200 and resp.text.startswith(">"):
        return resp.text
    else:
        raise ValueError(f"Не удалось получить FASTA для {uniprot_id}, статус {resp.status_code}")


In [18]:
def get_sequence(uniprot_id: str) -> str:
    """
    Возвращает строку аминокислотной последовательности.
    """
    fasta = fetch_uniprot_fasta(uniprot_id)
    record = SeqIO.read(StringIO(fasta), "fasta")
    return str(record.seq)

In [21]:
def amino_acid_composition(seq: str) -> pd.DataFrame:
    """
    Подсчитывает состав аминокислот (counts, fraction, %).
    """
    length = len(seq)
    counts = Counter(seq)
    df = pd.DataFrame.from_dict(counts, orient="index", columns=["count"])
    df["fraction"] = df["count"] / length
    df["percentage"] = df["fraction"] * 100
    df = df.sort_values("count", ascending=False)
    return df

In [22]:
def dipeptide_frequencies(seq: str) -> pd.DataFrame:
    """
    Подсчитывает частоты дипептидов (counts, fraction, %).
    """
    dipeps = [seq[i:i+2] for i in range(len(seq)-1)]
    counts = Counter(dipeps)
    total = sum(counts.values())
    df = pd.DataFrame.from_dict(counts, orient="index", columns=["count"])
    df.index.name = "dipeptide"
    df["fraction"] = df["count"] / total
    df["percentage"] = df["fraction"] * 100
    df = df.sort_values("count", ascending=False)
    return df

In [25]:
uniprot_id = "P0DUB6"
seq = get_sequence(uniprot_id)
print(f"UniProt ID: {uniprot_id}")
print(f"Длина последовательности: {len(seq)} aa")

aa_df = amino_acid_composition(seq)
print("\nАминокислоты:")
print(aa_df)

dipep_df = dipeptide_frequencies(seq)
print("\nДипептиды:")
print(dipep_df)


UniProt ID: P0DUB6
Длина последовательности: 511 aa

Аминокислоты:
   count  fraction  percentage
G     52  0.101761   10.176125
N     41  0.080235    8.023483
D     35  0.068493    6.849315
V     35  0.068493    6.849315
S     33  0.064579    6.457926
F     29  0.056751    5.675147
R     28  0.054795    5.479452
I     28  0.054795    5.479452
L     27  0.052838    5.283757
A     27  0.052838    5.283757
K     24  0.046967    4.696673
T     23  0.045010    4.500978
P     22  0.043053    4.305284
Y     21  0.041096    4.109589
E     20  0.039139    3.913894
W     19  0.037182    3.718200
Q     12  0.023483    2.348337
C     12  0.023483    2.348337
H     12  0.023483    2.348337
M     11  0.021526    2.152642

Дипептиды:
           count  fraction  percentage
dipeptide                             
GF             7  0.013725    1.372549
ND             6  0.011765    1.176471
GN             6  0.011765    1.176471
NG             6  0.011765    1.176471
GS             5  0.009804    0.9803