In [None]:
import pandas as pd
import numpy as np

# ---------------------------------------
# Step 1. Load the raw table
# ---------------------------------------
# The first CSV column becomes the index and the first row the header;
# `df` is the transposed view of that table.
print("\n1. Загрузка данных")
df_raw = pd.read_csv('gen_info.csv', header=0, index_col=0)
df = df_raw.transpose()

# ---------------------------------------
# Step 2. Drop the 'reccurence' column
# ---------------------------------------
# errors="ignore" makes this a no-op when the column is absent.
print("\n2. Удаление колонки reccurence")
df = df.drop(columns="reccurence", errors="ignore")

# ---------------------------------------
# Step 3. Reshape to long format (one row per index/column pair)
# ---------------------------------------
print("\n3. Преобразование в long format (для базы данных)")
db_df = (
    df.stack()
    .rename_axis(['TCGA_ID', 'Ensembl_ID'])
    .reset_index(name='Gene_Value')
)

# ---------------------------------------
# Step 4. Keep only rows whose value is not zero
# ---------------------------------------
print("\n4. Фильтрация ненулевых значений")
db_df = db_df.loc[db_df['Gene_Value'].ne(0)]

# ---------------------------------------
# Step 5. Persist the long table
# ---------------------------------------
print("\n5. Сохранение базы данных")
db_df.to_csv('tcga_ensembl_db.csv', index=False)
print("Файл tcga_ensembl_db.csv создан!")


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, PowerTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# =======================================
# 1. Load and prepare the data
# =======================================
print("\n1. Загрузка и подготовка данных")
RANDOM_STATE = 42  # seed for the mutual-information estimator (it is stochastic)
N_FEATURES = 764   # number of features kept; the API handlers check for exactly 764

model_df = df_raw.T
# NOTE(review): the first row of the transposed table is skipped, as before —
# presumably it is not a sample row; confirm against gen_info.csv.
samples = model_df[1:]
x = samples.drop(columns='reccurence')
y = samples['reccurence']

# =======================================
# 2. Drop columns that are zero for every sample
# =======================================
print("\n2. Удаление полностью нулевых столбцов")
non_zero_columns_mask = (x != 0).any(axis=0)
x_filtered = x.loc[:, non_zero_columns_mask]
counter = x.shape[1] - x_filtered.shape[1]
x = x_filtered
print(f"Удалено столбцов: {counter}")

# =======================================
# 3. Normalise the data
# =======================================
print("\n3. Нормализация данных")
transformer = PowerTransformer(method='yeo-johnson')
x_normalized = transformer.fit_transform(x)
# Keep the sample index so every row stays tied to its identifier.
x_norm = pd.DataFrame(data=x_normalized, columns=x.columns, index=x.index)

# =======================================
# 4. Feature selection
# =======================================
print("\n4. Отбор признаков")


def _mi_score(features, target):
    """Mutual-information scorer with a fixed seed so the selection is reproducible."""
    return mutual_info_classif(features, target, random_state=RANDOM_STATE)


k_best = SelectKBest(score_func=_mi_score, k=N_FEATURES)
x_new = k_best.fit_transform(x_norm, y)
selected_features = x_norm.columns[k_best.get_support()]
print("Отобранные признаки:", selected_features)
# NOTE(review): the mapping from KB-i back to the original column name exists
# only in `selected_features` / `kb_features`; it is not written to disk.
kb_features = pd.DataFrame(selected_features)

# =======================================
# 5. Build the final DataFrame
# =======================================
print("\n5. Создание итогового DataFrame")
kb_df = pd.DataFrame(
    data=x_new,
    columns=[f'KB-{i}' for i in range(1, x_new.shape[1] + 1)]
)
kb_df['KB-Y'] = LabelEncoder().fit_transform(y)
# Take the identifiers from the rows that were actually transformed instead of
# from `df`, which is state left behind by a different cell.
kb_df['TCGA_ID'] = x_norm.index.to_numpy()
print(kb_df.head())

# =======================================
# 6. Save the result
# =======================================
print("\n6. Сохранение результата")
kb_df.to_csv('kb_df_with_tcga_id.csv', index=False)
print("Файл kb_df_with_tcga_id.csv создан!")

In [16]:
!pip install fastapi uvicorn[standard] nest-asyncio pyngrok scikit-learn joblib
!pip install python-multipart



In [38]:
import nest_asyncio
from pyngrok import ngrok
import joblib
import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn

In [45]:
# =======================================
# 1. ngrok configuration
# =======================================
import os  # local import: only this cell needs it

print("\n1. Настройка ngrok")
# The auth token is a credential and must not be stored in the notebook.
# It is read from the NGROK_AUTHTOKEN environment variable instead.
# NOTE(review): the token that used to be hard-coded here should be revoked.
_ngrok_token = os.environ.get("NGROK_AUTHTOKEN")
if not _ngrok_token:
    raise RuntimeError(
        "NGROK_AUTHTOKEN is not set; export your ngrok auth token before running this cell"
    )
ngrok.set_auth_token(_ngrok_token)

# =======================================
# 2. FastAPI initialisation
# =======================================
print("\n2. Инициализация FastAPI")
app = FastAPI(title="TCGA Gene Prediction API")

# CORS is left fully open: any origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# =======================================
# 3. Load the model and the lookup tables
# =======================================
print("\n3. Загрузка модели и данных")
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files from a trusted source.
model = joblib.load("gb_model.joblib")
db_df = pd.read_csv('tcga_ensembl_db.csv')  # columns: TCGA_ID, Ensembl_ID, Gene_Value
kb_df = pd.read_csv('kb_df_with_tcga_id.csv')  # columns: KB-1, ..., KB-764, KB-Y, TCGA_ID

# =======================================
# 4. Pydantic model for the request body
# =======================================
print("\n4. Pydantic-модель для запроса")
class IDRequest(BaseModel):
    """Request body for getting a prediction by TCGA_ID and Ensembl_ID."""
    # Sample identifier; the /predict_by_id handler compares it with the TCGA_ID columns.
    tcga_id: str
    # Gene identifier; the /predict_by_id handler compares it with the Ensembl_ID column.
    ensembl_id: str


1. Настройка ngrok

2. Инициализация FastAPI

3. Загрузка модели и данных

4. Pydantic-модель для запроса


In [49]:
# =======================================
# 5. Prediction endpoint
# =======================================
print("\n5. Endpoint для предсказания")
@app.post("/predict_by_id", summary="Получить предсказание по TCGA_ID и Ensembl_ID")
async def predict_by_id(request: IDRequest):
    """Return the stored gene value and the model prediction for one sample.

    Args:
        request: body with ``tcga_id`` and ``ensembl_id``.

    Returns:
        dict with the two ids, ``gene_value``, the feature vector (``features``),
        ``prediction`` (column 1 of ``model.predict_proba``) and
        ``predicted_label`` (``model.predict``).

    Raises:
        HTTPException: 404 when the id pair is not in ``db_df`` or the sample is
            not in ``kb_df``; 500 when the feature vector does not have 764 entries.
    """
    tcga_id = request.tcga_id
    ensembl_id = request.ensembl_id

    # 1. Gene value
    # NOTE(review): the database file is written after zero values are filtered
    # out, so a valid pair whose value is 0 also ends up in the 404 branch.
    mask = (db_df['TCGA_ID'] == tcga_id) & (db_df['Ensembl_ID'] == ensembl_id)
    matches = db_df.loc[mask, 'Gene_Value']
    if matches.empty:
        raise HTTPException(404, detail="TCGA_ID и Ensembl_ID не найдены в базе")
    gene_value = float(matches.iloc[0])

    # 2. Feature vector
    kb_row = kb_df[kb_df['TCGA_ID'] == tcga_id]
    if kb_row.empty:
        raise HTTPException(404, detail="TCGA_ID не найден в признаках")
    # Remove the label and id columns by name (not by position) and cast to
    # float: a row taken across the mixed-dtype frame would otherwise be an
    # object-dtype array.
    features = kb_row.drop(columns=['KB-Y', 'TCGA_ID']).iloc[0].to_numpy(dtype=float)

    if len(features) != 764:
        raise HTTPException(500, detail=f"Ожидается 764 признака, найдено {len(features)}")

    # 3. Prediction
    input_data = features.reshape(1, -1)
    proba = float(model.predict_proba(input_data)[0, 1])
    pred_label = int(model.predict(input_data)[0])

    return {
        "tcga_id": tcga_id,
        "ensembl_id": ensembl_id,
        "gene_value": gene_value,
        "features": features.tolist(),
        "prediction": proba,
        "predicted_label": pred_label
    }


5. Endpoint для предсказания


In [50]:
from typing import List
class FeaturesRequest(BaseModel):
    """Request body for the /predict endpoint: a raw feature vector."""
    # The /predict handler rejects any vector whose length is not 764.
    features: List[float]

@app.post("/predict")
async def predict(request: FeaturesRequest):
    """Score a caller-supplied feature vector with the loaded model.

    Args:
        request: body whose ``features`` list must contain exactly 764 floats.

    Returns:
        dict with ``status``, ``prediction`` (column 1 of ``model.predict_proba``),
        ``predicted_label`` (``model.predict``) and ``features_received``.

    Raises:
        HTTPException: 400 when the vector length is not 764; 500 for any
            other failure, with the exception text as detail.
    """
    try:
        print(f"Received features (first 5): {request.features[:5]}")
        print(f"Total features received: {len(request.features)}")

        if len(request.features) != 764:
            error_msg = f"Expected 764 features, got {len(request.features)}"
            print(error_msg)
            raise HTTPException(status_code=400, detail=error_msg)

        input_data = np.array(request.features, dtype=np.float32).reshape(1, -1)
        print(f"Input array shape: {input_data.shape}")

        proba = float(model.predict_proba(input_data)[0, 1])
        pred_label = int(model.predict(input_data)[0])

        return {
            "status": "success",
            "prediction": proba,
            "predicted_label": pred_label,
            "features_received": len(request.features)
        }

    except HTTPException:
        # Deliberate HTTP errors (the 400 above) must reach the client as-is;
        # without this clause the generic handler below rewraps them as 500.
        raise
    except Exception as e:
        print(f"Error: {str(e)}")
        raise HTTPException(500, detail=str(e)) from e


In [51]:
class TCGARequest(BaseModel):
    """Request body for the /predict_by_tcga endpoint."""
    # Sample identifier; the handler compares it with the TCGA_ID column of kb_df.
    tcga_id: str

@app.post("/predict_by_tcga")
async def predict_by_tcga(request: TCGARequest):
    """Return the model prediction for the sample identified by ``tcga_id``.

    The function is named after its route so that it no longer rebinds the
    module-level name of the ``/predict_by_id`` handler; the HTTP route itself
    is unchanged.

    Args:
        request: body with ``tcga_id``.

    Returns:
        dict with ``tcga_id``, the feature vector (``features``), ``prediction``
        (column 1 of ``model.predict_proba``) and ``predicted_label``
        (``model.predict``).

    Raises:
        HTTPException: 404 when the sample is not in ``kb_df``; 500 when the
            feature vector does not have 764 entries.
    """
    tcga_id = request.tcga_id

    # 1. Look up the feature vector
    kb_row = kb_df[kb_df['TCGA_ID'] == tcga_id]
    if kb_row.empty:
        raise HTTPException(404, detail="TCGA_ID не найден в признаках")
    # Remove the label and id columns by name (not by position) and cast to
    # float: a row taken across the mixed-dtype frame would otherwise be an
    # object-dtype array.
    features = kb_row.drop(columns=['KB-Y', 'TCGA_ID']).iloc[0].to_numpy(dtype=float)

    if len(features) != 764:
        raise HTTPException(500, detail=f"Ожидается 764 признака, найдено {len(features)}")

    # 2. Prediction
    input_data = features.reshape(1, -1)
    proba = float(model.predict_proba(input_data)[0, 1])
    pred_label = int(model.predict(input_data)[0])

    # 3. Response
    return {
        "tcga_id": tcga_id,
        "features": features.tolist(),
        "prediction": proba,
        "predicted_label": pred_label
    }

In [52]:
# =======================================
# 6. Start the server
# =======================================
print("\n6. Запуск сервера")
if __name__ == "__main__":
    SERVER_PORT = 8000  # one value shared by the tunnel and by uvicorn

    # Lets uvicorn start its event loop inside the notebook's running loop.
    nest_asyncio.apply()
    public_url = ngrok.connect(SERVER_PORT).public_url
    try:
        print("\n🔥 Сервер доступен по URL:", public_url)
        print("📚 Документация API:", f"{public_url}/docs\n")
        # Blocks until the server is stopped.
        uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT)
    finally:
        # Close the tunnel once the server stops so that re-running the cell
        # does not leave earlier tunnels open.
        ngrok.disconnect(public_url)


6. Запуск сервера

🔥 Сервер доступен по URL: https://582a-34-73-147-116.ngrok-free.app
📚 Документация API: https://582a-34-73-147-116.ngrok-free.app/docs



INFO:     Started server process [279]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Received features (first 5): [-0.8607495285461352, -0.7126339504146454, -0.31005299087674104, -0.6241783457236083, -0.31616839726294566]
Total features received: 764
Input array shape: (1, 764)
INFO:     5.189.111.215:0 - "POST /predict HTTP/1.1" 200 OK




INFO:     5.189.111.215:0 - "POST /predict_by_id HTTP/1.1" 200 OK




Received features (first 5): [-0.8607495285461352, -0.7126339504146454, -0.31005299087674104, -0.6241783457236083, -0.31616839726294566]
Total features received: 764
Input array shape: (1, 764)
INFO:     5.189.111.215:0 - "POST /predict HTTP/1.1" 200 OK




Received features (first 5): [-0.8607495285461352, -0.7126339504146454, -0.31005299087674104, -0.6241783457236083, -0.31616839726294566]
Total features received: 764
Input array shape: (1, 764)
INFO:     5.189.111.215:0 - "POST /predict HTTP/1.1" 200 OK




Received features (first 5): [-0.8607495285461352, -0.7126339504146454, -0.31005299087674104, -0.6241783457236083, -0.31616839726294566]
Total features received: 764
Input array shape: (1, 764)
INFO:     5.189.111.215:0 - "POST /predict HTTP/1.1" 200 OK




INFO:     5.189.111.215:0 - "POST /predict_by_id HTTP/1.1" 200 OK




INFO:     5.189.111.215:0 - "POST /predict_by_tcga HTTP/1.1" 200 OK




Received features (first 5): [-0.8607495285461352, -0.7126339504146454, -0.31005299087674104, -0.6241783457236083, -0.31616839726294566]
Total features received: 764
Input array shape: (1, 764)
INFO:     5.189.111.215:0 - "POST /predict HTTP/1.1" 200 OK




INFO:     5.189.111.215:0 - "POST /predict_by_id HTTP/1.1" 200 OK




INFO:     5.189.111.215:0 - "POST /predict_by_tcga HTTP/1.1" 200 OK




Received features (first 5): [-0.8607495285461352, -0.7126339504146454, -0.31005299087674104, -0.6241783457236083, -0.31616839726294566]
Total features received: 764
Input array shape: (1, 764)
INFO:     5.189.111.215:0 - "POST /predict HTTP/1.1" 200 OK




INFO:     5.189.111.215:0 - "POST /predict_by_id HTTP/1.1" 200 OK




INFO:     5.189.111.215:0 - "POST /predict_by_tcga HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [279]
