# Example pipeline using breast samples.

Import libraries

In [None]:
from pathlib import Path
import random

from knowseqpy import (calculate_gene_expression_values, counts_to_dataframe, degs_extraction, get_genes_annotation,
                       rna_seq_qa)
from knowseqpy.batch_effect import sva
from knowseqpy.classifiers import knn
from knowseqpy.feature_selection import discriminant_analysis
from knowseqpy.utils import plot_boxplot, plot_confusion_matrix, plot_samples_heatmap

Set seed for reproducible results

In [2]:
# Seed Python's built-in `random` module so repeated runs draw the same values.
# NOTE(review): this does not seed NumPy's generator; confirm whether the
# downstream knowseqpy steps rely on `numpy.random` and need their own seed.
random.seed(1234)

Set the paths to the sample info file and the count files directory

In [None]:
# Locate the directory two levels above this file. `__file__` only exists when
# the code runs as a script; inside a Jupyter kernel it is undefined and raises
# NameError, so fall back to the parent of the current working directory
# (assumes the notebook is launched from its own folder - TODO confirm).
try:
    script_path = Path(__file__).resolve().parent.parent
except NameError:
    script_path = Path.cwd().resolve().parent

# Sample metadata CSV and the directory holding the per-sample count files.
info_path = script_path / "tests" / "test_fixtures" / "samples_info_breast.csv"
counts_path = script_path / "tests" / "test_fixtures" / "count_files_breast"

Load and preprocess the count files to create a counts DataFrame and the sample labels

In [None]:
# Build the counts table and the per-sample labels from the sample info file
# and the count files directory defined in the previous cell.
counts, labels = counts_to_dataframe(
    info_path=info_path,
    counts_path=counts_path,
)

Print the number of samples per class to understand the dataset's class distribution

In [None]:
# Show how many samples carry each label.
# (assumes `labels` is a pandas Series returned by counts_to_dataframe - TODO confirm)
print(labels.value_counts())

Annotate genes: Fetch gene annotations for the genes in the dataset

In [None]:
# Request annotations for every entry in the index of `counts`.
# (presumably the index holds gene identifiers - verify against counts_to_dataframe)
gene_annotation = get_genes_annotation(values=counts.index)

Normalize and calculate expression values from counts

In [None]:
# Combine the raw counts with the fetched gene annotations to obtain the
# expression values used by the rest of the pipeline.
gene_expression = calculate_gene_expression_values(
    counts=counts,
    gene_annotation=gene_annotation,
)