# NumPy

NumPy is a library for fast numerical computing with N-dimensional arrays. It is the foundation for many other scientific Python libraries, such as pandas.

In [None]:
# Import numpy
import numpy as np

In [None]:
# Building NumPy arrays
# A 1D array holds a flat sequence of values.
expression_levels = np.array([10.5, 15.2, 8.7, 22.1, 5.3, 18.9])
print("Expression levels:", expression_levels)
print("Type:", type(expression_levels))
print("Shape:", expression_levels.shape)

# A 2D array is a matrix; here each row is a sample and each column a gene.
expression_matrix = np.array(
    [
        [10.5, 15.2, 8.7, 22.1],  # Sample 1
        [12.1, 14.8, 9.2, 20.5],  # Sample 2
        [8.9, 16.3, 7.8, 18.7],   # Sample 3
    ]
)
print("\nExpression matrix:", expression_matrix, sep="\n")

# shape is (rows, columns), i.e. (samples, genes) for this matrix.
n_samples, n_genes = expression_matrix.shape
print("Shape:", (n_samples, n_genes))
print("Number of samples:", n_samples)
print("Number of genes:", n_genes)

# Vectorized arithmetic: the division is applied to every element at once,
# scaling the values so that the largest one becomes 1.
normalized = expression_levels / expression_levels.max()
print("\nNormalized expression levels:", normalized)

In [None]:
# Summary statistics of the 1D expression array, each printed to two decimals
print("\nStatistics:")
for label, reducer in (
    ("Mean", np.mean),
    ("Median", np.median),
    ("Std", np.std),
    ("Min", np.min),
    ("Max", np.max),
):
    print(f"{label}: {reducer(expression_levels):.2f}")


In [None]:

# Boolean indexing: a True/False mask selects only the matching elements
above_threshold = expression_levels > 15
high_expression = expression_levels[above_threshold]
print(f"\nHigh expression values (>15): {high_expression}")

# Reshaping: lay the integers 0..11 out as 3 rows by 4 columns
vector = np.arange(0, 12)
matrix = vector.reshape((3, 4))
print("\nReshaped array (3x4 matrix):", matrix, sep="\n")



In [None]:
# Store a DNA sequence as an array with one character per element
dna_sequence = np.array([*"ATGCGATACGCTTGA"])
print("\nDNA sequence as numpy array:", dna_sequence)

# Comparing against a base gives a boolean mask; summing it counts the matches.
adenine_count = (dna_sequence == 'A').sum()
guanine_count = (dna_sequence == 'G').sum()
print(f"Count of 'A': {adenine_count}")
print(f"Count of 'G': {guanine_count}")

# Draw 10 values from a normal distribution with mean 100 and standard
# deviation 15 (useful for simulations)
random_normal = np.random.normal(100, 15, 10)
print("\nRandom expression values (normal distribution):", random_normal, sep="\n")

# pandas

pandas is a powerful data analysis library that deals with tabular data.

In [None]:
# Import pandas
import pandas as pd

In [None]:
# Build a DataFrame (comparable to a spreadsheet or an R data.frame)
# holding example gene expression data: one row per gene, one column per field.
data = dict(
    gene_name=['TP53', 'BRCA1', 'EGFR', 'MYC', 'KRAS'],
    chromosome=[17, 17, 7, 8, 12],
    start_position=[7668402, 43044294, 55019017, 128748315, 25358180],
    end_position=[7687550, 43125483, 55211628, 128753680, 25380284],
    expression_sample1=[1500, 850, 3200, 4500, 1200],
    expression_sample2=[1650, 920, 2800, 4100, 1350],
    expression_sample3=[1420, 780, 3500, 4900, 1100],
)

genes_df = pd.DataFrame(data=data)
print("Gene expression DataFrame:", genes_df, sep="\n")

# info() prints its summary itself, so it is not wrapped in print().
print("\nDataFrame info:")
genes_df.info()


In [None]:
# Accessing data
# Selecting a single column by name.
gene_names = genes_df['gene_name']
print("\nGene names column:", gene_names, sep="\n")

# Selecting several columns at once by passing a list of names.
wanted_columns = ['gene_name', 'chromosome']
print("\nGene names and chromosomes:", genes_df[wanted_columns], sep="\n")

# Kept as the final bare expression so a notebook renders it as the cell output.
print("\nFirst 3 rows:")
genes_df.head(n=3)

In [None]:
# Filtering: keep only the rows whose sample-1 expression exceeds 2000
sample1_is_high = genes_df['expression_sample1'] > 2000
high_exp_genes = genes_df[sample1_is_high]
print("\nGenes with expression > 2000 in sample 1:", high_exp_genes, sep="\n")

# Derived columns: the row-wise mean across the three sample columns, and
# the difference between end and start position.
sample_columns = ['expression_sample1', 'expression_sample2', 'expression_sample3']
genes_df['expression_mean'] = genes_df[sample_columns].mean(axis=1)
genes_df['length'] = genes_df['end_position'] - genes_df['start_position']
print("\nDataFrame with new columns:", genes_df, sep="\n")

# Basic statistics

In [None]:



# Summary statistics for the DataFrame
summary = genes_df.describe()
print("\nDescriptive statistics:", summary, sep="\n")

# Grouping and aggregation: group the rows by chromosome, then average the
# expression_mean column within each group.
mean_by_chromosome = genes_df.groupby('chromosome')['expression_mean'].mean()
print("\nMean expression by chromosome:", mean_by_chromosome, sep="\n")

# Working with missing data (common in bioinformatics)
# Create a DataFrame in which each gene column has one missing (NaN) value
data_with_nan = {
    'sample_id': ['S1', 'S2', 'S3', 'S4', 'S5'],
    'gene_A': [10.2, 15.1, np.nan, 12.5, 18.3],
    'gene_B': [5.5, np.nan, 7.8, 6.2, 9.1],
    'gene_C': [22.1, 18.5, 20.3, np.nan, 25.0]
}

nan_df = pd.DataFrame(data_with_nan)
print("\nDataFrame with missing values:")
print(nan_df)

print("\nCount missing values per column:")
print(nan_df.isnull().sum())

# Fill each missing value with the mean of its own column.
# numeric_only=True is required because 'sample_id' holds strings: without it,
# pandas >= 2.0 also tries to average that column and raises TypeError.
column_means = nan_df.mean(numeric_only=True)
filled_df = nan_df.fillna(column_means)
print("\nDataFrame with filled missing values:")
print(filled_df)


# Save files

In [None]:
# Save the DataFrame as a CSV file, leaving out the row index
output_path = 'gene_data_analysis.csv'
genes_df.to_csv(output_path, index=False)