In [None]:
# Week 1: Python Basics for Biologists (Simplified)

# This notebook introduces basic Python types, functions, classes,
# and essential libraries using simple biology-themed examples.

# ---
# Section 1: Types & Variables

# Numbers
num_genes = 3                # int
gene_length_bp = 1500        # int: no decimal point (1500.0 would be a float)
gc_fraction = 0.49           # float
is_protein_coding = True     # bool

# Strings
gene_name = "lacZ"
species = "E. coli"
description = f"{gene_name} from {species}"   # f-string fills in both variables

# Lists
genes = ["lacZ", "trpA", "recA"]
genes.append("araC")         # append adds one item to the end of the list

# Dictionaries
gene_info = {
    "name": "lacZ",
    "organism": "E. coli",
    "length": 1500
}
gene_info["function"] = "beta-galactosidase"   # assigning to a new key adds an entry

# Type checking
print(type(description))          # <class 'str'>
print(isinstance(gene_length_bp, float))  # False: gene_length_bp is an int

# ---
# Section 2: Functions

def gc_content(dna):
    """Return the fraction of bases in ``dna`` that are G or C.

    Args:
        dna: DNA sequence as a string. Upper- and lower-case letters are
            both counted.

    Returns:
        A float between 0.0 and 1.0. An empty sequence returns 0.0.
    """
    if not dna:
        # An empty sequence has no bases; avoid dividing by zero.
        return 0.0
    sequence = dna.upper()  # so "g"/"c" are counted as well as "G"/"C"
    gc_total = sequence.count("G") + sequence.count("C")
    return gc_total / len(dna)

def describe_gene(name, organism):
    """Build a one-sentence description of a gene.

    Args:
        name: Gene name, e.g. "recA".
        organism: Organism the gene is found in, e.g. "E. coli".

    Returns:
        The sentence "<name> is a gene found in <organism>."
    """
    sentence = f"{name} is a gene found in {organism}."
    return sentence

# Call each helper once and print what it returns.
example_gc = gc_content("ATGCGCGTAA")
print(example_gc)
example_sentence = describe_gene("recA", "E. coli")
print(example_sentence)

# ---
# Section 3: Classes

class SimpleGene:
    """Minimal record for a gene: its name, organism and function."""

    def __init__(self, name, organism, function):
        """Store the three descriptive fields on the instance."""
        self.name, self.organism, self.function = name, organism, function

    def summary(self):
        """Return "<name> (<organism>) - <function>" as a single string."""
        text = f"{self.name} ({self.organism}) - {self.function}"
        return text

# Build one gene record and print its one-line summary.
my_gene = SimpleGene(
    name="araC",
    organism="E. coli",
    function="arabinose regulator",
)
print(my_gene.summary())

# ---
# Section 4: NumPy Basics

import numpy as np

# Gene lengths in base pairs, held in a 1-D array
gene_lengths = np.array([900, 1200, 1500, 1800])

# Indexing picks out one element; adding a scalar broadcasts over every element
first_length = gene_lengths[0]
print(first_length)
shifted_lengths = gene_lengths + 100
print(shifted_lengths)

# Matrix example: one row per gene, columns are [length, GC content]
gene_data = np.array([
    [900, 0.45],
    [1200, 0.50],
    [1500, 0.48],
    [1800, 0.52]
])

# Basic operations: axis=0 averages down the rows, giving one mean per column
column_means = gene_data.mean(axis=0)
print(column_means)

# ---
# Section 5: pandas Basics

import pandas as pd

# Simple gene table: each key becomes a column, each list position a row
data = {
    "gene": ["lacZ", "trpA", "recA", "araC"],
    "length": [1023, 891, 1254, 978],
    "gc_content": [0.49, 0.46, 0.52, 0.48],
}
df = pd.DataFrame(data)

# View one column
gene_column = df["gene"]
print(gene_column)

# Filter genes with GC content > 0.48 using a boolean mask
high_gc_mask = df["gc_content"] > 0.48
print(df[high_gc_mask])

# Summary stats
summary_stats = df.describe()
print(summary_stats)

# ---
# End of Week 1 Notebook (Simplified Biology Version)


In [1]:
# ---
# Section 1: Types & Variables

# Numbers
num_genes = 3                # int
gene_length_bp = 1500        # int: no decimal point (1500.0 would be a float)
gc_fraction = 0.49           # float
is_protein_coding = True     # bool

# Strings
gene_name = "lacZ"
species = "E. coli"
description = f"{gene_name} from {species}"   # f-string fills in both variables

# Lists
genes = ["lacZ", "trpA", "recA"]
genes.append("araC")         # append adds one item to the end of the list

# Dictionaries
gene_info = {
    "name": "lacZ",
    "organism": "E. coli",
    "length": 1500
}
gene_info["function"] = "beta-galactosidase"   # assigning to a new key adds an entry

# Type checking
print(type(description))          # <class 'str'>
print(isinstance(gene_length_bp, float))  # False: gene_length_bp is an int


<class 'str'>
False


In [2]:
# Exercise:
'''
Here's a short DNA sequence:
ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT
Write a program that will print out the AT content of this DNA sequence. Hint: you
can use normal mathematical symbols like add (+), subtract (-), multiply (*), divide 
(/) and parentheses to carry out calculations on numbers in Python. 
'''

my_dna = "ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT"

# Count each base separately, then add the two counts together.
a_count = my_dna.count("A")
t_count = my_dna.count("T")
AT_count = a_count + t_count

# AT content = (number of A + number of T) / length of the sequence
at_content = AT_count / len(my_dna)
print("AT content of my_dna sequence is: " + str(at_content))

AT content of my_dna sequence is: 0.6851851851851852
