In [None]:
# 1. The Basics 
print("Hello, World!")


In [None]:
# Simple arithmetic operation
1+1

In [None]:
# Basic math operations
# The text before the comma is a label; print() puts a space between its arguments.
print("Addition:", 5 + 3)                    # 8
print("Subtraction:", 10 - 4)                # 6
print("Multiplication:", 6 * 7)              # 42
print("Division:", 20 / 5)                   # 4.0 ("/" gives a float even when the result is whole)
print("Order of operations:", (2 + 3) * 4)   # 20 (parentheses are evaluated first)


In [None]:
# Numbers
num_genes = 3            # int (whole number)
gene_length_bp = 1500    # int (whole number)
AT_content = 0.6         # float (number with a decimal point)
is_protein_coding = True # bool (either True or False)

# Strings
gene_name = "lacZ"
species = "E. coli"
dna = "ATGACCATGATTACGCCAAGCTAT"

# f-strings (formatted string literals) let you embed variables directly into strings using {}
description = f"{gene_name} from {species} with DNA sequence: {dna}"
print("Gene description:", description)


# Lists: ordered collections written with square brackets
genes = ["lacZ", "trpA", "recA"]
genes.append("araC")  # append() adds one item to the end of the existing list
print("Gene list:", genes)

# Dictionaries: key -> value pairs written with curly braces
gene_info = {
    "name": "lacZ",
    "organism": "E. coli",
    "length": 1500
}
gene_info["function"] = "beta-galactosidase"  # assigning to a new key adds a new entry
print("Gene info:", gene_info)


In [None]:
# Type checking
# (uses description, gene_length_bp and dna, which must already be defined by an earlier cell)
print(type(description))                  # type() reports what kind of object a value is
print(isinstance(gene_length_bp, float))  # False when gene_length_bp holds an int such as 1500
print(len(dna))                           # number of characters (bases) in the string

In [None]:
# Useful built-in string methods:
# - .count("A")         → count how many times a base appears
# - .find("ATG")        → find the first index of a start codon or motif
# - .upper() / .lower() → convert sequence to all uppercase/lowercase
# - .replace("A", "T")  → simulate mutations or base changes
# - .split("TAA")       → split sequence at a stop codon
# - .startswith("ATG")  → check if sequence starts with a start codon
# - .endswith("TAA")    → check if sequence ends with a stop codon
# Strings are immutable: these methods return a result and leave the original string unchanged.

# Using .count()
dna_sample = "ATGCGTAC"
print("Number of C bases:", dna_sample.count("C"))  # 2
print("Number of G bases:", dna_sample.count("G"))  # 2

In [None]:
# Splicing: In biology, splicing is the process of removing introns (non-coding regions)
# from pre-mRNA, leaving only the exons (coding regions) to be joined together.
# In this example, we remove the middle part of a DNA string to simulate that.

dna = "ATGAAATTTCCC"

# Remove the middle part ("AAATTT") and keep only the exons: "ATG" and "CCC"
# When using string[number1:number2], the start position is inclusive and the end position is exclusive
exon1 = dna[0:3]      # first 3 bases (ATG): indices 0, 1 and 2
exon2 = dna[-3:]      # last 3 bases (CCC): a negative start counts back from the end

# Combine the exons to simulate spliced mRNA
spliced = exon1 + exon2

print("Original DNA:", dna)
print("Spliced DNA: ", spliced)


# Exercise 1: Content of A and T in a DNA sequence

Here's a short DNA sequence:
ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT

Write a program that will print out the A and T content of this DNA sequence.
Hint: you can use normal mathematical symbols like add (+), subtract (-), multiply (*), divide (/), and parentheses to carry out calculations on numbers in Python.

# Exercise 2: Replace all instances of a restriction site in a DNA sequence

## Original DNA sequence
dna = "ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT"

Simulate a restriction enzyme that cuts at the site 'TAT'

# Exercise 3: Calculate the size of DNA fragments after restriction enzyme digestion
### Here's a short DNA sequence:
ACTGATCGATTACGTATAGTAGAATTCTATCATACATATATATCGATGCGTTCAT

The sequence contains a recognition site for the EcoRI restriction enzyme, which
cuts at the motif G*AATTC (the position of the cut is indicated by an asterisk). 
Write a program which will calculate the size of the two fragments that will be
produced when the DNA sequence is digested with EcoRI.

In [None]:
# Loops and Functions
# Functions let you organize and reuse code.
# Instead of writing the same loop every time, you can define it once and call it whenever needed—
# especially helpful when working with different datasets or calling the same logic multiple times.
# This cell starts with the loop itself; functions are defined in later cells.

# FOR loop: Runs a set number of times
# This will print numbers 1 to 5 (range(1, 6) starts at 1 and stops before 6)

for number in range(1, 6):
    print("Number:", number)

In [None]:
# WHILE loop: Runs as long as the condition is True
# This will print counts 1 through 5; the loop ends once count becomes 6

count = 1
while count <= 5:
    print("Count is:", count)
    count += 1  # without this update the condition would stay True and the loop would never end

# Comparison operators you can use in while loops:
# <   less than
# <=  less than or equal to
# >   greater than
# >=  greater than or equal to
# ==  equal to
# !=  not equal to

In [None]:
# Loop through a list
dna_sequences = [
    "ATGCGTA",
    "GGGTTTAAA",
    "TATAAGC",
]

# Loop through each DNA sequence in the list
# (seq takes the value of each list item in turn, in list order)
for seq in dna_sequences:
    print("Sequence:", seq)


In [None]:
# Function with return: hands a value back to the caller so it can be stored or reused
def double_with_return(x):
    """Return twice the value of x."""
    doubled = x * 2
    return doubled

result = double_with_return(x=5)
print("With return:", result)  # Output: 10

# Function without return: only performs an action (printing); nothing is handed back
def double_without_return(x):
    """Print twice the value of x; the function itself returns None."""
    doubled = x * 2
    print("Without return:", doubled)

result2 = double_without_return(x=5)  # Output: 10
print("Result2:", result2)  # Output: None (because it doesn't return anything)

In [None]:
# First list of sequences
sequences1 = [
    "ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT",
    "ATCGATCGATCGATCGATCGATCGA",
]

# Second list of sequences (e.g., new data from another experiment)
sequences2 = [
    "AAAATTTT",
    "GGGGCCCC",
]

# Define a function that prints every DNA sequence in a list
# 'sequence_list' is a parameter — a placeholder for whichever list is passed in when the function is called
def print_dna_sequences(sequence_list):
    """Print each sequence on its own labelled line, then a '---' separator line."""
    for dna_seq in sequence_list:
        print(f"DNA sequence: {dna_seq}")
    print("---")  # just to separate outputs

# Use the same function for both datasets
print_dna_sequences(sequences1)  # sequences1 is the argument passed into the parameter 'sequence_list'
print_dna_sequences(sequences2)  # same function, different input


# Exercise 4: Calculate and Print AT Content for Multiple DNA Sequences

* Step 1: Write a function called calculate_at_content that takes a single DNA string
        and returns its AT content (as a decimal).

* Step 2: Write a second function called print_at_for_all that takes a list of DNA sequences
        and uses calculate_at_content to print the AT content of each one.

In [None]:
# Try it with this list:
sequences = [
    "ATGCGTA",     # mix of A, T, G, C
    "AAAATTTT",    # 100% A and T
    "GGCCGGCC",    # 0% A and T
]


In [None]:
# Classes
# A class is a way to group data and actions together into one reusable unit.
# They are useful when you want to model things in the real world (like a dog, user, car, etc.)
# and give them properties (data) and behaviors (functions).

# Define the Dog class
class Dog:
    """A dog that has a name and an age, and that can bark."""

    # __init__ runs automatically whenever Dog(...) is called to build a new dog.
    # It records the data that belongs to this particular dog.
    def __init__(self, name, age):
        self.name = name    # this dog's own name
        self.age = age      # this dog's own age

    # Method (behavior): something every dog can do
    def bark(self):
        message = f"{self.name} says woof!"
        print(message)

# Why classes are useful:
# Instead of writing separate variables and functions for every dog,
# you can create many Dog objects from this one blueprint.

# Create a dog object using the Dog class
my_dog = Dog("Buddy", 3)

# Call a method (what the dog can do)
my_dog.bark()  # Output: Buddy says woof!

# Access the dog’s data (attributes)
print("Dog's name:", my_dog.name)
print("Dog's age:", my_dog.age)

# You can make another dog too — no extra code needed!
other_dog = Dog("Luna", 5)
other_dog.bark()  # Output: Luna says woof!


In [None]:
# Classes (Biology Example)
# A class lets you group together information (like a DNA sequence) and useful actions (like getting its length or AT content).
# This makes it easier to work with multiple sequences without repeating code.

# Define the DNA class
class DNA:
    """A DNA sequence together with simple measurements on it.

    The sequence is stored in uppercase in the ``sequence`` attribute.
    """

    # Set up the DNA object with a sequence
    def __init__(self, sequence):
        self.sequence = sequence.upper()  # Store the sequence in uppercase for consistency

    # Method: get the length of the sequence
    def get_length(self):
        """Return the number of bases (characters) in the sequence."""
        return len(self.sequence)

    # Method: calculate AT content
    def get_at_content(self):
        """Return the fraction of bases that are A or T, from 0.0 to 1.0.

        An empty sequence has no bases, so its AT content is reported as 0.0
        instead of failing with ZeroDivisionError.
        """
        length = len(self.sequence)
        if length == 0:
            return 0.0
        a_count = self.sequence.count("A")
        t_count = self.sequence.count("T")
        return (a_count + t_count) / length

# Create a DNA object
my_dna = DNA("ATGCGTAAATTGCA")

# Call methods on the object
print("DNA sequence:", my_dna.sequence)
print("Length:", my_dna.get_length())
print("AT content:", round(my_dna.get_at_content(), 2))


## Exercise 5: Gene Class

You are building a simple program to store and display gene information.  
Create a class called `Gene` that represents a gene and includes the following:

1. Store the gene's `name` and `sequence` when the object is created.
2. Automatically convert the sequence to uppercase.
3. Include a method called `print_info()` that prints:
   - The gene's name
   - The gene's sequence

### Test your class:
- Create a gene called `"lacZ"` with the sequence `"atgaccgtga"`
- Call `print_info()` to display the stored gene data

**Expected Output:**
- Gene name: lacZ
- Gene sequence: ATGACCGTGA

In [None]:
# Define the Gene class


## Exercise 6: Protein Length and Classification

**Case Study:**

You are working in a genomics lab analyzing newly discovered protein sequences.  
Your goal is to help categorize proteins based on their length to determine which may require further structural analysis.

Create a class called `Protein` that helps you manage this information.

Your class should:

1. Store the `name` and `amino acid sequence` when the object is created.
2. Automatically convert the sequence to uppercase for consistency.
3. Include a method `get_length()` that returns the length of the sequence.
4. Include a method `is_candidate_for_modeling()` that returns `True` if the protein is **longer than 100 amino acids**, indicating it's a good candidate for structural modeling.

### Test your class:
- Create a short protein named `"SignalPeptide"` with the sequence `"MKTLLV"`.
- Create a long protein named `"MyosinHeavyChain"` with 150 `"M"` amino acids.
- For each protein, print:
  - Its name
  - Its length
  - Whether it is a candidate for modeling


In [None]:
# PACKAGES
# A package is a collection of pre-written code that you can use in your own programs.
# You import a package to access its tools and functions instead of writing them from scratch.

# NUMPY
# NumPy (short for "Numerical Python") is a package that helps you work with numbers and arrays efficiently.

# In case it's not installed by default you can install the package here using the following
# !pip install numpy

# Import the numpy package and give it a short name (np is the standard nickname)
import numpy as np

# Create a simple NumPy array (like a list, but better for math)
numbers = np.array([1, 2, 3, 4, 5])

# Print the array
print("Array:", numbers)

# Check the type of object (should be numpy.ndarray)
print("Type:", type(numbers))

# Check the shape: a tuple with the size of each dimension — (5,) here, i.e. one dimension of 5 elements
print("Shape:", numbers.shape)

# Perform basic math operations on the array (applied to every element at once)
print("Add 10 to each element:", numbers + 10)
print("Multiply each element by 2:", numbers * 2)

# Calculate the average (mean) of the array: (1 + 2 + 3 + 4 + 5) / 5 = 3.0
print("Mean of the array:", numbers.mean())

# Calculate the sum of the array: 1 + 2 + 3 + 4 + 5 = 15
print("Sum of the array:", numbers.sum())

In [None]:
# Slicing with NumPy
# Indexing is written matrix[rows, columns]; a bare ":" means "all of them".

# Create a 3x3 array (like a table)
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print("3x3 Array (Matrix):")
print(matrix)
print("---")

# You can slice NumPy arrays just like regular lists, but with more powerful features.
# For example, to get the first two rows and all columns:
sliced_matrix = matrix[:2, :]  # Get the first two rows: [1 2 3] and [4 5 6]
print("Sliced Matrix (first two rows):")
print(sliced_matrix)
print("---")

# You can also get specific rows and columns:
specific_rows = matrix[[0, 2], :]  # Get the first and third rows: [1 2 3] and [7 8 9]
print("Specific Rows (first and third):")
print(specific_rows)
print("---")

# You can also get specific columns:
specific_columns = matrix[:, [0, 2]]  # Get the first and third columns: rows become [1 3], [4 6], [7 9]
print("Specific Columns (first and third):")
print(specific_columns)
print("---")

# You can also perform operations on specific rows or columns:
row_sum = matrix[0, :].sum()  # Sum of the first row: 1 + 2 + 3 = 6
print("Sum of the first row:", row_sum)


In [None]:
# Gene Expression Data with NumPy

# Imagine we have expression levels for 3 genes across 4 samples
# Rows = genes, Columns = samples
gene_names = ["Gene A", "Gene B", "Gene C"]  # label of each row, in row order
expression_data = np.array([
    [5.2, 3.8, 4.1, 6.0],  # Gene A
    [7.1, 6.3, 6.8, 7.5],  # Gene B
    [2.0, 2.4, 2.1, 1.9]   # Gene C
])

print("Gene Expression Matrix (Genes x Samples):")
print(expression_data)
print("---")

# Get expression levels for Gene A (row 0)
gene_a = expression_data[0, :]
print("Gene A expression across samples:", gene_a)
print("---")

# Get expression levels for Sample 2 (column 1, because counting starts at 0)
sample_2 = expression_data[:, 1]
print("Expression levels for Sample 2:", sample_2)
print("---")

# Calculate average expression per gene
avg_per_gene = expression_data.mean(axis=1)  # axis=1 averages along each row -> one value per gene
print("Average expression per gene:", avg_per_gene)
print("---")

# Calculate average expression per sample
avg_per_sample = expression_data.mean(axis=0)  # axis=0 averages down each column -> one value per sample
print("Average expression per sample:", avg_per_sample)
print("---")

# Find which gene had the highest expression in Sample 4
sample_4 = expression_data[:, 3]
highest_gene_index = np.argmax(sample_4)  # 0-based row index of the largest value
# Translate the row index into the gene's label. Printing the raw index would
# show "Gene 1", which matches none of the Gene A/B/C labels used above.
highest_gene_name = gene_names[highest_gene_index]
print("Gene with highest expression in Sample 4 is", highest_gene_name)


## Exercise 7: Gene Expression Analysis with NumPy

You are analyzing gene expression levels for 3 genes across 5 different samples.  
Each row in the matrix represents a gene (Gene A, Gene B, Gene C), and each column represents a sample (Sample 1 to Sample 5).

The matrix below contains the expression levels:
```markdown
| Gene   | Sample 1 | Sample 2 | Sample 3 | Sample 4 | Sample 5 |
|--------|----------|----------|----------|----------|----------|
| Gene A |   8.1    |   7.4    |   7.8    |   8.0    |   7.9    |
| Gene B |   5.2    |   5.4    |   5.1    |   5.3    |   5.0    |
| Gene C |   3.5    |   3.7    |   3.6    |   3.4    |   3.3    |
```

### Your Tasks:

1. Print the expression levels for **Gene B**.
2. Print the expression levels for **Sample 3**.
3. Calculate the **average expression per gene**.
4. Identify which gene had the **highest expression in Sample 1**.
5. *(Optional)* Print the full expression matrix.

> Hint: Use array slicing, `.mean()`, and `np.argmax()`.

In [None]:
# !pip install pandas

In [None]:
# PANDAS
# Pandas is a Python package used for working with data in tables (like spreadsheets).
# It makes it easy to load, view, filter, and analyze structured data.

# Import the pandas package
import pandas as pd

# -------------------------------------
# Create a simple table using a dictionary
# Each key becomes a column, and the values are rows
# (every list has 3 items, so the table has 3 rows)
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 22],
    "City": ["New York", "Los Angeles", "Chicago"]
}

# Convert the dictionary into a pandas DataFrame (like a table)
df = pd.DataFrame(data)

# -------------------------------------
# View the entire DataFrame
print("Full DataFrame:")
print(df)

# -------------------------------------
# View just the first few rows (good for large tables)
print("\nFirst 2 rows:")
print(df.head(2))

# -------------------------------------
# Access a single column
print("\nAges:")
print(df["Age"])

# -------------------------------------
# Filter the data: get only rows where Age is greater than 23
# df["Age"] > 23 produces one True/False per row; df[...] keeps the True rows (Alice, 25 and Bob, 30)
print("\nPeople older than 23:")
print(df[df["Age"] > 23])

# -------------------------------------
# Add a new column
# The comparison is made row by row; all three ages here are at least 18
df["Is_Adult"] = df["Age"] >= 18
print("\nDataFrame with new column:")
print(df)


## Exercise 8: Analyze Gene Expression Data with Pandas

You are given a small dataset showing gene expression levels for three genes across three different samples.  
Each row represents a gene, and each column represents a sample.

### Your tasks:

1. Create a pandas DataFrame using the following data:

| Gene  | Sample_1 | Sample_2 | Sample_3 |
|-------|----------|----------|----------|
| TP53  | 8.2      | 7.9      | 8.5      |
| BRCA1 | 6.5      | 6.8      | 6.2      |
| EGFR  | 9.1      | 9.0      | 9.3      |

2. Print the full DataFrame.

3. Calculate the **average expression** across all three samples for each gene.

4. Add a new column called `Average_Expression` that stores each gene's average.

5. Filter the table to show only the genes with an average expression **greater than 8.0**.

> 💡 Use `df.mean(axis=1)` to calculate averages across columns, and `df[df["column"] > value]` to filter.


In [None]:
import pandas as pd

# Step 1: Create the data dictionary
data = {
    "Gene": ["TP53", "BRCA1", "EGFR"],
    "Sample_1": [8.2, 6.5, 9.1],
    "Sample_2": [7.9, 6.8, 9.0],
    "Sample_3": [8.5, 6.2, 9.3]
}




In [None]:
# PANDAS: LOADING & CREATING TABLES

import pandas as pd

# -------------------------------------
# 1. Creating a DataFrame from scratch (manually)

# This is useful for small or simulated datasets
data = {
    "Student": ["Alice", "Bob", "Charlie"],
    "Score": [85, 92, 78],
    "Passed": [True, True, False]
}

df_manual = pd.DataFrame(data)

print("Created DataFrame:")
print(df_manual)


In [None]:
# -------------------------------------
# 2. Loading a DataFrame from a CSV file

# Make sure you have a CSV file in your working directory, or provide the full path.
# Example CSV file content:
# Name,Age,City
# Alice,25,New York
# Bob,30,Los Angeles
# Charlie,22,Chicago
# NOTE(review): this cell needs "example_data.csv" to exist before it is run —
# TODO confirm the file is distributed together with this notebook.

# Load the CSV file into a pandas DataFrame
# (pd must already be imported by an earlier cell)
df_loaded = pd.read_csv("example_data.csv")  # Replace with your actual file name

print("\nLoaded DataFrame from CSV:")
print(df_loaded)

# You can use .head() to preview just the first few rows
print("\nFirst 2 rows:")
print(df_loaded.head(2))


## Exercise 9: NASA Study - How Do Plants Adapt to Spaceflight?

In real NASA experiments aboard the **International Space Station (ISS)**, scientists are exploring how plants respond to spaceflight conditions. A key question they’re now asking is:

> Are the genetic responses we observe in plants during spaceflight *actually necessary* for survival—or are they just costly side effects of stress?

### The CARA Experiment

In this study, researchers grew Arabidopsis plants in space and measured their gene expression patterns to examine how different genotypes respond to the spaceflight environment. They looked at:

- Two natural plant strains: **Col-0** and **WS**
- A genetically modified version of Col-0 missing the **PhyD gene** (called the *PhyD mutant*)

These plants were grown under two types of ISS environments:
- **GC (Growth Chamber)** – a controlled, Earth-like light setting
- **FLT (Flight Habitat)** – a more ambient, variable ISS setting

### What They Found

- In **light conditions**, both the **WS genotype** and the **PhyD mutant** showed **simpler transcriptome responses** (fewer genes turning on/off), compared to Col-0.
- This suggests that tweaking a single gene—or using a different natural genotype—might actually help the plant adapt **more efficiently** by reducing the "cost" of adaptation.
- In **dark conditions**, however, WS showed a **larger gene expression response**, suggesting that **light availability** plays a major role in how plants adapt.

### Task

You’ll work with **real RNAseq data** from this experiment (unnormalized counts), and explore:

- Which genes are expressed most in spaceflight
- How expression changes across genotypes and environments
- Whether simpler gene expression patterns might signal better adaptation

By filtering, plotting, and comparing data, you’ll contribute to a deeper understanding of **genetic adaptation in space—a real challenge for future space farming and long-term missions.**


In [None]:
import requests
import pandas as pd

# Dataset and URL
dataset_id = "OSD-120"
base_url = "https://visualization.osdr.nasa.gov/biodata/api/v2/query/data/"
# "%20" is a URL-encoded space inside the query parameters
query_url = f"{base_url}?id.accession={dataset_id}&file.data%20type=unnormalized%20counts"

# Step 1: Fetch data and save to file
# A timeout makes the cell fail with a clear error instead of waiting forever
# if the server never answers.
response = requests.get(query_url, timeout=60)
response.raise_for_status()  # stop on an HTTP error instead of saving an error page as data

filename = f"{dataset_id}_unnormalized_counts.csv"
# Write with an explicit encoding so the file matches what pd.read_csv reads
# by default (UTF-8), whatever the operating system's default encoding is.
with open(filename, "w", encoding="utf-8") as f:
    f.write(response.text)

# Step 2: Read the saved file using pandas
# index_col=0 uses the first column of the file as the row labels
df = pd.read_csv(filename, index_col=0)

# Show a quick preview
print("Data shape:", df.shape)
# print(df.head())

def clean_column_name(name):
    """Shorten a column label to its last path segment without GSM*/Day* pieces.

    Everything up to and including the last "/" is dropped. The remainder is
    split on "_", every piece starting with "GSM" or "Day" is removed, and the
    remaining pieces are joined with "_" again.
    """
    # rpartition leaves the whole string in the last slot when there is no "/"
    _, _, tail = name.rpartition("/")
    kept = []
    for token in tail.split("_"):
        if token.startswith(("GSM", "Day")):
            continue  # drop the GSM ID and day pieces
        kept.append(token)
    return "_".join(kept)

# Apply to all columns except the index (which is usually gene names)
# (df.columns holds only the column labels; the row index is handled separately below)
df.columns = [clean_column_name(col) for col in df.columns]

# Remove the prefix from the index values
# The pattern captures the run of non-"/" characters at the end of each label,
# i.e. the text after the last slash (or the whole label when it has no slash).
df.index = df.index.to_series().str.extract(r'([^/]+)$')[0]

# Remove the name from the index entirely
df.index.name = None

# Preview the first five cleaned row labels
print(df.index[:5])


In [None]:
# 0. Show the first 5 rows


In [None]:
# 1. Total counts per sample (column)
# Goal: See which sample has the most RNAseq reads overall.



In [None]:
# 2. Total counts per gene (row)
# Goal: Find the most highly expressed genes across all conditions.



In [None]:
# 3. Basic filtering: Genes with zero counts in all samples
# Goal: Remove unexpressed genes.



## Introduction to Matplotlib

**Matplotlib** is a powerful Python library used to create a wide variety of plots and charts. It's especially useful when working with tables of data (like Pandas DataFrames) because it helps you **visualize patterns, trends, and outliers**.

We typically use `matplotlib.pyplot`, which is a module in the library that mimics MATLAB-style plotting.

### Example Use Case

Imagine you have test scores for a student across different subjects:

```python
import matplotlib.pyplot as plt

scores = [90, 85, 70]
subjects = ['Math', 'Science', 'English']

# Plot as a bar chart
plt.bar(subjects, scores)

# Add a title
plt.title("Test Scores")

# Add a y-axis label
plt.ylabel("Score")

# Make the plot look nice
plt.tight_layout()
plt.show()
```


In [None]:
# 4. Plot expression of a specific gene (e.g., AT1G01010) across samples
# Goal: Visualize how expression changes across replicates and genotypes.



In [None]:
# 5. Compare mean expression of a gene across genotypes (Col-0 vs WS vs PhyD)
# Goal: Show how one gene responds in different genotypes.




In [None]:
# 6. Count how many genes are highly expressed in each ISS environment: GC vs FLT
# Goal: Learn slicing and boolean indexing.


