# Advanced Pandas Learning Guide

In [1]:
import pandas as pd
import numpy as np

## 1. Creating and Exploring DataFrames

In [2]:
# Assemble a small demo table: one row per gene, holding its expression level
# and a Yes/No mutation flag.
print("--- Creating a DataFrame ---")
column_names = ("Gene", "Expression Level", "Mutation")
gene_rows = [
    ("BRCA1", 23.4, "Yes"),
    ("TP53", 45.2, "No"),
    ("MYC", 10.1, "Yes"),
    ("EGFR", 30.5, "No"),
    ("VEGF", 18.3, "Yes"),
]
# Pivot the row tuples into the column -> list-of-values mapping the frame is built from.
data = {name: list(values) for name, values in zip(column_names, zip(*gene_rows))}
df = pd.DataFrame(data)
print(df)

--- Creating a DataFrame ---
    Gene  Expression Level Mutation
0  BRCA1              23.4      Yes
1   TP53              45.2       No
2    MYC              10.1      Yes
3   EGFR              30.5       No
4   VEGF              18.3      Yes


## 2. Handling Missing Data

In [4]:
print("\n--- Handling Missing Data ---")
# Knock out one measurement to simulate a missing value. This edits `df` itself,
# so every later cell that reads `df` sees the NaN in row 2.
df.loc[2, "Expression Level"] = np.nan
print("DataFrame with Missing Value:\n", df)
# Impute with the column mean (mean() skips NaN). Passing a {column: value}
# mapping restricts the fill to "Expression Level"; a bare scalar would also
# write the numeric mean into missing cells of text columns such as "Gene"
# or "Mutation".
mean_expression = df["Expression Level"].mean()
df_filled = df.fillna({"Expression Level": mean_expression})
print("DataFrame after Filling Missing Value:\n", df_filled)


--- Handling Missing Data ---
DataFrame with Missing Value:
     Gene  Expression Level Mutation
0  BRCA1              23.4      Yes
1   TP53              45.2       No
2    MYC               NaN      Yes
3   EGFR              30.5       No
4   VEGF              18.3      Yes
DataFrame after Filling Missing Value:
     Gene  Expression Level Mutation
0  BRCA1             23.40      Yes
1   TP53             45.20       No
2    MYC             29.35      Yes
3   EGFR             30.50       No
4   VEGF             18.30      Yes


## 3. Filtering and Querying Data

In [5]:
print("\n--- Filtering Data ---")
# Boolean mask of the rows whose expression level exceeds 20; a missing (NaN)
# level compares as False, so that row is left out.
is_high = df["Expression Level"].gt(20)
high_expression = df.loc[is_high]
print("Genes with High Expression:\n", high_expression)


--- Filtering Data ---
Genes with High Expression:
     Gene  Expression Level Mutation
0  BRCA1              23.4      Yes
1   TP53              45.2       No
3   EGFR              30.5       No


## 4. Grouping and Aggregation

In [6]:
print("\n--- Grouping and Aggregation ---")
# Average expression level for each mutation status; missing levels are
# skipped by mean().
expression_levels = df["Expression Level"]
grouped = expression_levels.groupby(df["Mutation"]).mean()
print("Mean Expression Level by Mutation Status:\n", grouped)


--- Grouping and Aggregation ---
Mean Expression Level by Mutation Status:
 Mutation
No     37.85
Yes    20.85
Name: Expression Level, dtype: float64


## 5. Merging and Joining DataFrames

In [7]:
print("\n--- Merging DataFrames ---")
# Lookup table: one mutation type per gene.
mutation_type_by_gene = {
    "BRCA1": "Missense",
    "TP53": "Nonsense",
    "MYC": "Frameshift",
    "EGFR": "Silent",
    "VEGF": "Insertion",
}
mutation_info = pd.DataFrame(
    list(mutation_type_by_gene.items()),
    columns=["Gene", "Mutation Type"],
)
# Left join: keep every row of `df` and attach the mutation type that matches
# on the "Gene" column.
merged_df = pd.merge(df, mutation_info, how="left", on="Gene")
print("Merged DataFrame:\n", merged_df)


--- Merging DataFrames ---
Merged DataFrame:
     Gene  Expression Level Mutation Mutation Type
0  BRCA1              23.4      Yes      Missense
1   TP53              45.2       No      Nonsense
2    MYC               NaN      Yes    Frameshift
3   EGFR              30.5       No        Silent
4   VEGF              18.3      Yes     Insertion


## 6. Applying Functions with Apply

In [9]:
print("\n--- Applying Functions ---")
# Compute the column maximum once, up front. Calling .max() inside the lambda
# would rescan the whole column for every row.
max_expression = df["Expression Level"].max()
# Scale each level by the largest one, so the top gene maps to 1.0 and a
# missing level stays NaN. For plain arithmetic like this,
# `df["Expression Level"] / max_expression` gives the same result without
# apply; apply is kept here because it is what this section demonstrates.
df["Expression Normalized"] = df["Expression Level"].apply(lambda x: x / max_expression)
print("DataFrame with Normalized Expression Levels:\n", df)


--- Applying Functions ---
DataFrame with Normalized Expression Levels:
     Gene  Expression Level Mutation  Expression Normalized
0  BRCA1              23.4      Yes               0.517699
1   TP53              45.2       No               1.000000
2    MYC               NaN      Yes                    NaN
3   EGFR              30.5       No               0.674779
4   VEGF              18.3      Yes               0.404867


## 7. Bioinformatics Example: Analyzing Genomic Data

In [11]:
print("\n--- Bioinformatics Example: Analyzing Genomic Data ---")
# One row per sample: the chromosome and gene measured, a Yes/No mutation
# flag, and the expression value.
sample_fields = ("Sample", "Chromosome", "Gene", "Mutation", "Expression")
sample_rows = [
    ("S1", 1, "BRCA1", "Yes", 12.5),
    ("S2", 2, "TP53", "No", 40.2),
    ("S3", 1, "MYC", "Yes", 18.7),
    ("S4", 3, "EGFR", "No", 32.1),
    ("S5", 2, "VEGF", "Yes", 25.5),
]
# Pivot the row tuples into the column -> list-of-values mapping the frame is built from.
genomic_data = {
    field: list(column) for field, column in zip(sample_fields, zip(*sample_rows))
}
genomic_df = pd.DataFrame(genomic_data)
# Average expression of the samples that fall on each chromosome.
chromosome_summary = genomic_df["Expression"].groupby(genomic_df["Chromosome"]).mean()
print("Mean Expression by Chromosome:\n", chromosome_summary)


--- Bioinformatics Example: Analyzing Genomic Data ---
Mean Expression by Chromosome:
 Chromosome
1    15.60
2    32.85
3    32.10
Name: Expression, dtype: float64
