# Pandas Fundamentals


In [None]:
import pandas as pd

## 1. Introduction to Pandas Objects

### 1.1 Series
A one-dimensional labeled array that can hold data of any type.


In [None]:
# Build a Series of five odd numbers, labeled with the letters a through e
s = pd.Series(
    data=[1, 3, 5, 7, 9],
    index=list("abcde"),
)
s

### 1.2 DataFrame
A two-dimensional labeled data structure with columns of potentially different types.

In [None]:
# Build a DataFrame from a mapping of column name -> list of values.
# The last record is deliberately incomplete so later cells have missing data to work with.
data = dict(
    name=["John", "Emma", "Alex", "Sarah", "Jim"],
    gender=["M", "F", "M", "F", None],
    age=[25, 28, 32, 30, None],
    city=["New York", "London", "Paris", "Tokyo", None],
)
df = pd.DataFrame(data=data)

In [None]:
# Preview the first five rows
df.head(n=5)

## 2. Loading and Writing Data

### 2.1 Reading Data

In [None]:
# CSV files: read_csv parses the file into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df_csv = pd.read_csv("data/data.csv")

In [None]:
# JSON files: read_json parses the file into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df_json = pd.read_json("data/data.json")

### 2.2 Writing Data

In [None]:
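# Save to CSV (left commented out so running the notebook does not write a file;
# uncomment to run. index=False leaves the row index out of the output.)
# df.to_csv('data/output.csv', index=False)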

In [None]:
# Save to JSON (left commented out so running the notebook does not write a file;
# uncomment to run.)
# df.to_json('data/output.json')

## 3. Basic DataFrame Operations

### 3.1 Accessing Data

In [None]:
# Column access: indexing with a single column label returns that column as a Series
names = df["name"]
names

In [None]:
# Multiple columns: indexing with a list of labels returns a DataFrame
# containing only those columns, in the order listed
df[["name", "age"]]

In [None]:
# Row access by position: iloc is integer-position based, so 0 selects the first row
df.iloc[0]

In [None]:
# Row access by label: loc looks up the row whose index label is 2.
# df was built without an explicit index, so its labels are the default 0..4
# and label 2 happens to be the third row.
df.loc[2]

### 3.2 Data Selection and Filtering

In [None]:
# Keep only the rows whose age is at least 18.
# Rows with a missing age compare as False and are therefore excluded.
df[df["age"].ge(18)]

In [None]:
# Combine two boolean masks with & (element-wise AND):
# age of at least 25 and a city equal to "London"
df[df["age"].ge(25) & df["city"].eq("London")]

In [None]:
# Membership filter: keep rows whose city is one of the listed values
df.loc[df["city"].isin(["London", "Paris"])]

## 4. Data Cleaning and Transformation

### 4.1 Handling Missing Values

In [None]:
# Count the missing values in each column
df.isnull().sum(axis=0)

In [None]:
# Replace every missing value with zero (returns a new DataFrame; df itself is unchanged)
df.fillna(value=0)

In [None]:
# Replace missing ages with the mean of the known ages (mean() skips missing values)
df["age"].fillna(value=df["age"].mean())

In [None]:
df.ffill()  # forward fill: replace each missing value with the last valid value above it

In [None]:
df.bfill()  # backward fill: replace each missing value with the next valid value below it

In [None]:
# Drop missing values.
# Both results are printed explicitly: a cell only displays the value of its
# last expression, so a bare first call would be computed and then discarded.
print(df.dropna())  # drops every row that contains at least one missing value
print("\n")
print(df.dropna(how="all"))  # drops only the rows in which every value is missing

### 4.2 Data Transformation

In [None]:
# Apply a function to every value of a column with Series.apply.
# For simple arithmetic like this, the vectorized form df["age"] ** 2 gives the same result.
df["age_squared"] = df["age"].apply(lambda age: age**2)

In [None]:
# Rename the city "London" to "UK-London" using an old-value -> new-value mapping
df["city"] = df["city"].replace({"London": "UK-London"})

In [None]:
# String operations: the .str accessor applies string methods element-wise;
# here every name is converted to upper case
df["name"] = df["name"].str.upper()

In [None]:
# Type conversion: cast the age column to 64-bit floating point
df["age"] = df["age"].astype("float64")

In [None]:
# Preview the first five rows after the transformations above
df.head(n=5)

## 5. Data Analysis Operations

### 5.1 Aggregation

In [None]:
# Basic statistics: count, mean, std, min, quartiles and max.
# With the default arguments only the numeric columns are summarized.
df.describe()

In [None]:
# Group the rows by city and summarize age within each group
# (mean, minimum, maximum and number of non-missing values)
city_stats = df.groupby(by="city").agg(
    {"age": ["mean", "min", "max", "count"]},
)

city_stats

### 5.2 Merging and Joining

In [None]:
# df1 holds one row per employee: id, name, department and salary
df1 = pd.DataFrame(
    data=dict(
        id=[1, 2, 3, 4, 5],
        name=["John", "Emma", "Alex", "Sarah", "Mike"],
        department=["IT", "HR", "IT", "Finance", "HR"],
        salary=[60000, 55000, 65000, 58000, 52000],
    )
)

# df2 holds project assignments keyed by employee id.
# Ids 6 and 7 have no match in df1, and ids 4 and 5 have no match here.
df2 = pd.DataFrame(
    data=dict(
        id=[1, 2, 3, 6, 7],
        project=["Alpha", "Beta", "Gamma", "Delta", "Epsilon"],
        location=["NY", "SF", "NY", "LA", "CH"],
        bonus=[5000, 3000, 4000, 4500, 3500],
    )
)

# Show both frames, each under its own heading
print("DataFrame 1 (Employee Information):", df1, sep="\n")
print("\nDataFrame 2 (Project Information):", df2, sep="\n")

In [None]:
# Left join on id: keep every row of df1 and attach the matching df2 columns;
# employees without a match in df2 get missing values in those columns
df_merged = df1.merge(df2, on="id", how="left")
print(df_merged)

In [None]:
# Concatenate DataFrames
# Vertically: stack the rows of df2 underneath the rows of df1
df_combined = pd.concat(objs=[df1, df2], axis="index")
print(df_combined, end="\n\n\n")
# Horizontally: place the columns of df2 next to the columns of df1, aligned on the row index
df_combined = pd.concat(objs=[df1, df2], axis="columns")
print(df_combined)

## 6. Practical Exercise

> **Exercise 1:** Create a data analysis pipeline that:
1. Loads a dataset
2. Cleans missing values
3. Performs basic transformations
4. Creates summary statistics
5. Exports the results

In [None]:
import pandas as pd

# 1. Load data: read the sales CSV (path relative to the working directory)
# and preview the first five rows
df = pd.read_csv("data/sales_data.csv")
df.head(n=5)

In [None]:
# 2. Clean data
# Fill missing revenue with the column mean; this runs before any rows are
# removed, so the mean is taken over all rows that have a revenue value
df["revenue"] = df["revenue"].fillna(value=df["revenue"].mean())
# Discard the rows that have no customer_id
df = df.dropna(axis="index", subset=["customer_id"])
df.head(n=5)

In [None]:
# 3. Transform data
# Parse the date column into datetime values so the .dt accessor can be used on it
df["date"] = pd.to_datetime(df["date"])
# Derive the calendar month and the year as separate columns for grouping
df["month"] = df["date"].dt.month
df["year"] = df["date"].dt.year

In [None]:
# 4. Analysis
# For each (year, month) group: total, average and count of revenue, plus the
# number of distinct customers; all results rounded to two decimals
monthly_stats = df.groupby(by=["year", "month"]).agg(
    {
        "revenue": ["sum", "mean", "count"],
        "customer_id": "nunique",
    }
).round(decimals=2)

monthly_stats.head(n=5)

In [None]:
# 5. Export results
# monthly_stats is indexed by (year, month) and has two-level column labels from
# the dict-based aggregation, so the CSV gets the group keys as its leading
# columns and more than one header row.
monthly_stats.to_csv("data/sales_analysis.csv")

## Additional Resources
- [Pandas Documentation](https://pandas.pydata.org/docs/)
- [10 Minutes to Pandas](https://pandas.pydata.org/docs/user_guide/10min.html)
- [Pandas Cheat Sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)

## Next Steps
- Practice with real-world datasets
- Combine Pandas with visualization libraries
- Learn advanced Pandas operations
- Apply Pandas in machine learning preprocessing