In [None]:
"""
Pandas is a Python library for working with "tables of data"
(like excel spreadsheets but more powerful)

it has 2 main objects:
1. Series - A single column of data
2. DataFrame - A whole table (multiple columns)
"""

In [None]:
# Basic Statistics
print("\nSummary Statistics:")
print(df.describe())
print(f"\nWorkouts by Intensity:")
print(df['Intensity'].value_counts())


Summary Statistics:
        Duration                 Date       Pulse    Maxpulse    Calories
count  30.000000                   30   30.000000   30.000000   30.000000
mean   56.000000  2020-12-15 08:00:00  102.766667  130.066667  306.360000
min    30.000000  2020-12-01 00:00:00   90.000000  112.000000  195.100000
25%    60.000000  2020-12-08 06:00:00  100.000000  123.000000  251.350000
50%    60.000000  2020-12-14 12:00:00  102.500000  129.500000  295.600000
75%    60.000000  2020-12-22 12:00:00  105.750000  132.750000  343.975000
max    60.000000  2020-12-31 00:00:00  117.000000  175.000000  479.000000
std     7.812457                  NaN    6.339169   12.102816   65.155763

Workouts by Intensity:
Intensity
Medium    17
High       7
Low        6
Name: count, dtype: int64


In [None]:
# Creating Your First DataFrame

In [3]:
import pandas as pd

data_dict = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
    "City": ["New York", "Paris", "London"]
}

df_dict = pd.DataFrame(data_dict)
print(df_dict)

      Name  Age      City
0    Alice   25  New York
1      Bob   30     Paris
2  Charlie   35    London


In [None]:
data_list = [
    ["Apple", 1.20],
    ["Banana",0.50],
    ["Orange", 0.75]
]

df_list = pd.DataFrame(data_list, columns=["Fruit", "Price"])
print(df_list)

In [None]:
# Basic Operations

In [None]:
# Viewing Data
df_list.head(2)     # First 2 rows
df_list.tail(1)     # Last row
df_list.sample(3)   # Random 3 rows


In [None]:
# Getting Information
df_list.shape       # (rows, columns)
df_list.info()      # Data types and memory
df_list.describe()  # Stats for numeric columns


In [None]:
# Selecting Data

# Single Column
df_dict["Name"]

# Multiple Columns
df_dict[["Name", "City"]]

# Row by index
df_dict.iloc[1]     # Second row (index starts at 0)

In [27]:
# Filtering Data

In [30]:
# Basic Filter
df_dict[df_dict["Age"] > 30] # People older than 30

df_dict[df_dict["City"] == "Paris"] # People from Paris

Unnamed: 0,Name,Age,City
1,Bob,30,Paris


In [35]:
# Combining Conditions
df_dict[(df_dict["Age"] > 25) & (df_dict["City"] == "Paris")] # People older than 25 and from Paris

df_dict[(df_dict["Age"] < 30) | (df_dict["City"] == "London")] # OR condition (use |)

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,London


In [36]:
# Adding/Modifying Data

In [None]:
# New Column
df_dict["Senior"] = df_dict["Age"] > 30 # True/False Columns
print(df_dict)

In [45]:
# Modifying Values
df_dict["City"] = df_dict["City"].str.upper() # Change all cities to uppercase
print(df_dict)

      Name  Age      City  Senior
0    Alice   25  NEW YORK   False
1      Bob   30     PARIS   False
2  Charlie   35    LONDON    True


In [46]:
# Basic Data Analysis

In [52]:
# Grouping
df_dict.groupby("City")["Age"].mean() # Average age per city

City
LONDON      35.0
NEW YORK    25.0
PARIS       30.0
Name: Age, dtype: float64

In [None]:
# Sorting
df_dict.sort_values("Age", ascending=False) # Sort by age (Descending)

In [None]:
# Simple Visualization
df_dict["Age"].plot(kind="hist", title="Age Distribution") # Shows A Histogram

In [55]:
# First Project

In [64]:
sales = {
    "Product": ["Book", "Pen", "Book", "Notebook", "Pen"],
    "Price": [15, 3, 15, 8, 3],
    "Units": [2, 5, 3, 1, 4]
}

df = pd.DataFrame(sales)

# Task 1: Add "Total" Column
df["Total"] = df["Price"] * df["Units"]

# Task 2: Find all products with more than 3 units sold
print(df[df["Units"] > 3])

# Calculate total revenue per product
print(df.groupby("Product")["Total"].sum())

# Find the most popular product
print(df["Product"].value_counts().idxmax())

  Product  Price  Units  Total
1     Pen      3      5     15
4     Pen      3      4     12
Product
Book        75
Notebook     8
Pen         27
Name: Total, dtype: int64
Book


In [101]:
import pandas as pd

# Create a DataFrame from a dictionary
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
    "City": ["New York", "Paris", "London"]
}

df = pd.DataFrame(data)

# TASK 1: Add a new column "Salary" with values [50000, 60000, 70000]
df["Salary"] = [50000, 60000, 70000]

# TASK 2: Print just the "Name" Column
print(df["Name"])

# TASK 3: Use df.head(2) to see the first 2 rows
df.head(2)

0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object


Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,50000
1,Bob,30,Paris,60000


In [67]:
# DATA SELECTION & FILTERING

In [77]:
# Method 1: Square brackets
names = df["Name"]

# Method 2: Dot notation (if column name has no spaces)
ages = df.Age
city = df.City

# Multiple Columns
subset = df[["Name", "City"]] # Double brackets!

In [78]:
# Selecting Rows

In [92]:
# By Index Position (iloc)

# First row (index 0)
row_0 = df.iloc[0]

# First two rows
row_0_1 = df.iloc[0:2]


# By Label (loc)

# Select row with index label 1 (second row)
row_1 = df.loc[1]

# Select specific columns for row 1
row_1_name_age = df.loc[0:2, ["Name", "Age"]]

In [93]:
# Filtering Data

In [98]:
# Basic Conditional Filter

# Peoplel older than 30
df[df["City"] == "London"]          # Directly filter
older_than_30 = df[df["Age"] > 30]  # Store the filter to variable

In [103]:
# Multiple Conditions

# People from Paris OR with salary > 55000
filtered =  df[(df["City"] == "Paris") | (df["Salary"] > 55000)]

In [None]:
# isin() Â» for multiple Values

# People in New York or London
cities_filter = df[df["City"].isin(["New York", "London"])]
print(cities_filter)

In [107]:
# SORTING DATA

In [113]:
# Sort by Age (ascending)
sorted_age = df.sort_values("Age")

# Sort by Salary (descending)
sorted_salary = df.sort_values("Salary", ascending=False)

In [114]:
# Practice Project: Employee Analysis

In [117]:
employees = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Department": ["HR", "Tech", "Tech", "Marketing"],
    "Salary": [50000,70000, 80000,60000],
    "Years": [2, 4, 5, 3]
}
emp_df = pd.DataFrame(employees)

In [125]:
# Task 1: Select all employees in the Tech department.
tech_employees = emp_df[emp_df["Department"].isin(["Tech"])]
print(tech_employees)

# Task 2: Find employees with salaries between 50k and 70k (inclusive).
emp_df[(emp_df["Salary"] >= 50000) & (emp_df["Salary"] <= 70000)]

# Task 3: Sort by years of experience (descending).
sorted_years = emp_df.sort_values("Years", ascending=False)

# Bonus
result = emp_df[
    (emp_df["Years"] < 5) &
    (emp_df["Department"] == "Tech")
    ].sort_values("Salary")[["Name", "Salary"]]


print(result)

      Name Department  Salary  Years
1      Bob       Tech   70000      4
2  Charlie       Tech   80000      5
  Name  Salary
1  Bob   70000
