In [1]:
import pandas as pd

##### DataFrame Basics
A DataFrame is a 2D labeled data structure whose columns can hold different data types, similar to a spreadsheet or a SQL table.

In [2]:
# Build the sample DataFrame from one record per passenger;
# the keys of each record become the column labels.
df = pd.DataFrame([
    {"Name": "Braund, Mr. Owen Harris", "Age": 22, "Sex": "male"},
    {"Name": "Allen, Mr. William Henry", "Age": 35, "Sex": "male"},
    {"Name": "Bonnell, Miss. Elizabeth", "Age": 58, "Sex": "female"},
])

In [3]:
# Display DataFrame structure
print("DataFrame:")
print(df)

print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly: wrapping it in print() would emit a stray
# "None" line after the report.
df.info()

DataFrame:
                       Name  Age     Sex
0   Braund, Mr. Owen Harris   22    male
1  Allen, Mr. William Henry   35    male
2  Bonnell, Miss. Elizabeth   58  female

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   Sex     3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes
None


##### Working with Series
Each column of a DataFrame is a Series; selecting a single column returns a Series object.

In [4]:
# Selecting a single column label with [] pulls that column out of the DataFrame
age_series = df["Age"]
print("Age Series from DataFrame:", age_series, sep="\n")

Age Series from DataFrame:
0    22
1    35
2    58
Name: Age, dtype: int64


In [5]:
# Build a Series directly from a list, without a parent DataFrame;
# `name` plays the role a column label would have inside a DataFrame.
ages = pd.Series(data=[22, 35, 58], name="Age")
print("Standalone Series:", ages, sep="\n")

Standalone Series:
0    22
1    35
2    58
Name: Age, dtype: int64


##### Series Truth Values and Gotchas

In [6]:
# Gotcha demo: a Series has no single truth value, so using one directly as an
# `if` condition raises ValueError. The error is caught and shown on purpose.
try:
    if pd.Series([False, True, False]):
        print("I was true")
except ValueError as err:
    print(f"Error: {err}")

Error: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


In [7]:
# Ask an explicit question of the Series instead of relying on implicit truthiness
s = pd.Series([False, True, False])
for label, answer in (
    ("Is empty?", s.empty),
    ("Any True?", s.any()),
    ("All True?", s.all()),
):
    print(label, answer)

Is empty? False
Any True? True
All True? False


##### Basic Statistics

In [8]:
# The same reduction is applied to a DataFrame column and to the standalone Series
oldest_in_frame = df["Age"].max()
print("Max age (DataFrame):", oldest_in_frame)
oldest_in_series = ages.max()
print("Max age (Series):", oldest_in_series)

Max age (DataFrame): 58
Max age (Series): 58


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame
print("Statistics summary:")
summary = df.describe()
print(summary)

Statistics summary:
             Age
count   3.000000
mean   38.333333
std    18.230012
min    22.000000
25%    28.500000
50%    35.000000
75%    46.500000
max    58.000000


##### Titanic Dataset Analysis

In [11]:
# Read the Titanic passenger table from the project's data directory
titanic = pd.read_csv("data/titanic.csv")

# Preview the top of the table
print("First rows:", titanic.head(), sep="\n")

First rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

In [12]:
# Passenger class analysis: list the distinct values stored in Pclass
pclass_values = titanic["Pclass"].unique()
print("Unique classes:", pclass_values)
print("\nClass is categorical, not numerical")

Unique classes: [3 1 2]

Class is categorical, not numerical


##### Grouping Operations

In [13]:
# Average fare for every (Sex, Pclass) combination
print("Mean fare by sex and class:")
print(
    titanic
    .groupby(["Sex", "Pclass"])["Fare"]
    .mean()
)

Mean fare by sex and class:
Sex     Pclass
female  1         106.125798
        2          21.970121
        3          16.118810
male    1          67.226127
        2          19.741782
        3          12.661633
Name: Fare, dtype: float64


In [14]:
# Two ways to count passengers per class
print("Using value_counts():")
pclass = titanic["Pclass"]
print(pclass.value_counts())

print("\nUsing groupby():")
per_class = titanic.groupby("Pclass")["Pclass"]
print(per_class.count())

Using value_counts():
Pclass
3    491
1    216
2    184
Name: count, dtype: int64

Using groupby():
Pclass
1    216
2    184
3    491
Name: Pclass, dtype: int64


In [15]:
# NaN handling: size() reports the number of rows in each group, while count()
# reports the non-null cells per column, so columns with missing values show
# smaller numbers than size().
class_groups = titanic.groupby("Pclass")
print("Size vs Count:")
# sep="\n" puts each label on its own line. Without it the label is prepended
# to the first line of the multi-line repr, which shifts the header row and
# breaks the column alignment of the printed table.
print("With NaN:", class_groups.size(), sep="\n")
print("Without NaN:", class_groups.count(), sep="\n")

Size vs Count:
With NaN: Pclass
1    216
2    184
3    491
dtype: int64
Without NaN:         PassengerId  Survived  Name  Sex  Age  SibSp  Parch  Ticket  Fare  \
Pclass                                                                      
1               216       216   216  216  186    216    216     216   216   
2               184       184   184  184  173    184    184     184   184   
3               491       491   491  491  355    491    491     491   491   

        Cabin  Embarked  
Pclass                   
1         176       214  
2          16       184  
3          12       491  


##### Air Quality Analysis

In [18]:
# Read the air-quality measurements from the project's data directory
air_quality = pd.read_csv("data/air_quality_long.csv")

print("First rows:")
preview = air_quality.head()
print(preview)

First rows:
        city country                   date.utc location parameter  value  \
0  Antwerpen      BE  2019-06-18 06:00:00+00:00  BETR801      pm25   18.0   
1  Antwerpen      BE  2019-06-17 08:00:00+00:00  BETR801      pm25    6.5   
2  Antwerpen      BE  2019-06-17 07:00:00+00:00  BETR801      pm25   18.5   
3  Antwerpen      BE  2019-06-17 06:00:00+00:00  BETR801      pm25   16.0   
4  Antwerpen      BE  2019-06-17 05:00:00+00:00  BETR801      pm25    7.5   

    unit  
0  µg/m³  
1  µg/m³  
2  µg/m³  
3  µg/m³  
4  µg/m³  


In [21]:
# City analysis: average measured value for every (city, parameter) pair
print("Mean by city and parameter:")
print(
    air_quality
    .groupby(["city", "parameter"])["value"]
    .mean()
)

Mean by city and parameter:
city       parameter
Antwerpen  no2          26.950920
           pm25         23.169492
London     no2          29.740050
           pm25         13.443568
Paris      no2          29.374284
Name: value, dtype: float64


In [23]:
# Station analysis: row counts per station and per measured parameter.
# Both reports have the same shape, so they share one loop.
for heading, column in (
    ("Measurements per location:", "location"),
    ("\nMeasurements by parameter:", "parameter"),
):
    print(heading)
    print(air_quality[column].value_counts())

Measurements per location:
location
London Westminster    3256
FR04014               1676
BETR801                340
Name: count, dtype: int64

Measurements by parameter:
parameter
no2     3447
pm25    1825
Name: count, dtype: int64
