##### Data Reshaping and Binary Operations

In [1]:
import pandas as pd
import numpy as np

##### Loading Sample Data
We'll use two datasets: long-format air quality measurements and the Titanic passenger list.

In [2]:
# Read the air quality CSV, using the "date.utc" column as the index and
# asking pandas to parse that index as dates
air_quality = pd.read_csv(
    "data/air_quality_long.csv",
    index_col="date.utc",
    parse_dates=True,
)

# Label and preview are emitted on separate lines via sep="\n"
print("Air quality data sample:", air_quality.head(), sep="\n")

Air quality data sample:
                                city country location parameter  value   unit
date.utc                                                                     
2019-06-18 06:00:00+00:00  Antwerpen      BE  BETR801      pm25   18.0  µg/m³
2019-06-17 08:00:00+00:00  Antwerpen      BE  BETR801      pm25    6.5  µg/m³
2019-06-17 07:00:00+00:00  Antwerpen      BE  BETR801      pm25   18.5  µg/m³
2019-06-17 06:00:00+00:00  Antwerpen      BE  BETR801      pm25   16.0  µg/m³
2019-06-17 05:00:00+00:00  Antwerpen      BE  BETR801      pm25    7.5  µg/m³


In [3]:
# Read the Titanic passenger table from CSV (no index column specified)
titanic = pd.read_csv("data/titanic.csv")

# The leading "\n" in the label leaves a blank line before the preview
print("\nTitanic data sample:", titanic.head(), sep="\n")


Titanic data sample:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

##### Sorting Data
Sort DataFrame rows by one or multiple columns

In [4]:
# Order the passengers by the Age column (ascending, the sort_values default)
# and preview the first rows
by_age = titanic.sort_values("Age")
print("Sorted by age:", by_age.head(), sep="\n")

Sorted by age:
     PassengerId  Survived  Pclass                             Name     Sex  \
803          804         1       3  Thomas, Master. Assad Alexander    male   
755          756         1       2        Hamalainen, Master. Viljo    male   
644          645         1       3           Baclini, Miss. Eugenie  female   
469          470         1       3    Baclini, Miss. Helene Barbara  female   
78            79         1       2    Caldwell, Master. Alden Gates    male   

      Age  SibSp  Parch  Ticket     Fare Cabin Embarked  
803  0.42      0      1    2625   8.5167   NaN        C  
755  0.67      1      1  250649  14.5000   NaN        S  
644  0.75      2      1    2666  19.2583   NaN        C  
469  0.75      2      1    2666  19.2583   NaN        C  
78   0.83      0      2  248738  29.0000   NaN        S  


In [5]:
# Order by Pclass first, then by Age within each class; both keys descending
sort_keys = ["Pclass", "Age"]
by_class_then_age = titanic.sort_values(by=sort_keys, ascending=[False, False])

print("Sorted by class and age (descending):")
print(by_class_then_age.head())

Sorted by class and age (descending):
     PassengerId  Survived  Pclass                       Name     Sex   Age  \
851          852         0       3        Svensson, Mr. Johan    male  74.0   
116          117         0       3       Connors, Mr. Patrick    male  70.5   
280          281         0       3           Duane, Mr. Frank    male  65.0   
483          484         1       3     Turkula, Mrs. (Hedwig)  female  63.0   
326          327         0       3  Nysveen, Mr. Johan Hansen    male  61.0   

     SibSp  Parch  Ticket    Fare Cabin Embarked  
851      0      0  347060  7.7750   NaN        S  
116      0      0  370369  7.7500   NaN        Q  
280      0      0  336439  7.7500   NaN        Q  
483      0      0    4134  9.5875   NaN        S  
326      0      0  345364  6.2375   NaN        S  


##### Reshaping Data: Long to Wide Format
Transform data layout using pivot operations

In [6]:
# Keep only the rows whose parameter is "no2"
is_no2 = air_quality["parameter"] == "no2"
no2 = air_quality.loc[is_no2]

# After sorting by the index, take the first two rows for each location
no2_sorted = no2.sort_index()
no2_subset = no2_sorted.groupby(["location"]).head(2)

print("NO2 measurements subset:", no2_subset, sep="\n")

NO2 measurements subset:
                                city country            location parameter  \
date.utc                                                                     
2019-04-09 01:00:00+00:00  Antwerpen      BE             BETR801       no2   
2019-04-09 01:00:00+00:00      Paris      FR             FR04014       no2   
2019-04-09 02:00:00+00:00     London      GB  London Westminster       no2   
2019-04-09 02:00:00+00:00  Antwerpen      BE             BETR801       no2   
2019-04-09 02:00:00+00:00      Paris      FR             FR04014       no2   
2019-04-09 03:00:00+00:00     London      GB  London Westminster       no2   

                           value   unit  
date.utc                                 
2019-04-09 01:00:00+00:00   22.5  µg/m³  
2019-04-09 01:00:00+00:00   24.4  µg/m³  
2019-04-09 02:00:00+00:00   67.0  µg/m³  
2019-04-09 02:00:00+00:00   53.5  µg/m³  
2019-04-09 02:00:00+00:00   27.4  µg/m³  
2019-04-09 03:00:00+00:00   67.0  µg/m³  


In [7]:
# Spread the long table into one column per location, keeping the existing
# index as the rows and filling the cells from the "value" column
no2_wide = no2_subset.pivot(columns="location", values="value")
print("NO2 data in wide format:", no2_wide, sep="\n")

NO2 data in wide format:
location                   BETR801  FR04014  London Westminster
date.utc                                                       
2019-04-09 01:00:00+00:00     22.5     24.4                 NaN
2019-04-09 02:00:00+00:00     53.5     27.4                67.0
2019-04-09 03:00:00+00:00      NaN      NaN                67.0


##### Binary Operations
Perform operations between DataFrames and Series

In [8]:
# Create sample DataFrame from three Series whose labels only partly overlap.
# pandas aligns them on the union of the labels, so 'one' has no value for
# 'd' and 'three' has none for 'a'; those cells become NaN.
#
# The values come from a seeded generator so that Restart Kernel -> Run All
# reproduces the same frame (and the same results in the cells that use it);
# an unseeded np.random.randn call would give new numbers on every run.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'one': pd.Series(rng.standard_normal(3), index=['a', 'b', 'c']),
    'two': pd.Series(rng.standard_normal(4), index=['a', 'b', 'c', 'd']),
    'three': pd.Series(rng.standard_normal(3), index=['b', 'c', 'd'])
})

print("Sample DataFrame:")
print(df)

Sample DataFrame:
        one       two     three
a -0.057266 -1.654092       NaN
b -0.231980  2.450928 -0.096852
c -0.054669  1.790320 -1.932948
d       NaN  0.488700  0.816248


In [9]:
# Take the second row of df as a Series and subtract it from every row;
# axis=1 matches the Series labels against the column names
row = df.iloc[1]
row_diff = df.sub(row, axis=1)
print("Subtracting row from DataFrame:", row_diff, sep="\n")

Subtracting row from DataFrame:
        one       two     three
a  0.174714 -4.105019       NaN
b  0.000000  0.000000  0.000000
c  0.177311 -0.660608 -1.836096
d       NaN -1.962228  0.913100


In [10]:
# Take column 'two' as a Series and subtract it from every column;
# axis=0 matches the Series labels against the row index
column = df['two']
column_diff = df.sub(column, axis=0)
print("Subtracting column from DataFrame:", column_diff, sep="\n")

Subtracting column from DataFrame:
        one  two     three
a  1.596825  0.0       NaN
b -2.682908  0.0 -2.547780
c -1.844989  0.0 -3.723268
d       NaN  0.0  0.327548


##### Advanced Operations
Using divmod with Series

In [11]:
# Sample Series holding the integers 0 through 9
s = pd.Series(np.arange(10))
print("Original Series:", s, sep="\n")

# The built-in divmod applied to a Series returns two Series:
# the element-wise floor quotient and the element-wise remainder
div, rem = divmod(s, 3)

# The leading "\n" in each label leaves a blank line between the sections
print("\nDivision result:", div, sep="\n")
print("\nRemainder:", rem, sep="\n")

Original Series:
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

Division result:
0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int64

Remainder:
0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int64
