##### Indexing and Data Alignment

In [1]:
import pandas as pd
import numpy as np

##### Basic Indexing Operations
Different ways to select data

In [2]:
# Create sample DataFrame
df = pd.DataFrame({
    'one': pd.Series([1., 2., 3., np.nan]),
    'flag': pd.Series([False, False, True, False]),
    'foo': 'bar'
}, index=['a', 'b', 'c', 'd'])

print("Original DataFrame:")
print(df)

print("\nSelect row by label:")
print(df.loc['b'])

print("\nSelect row by integer location:")
print(df.iloc[2])

Original DataFrame:
   one flag  foo
a  NaN  NaN  bar
b  NaN  NaN  bar
c  NaN  NaN  bar
d  NaN  NaN  bar

Select row by label:
one     NaN
flag    NaN
foo     bar
Name: b, dtype: object

Select row by integer location:
one     NaN
flag    NaN
foo     bar
Name: c, dtype: object


##### Data Alignment
Automatic alignment of DataFrame objects

In [3]:
# Create two DataFrames with different shapes
df1 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])

print("DataFrame 1:")
print(df1.head())
print("\nDataFrame 2:")
print(df2.head())

print("\nResult of addition (note NaN values):")
print((df1 + df2).head())

DataFrame 1:
          A         B         C         D
0  1.567921 -0.600520  0.564308 -0.030036
1  0.031780 -0.990990  0.851764 -0.024936
2  0.572579  1.058543 -1.403617 -0.795455
3 -0.345817 -2.169294 -0.825950  0.853941
4  1.997710  0.295261  0.083457  1.352197

DataFrame 2:
          A         B         C
0 -0.911836 -1.013608 -0.188633
1 -0.580276 -0.956758  0.777768
2 -1.958341 -0.972582  0.262211
3  1.642854  1.383643  0.302067
4 -0.882981 -0.339678  2.253424

Result of addition (note NaN values):
          A         B         C   D
0  0.656085 -1.614128  0.375676 NaN
1 -0.548497 -1.947748  1.629532 NaN
2 -1.385763  0.085962 -1.141405 NaN
3  1.297037 -0.785651 -0.523883 NaN
4  1.114729 -0.044417  2.336881 NaN


In [4]:
# Time series data alignment
index = pd.date_range('1/1/2000', periods=8)
df_time = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))

print("Time series DataFrame:")
print(df_time)

print("\nSubtract column A using proper axis alignment:")
print(df_time.sub(df_time['A'], axis=0))

Time series DataFrame:
                   A         B         C
2000-01-01  0.450239  0.703165  0.469230
2000-01-02  0.745005 -0.548308  0.762866
2000-01-03 -0.351441  1.086420  0.278941
2000-01-04 -0.505850  2.643367  0.004853
2000-01-05  1.643875  0.918085  0.599805
2000-01-06 -1.199307  0.473668  1.027259
2000-01-07  0.960449 -0.275233  2.097684
2000-01-08 -0.708572 -0.974419  0.468220

Subtract column A using proper axis alignment:
              A         B         C
2000-01-01  0.0  0.252926  0.018990
2000-01-02  0.0 -1.293313  0.017861
2000-01-03  0.0  1.437862  0.630382
2000-01-04  0.0  3.149217  0.510704
2000-01-05  0.0 -0.725789 -1.044070
2000-01-06  0.0  1.672975  2.226566
2000-01-07  0.0 -1.235682  1.137235
2000-01-08  0.0 -0.265847  1.176792


##### SQL-like Operations
Filtering and grouping data

In [6]:
# Load tips dataset
tips = pd.read_csv('data/tips.csv')
print("Tips dataset:")
print(tips.head())

Tips dataset:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [7]:
# WHERE clause equivalent
print("Dinner tips:")
print(tips[tips['time'] == 'Dinner'].head())

print("\nDinner tips over $5.00:")
print(tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)])

Dinner tips:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4

Dinner tips over $5.00:
     total_bill    tip     sex smoker  day    time  size
23        39.42   7.58    Male     No  Sat  Dinner     4
44        30.40   5.60    Male     No  Sun  Dinner     4
47        32.40   6.00    Male     No  Sun  Dinner     4
52        34.81   5.20  Female     No  Sun  Dinner     4
59        48.27   6.73    Male     No  Sat  Dinner     4
116       29.93   5.07    Male     No  Sun  Dinner     4
155       29.85   5.14  Female     No  Sun  Dinner     5
170       50.81  10.00    Male    Yes  Sat  Dinner     3
172        7.25   5.15    Male    Yes  Sun  Dinner     2
181       23.33   5.65    Male    Yes  Sun  Dinner     2
183       2

In [9]:
# NULL checking
frame = pd.DataFrame({
    'col1': ['A', 'B', np.nan, 'C', 'D'],
    'col2': ['F', np.nan, 'G', 'H', 'I']
})

print("Original DataFrame:")
print(frame)

print("\nRows where col2 is NULL:")
print(frame[frame['col2'].isna()])

print("\nRows where col1 is NOT NULL:")
print(frame[frame['col1'].notna()])

Original DataFrame:
  col1 col2
0    A    F
1    B  NaN
2  NaN    G
3    C    H
4    D    I

Rows where col2 is NULL:
  col1 col2
1    B  NaN

Rows where col1 is NOT NULL:
  col1 col2
0    A    F
1    B  NaN
3    C    H
4    D    I


In [10]:
# GROUP BY operations
print("Count of tips by sex:")
print(tips.groupby('sex').size())

print("\nAverage tip by sex and time:")
print(tips.groupby(['sex', 'time'])['tip'].mean())

Count of tips by sex:
sex
Female     87
Male      157
dtype: int64

Average tip by sex and time:
sex     time  
Female  Dinner    3.002115
        Lunch     2.582857
Male    Dinner    3.144839
        Lunch     2.882121
Name: tip, dtype: float64
