Index Objects

In [1]:
import pandas as pd

# Build an Index holding five string labels
index = pd.Index(list('abcde'))

# Basic attributes, printed in order: element count, shape tuple, uniqueness flag
for attribute in (index.size, index.shape, index.is_unique):
    print(attribute)

# Positional slicing yields another Index; `in` tests label membership
print(index[1:3])
print('b' in index)

5
(5,)
True
Index(['b', 'c'], dtype='object')
True


Pandas Data Structures: Series

In [5]:
# Series from a plain list; np.nan marks a missing entry
import numpy as np
s = pd.Series([1, 3, 5, np.nan, 6, 8])

# Rebind `s` to a Series carrying explicit string labels
s = pd.Series([10, 20, 30], index=list('abc'))

# Element-wise behaviour
print(s.mul(2))         # multiply every element by a scalar (same as s * 2)
print(s.loc[s > 15])    # keep only entries where the boolean mask is True
print(s.to_numpy())     # underlying values as a NumPy array

a    20
b    40
c    60
dtype: int64
b    20
c    30
dtype: int64
[10 20 30]


DataFrame

In [6]:
# DataFrame from a dict of columns; the scalar entries (B, F) are repeated
# for every row
df = pd.DataFrame(
    {
        'A': [1, 2, 3, 4],
        'B': pd.Timestamp('20230101'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

# Rebind `df` to a 6x4 frame of random draws (unseeded, so values differ per run)
df = pd.DataFrame(np.random.randn(6, 4), columns=['A', 'B', 'C', 'D'])

# Inspecting the frame
print(df.head(2))       # leading two rows
print(df.describe())    # per-column summary statistics
print(df.T)             # rows and columns swapped

          A         B         C         D
0 -0.822252 -0.344498 -1.754457 -0.040676
1 -0.979961  0.528483  0.571893 -1.005681
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.237529 -0.288032 -0.067970 -0.484384
std    1.023665  1.056919  1.299066  0.784746
min   -0.979961 -1.641353 -1.754457 -1.446076
25%   -0.802146 -0.798462 -1.115945 -0.968022
50%   -0.710073 -0.563920  0.524635 -0.580452
75%   -0.104057  0.310238  0.618151 -0.106971
max    1.709823  1.316019  1.310849  0.747035
          0         1         2         3         4         5
A -0.822252 -0.979961  1.709823 -0.678319  0.087364 -0.741827
B -0.344498  0.528483 -0.803503  1.316019 -1.641353 -0.783342
C -1.754457  0.571893  1.310849  0.633570 -1.647052  0.477378
D -0.040676 -1.005681 -0.855045 -0.305858 -1.446076  0.747035


Reindex

In [7]:
s = pd.Series([1, 2, 3, 4], index=list('abcd'))

# Target labels: 'a' and 'b' exist in s, 'e' and 'f' do not
new_labels = ['a', 'b', 'e', 'f']

# Labels absent from the original come back as NaN...
s2 = s.reindex(new_labels)

# ...or as an explicit placeholder value
s3 = s.reindex(new_labels, fill_value=0)

# DataFrame rows are reindexed the same way
df = pd.DataFrame(np.random.randn(3, 2), index=[1, 3, 5], columns=['A', 'B'])
df2 = df.reindex(index=[1, 2, 3, 4, 5])

# method='ffill' copies the last preceding existing row into each new label
df3 = df.reindex(index=range(6), method='ffill')

Dropping Entries

In [8]:
s = pd.Series(np.arange(5.), index=list('abcde'))

# Remove Series entries by label; the result is a new Series
s.drop(labels=['b', 'c'])

# Same idea on a DataFrame
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('abcde'),
    columns=['one', 'two', 'three'],
)

# Remove the rows labelled 'a' and 'c'
df.drop(index=['a', 'c'])

# Remove columns; columns=... is equivalent to passing axis=1 / axis='columns'
df.drop(columns='one')
df.drop(columns=['one', 'three'])

Unnamed: 0,two
a,-0.711253
b,-0.045556
c,0.780232
d,-2.239657
e,-0.684895


Selecting Entries

In [9]:
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=pd.date_range('20230101', periods=6),
    columns=list('ABCD'),
)

# Columns: a single key yields a Series, a list of keys yields a DataFrame
df['A']
df[['A', 'B']]

# One row looked up by its index label
df.loc['20230103']

# One row looked up by integer position
df.iloc[3]

# Row slices: by integer position, then by index label
df[0:3]
df['20230102':'20230104']

# Boolean masks; combine conditions with &
positive_a = df['A'] > 0
negative_b = df['B'] < 0
df[positive_a]
df[positive_a & negative_b]

Unnamed: 0,A,B,C,D
2023-01-01,1.923161,-0.731414,-0.189013,2.001857
2023-01-04,0.417123,-0.800594,0.971795,1.721681
2023-01-05,0.736288,-1.145231,0.384655,0.609529
2023-01-06,0.962579,-0.678976,-0.515904,-0.469094


Data Alignment

In [10]:
s1 = pd.Series([1, 2, 3], index=list('abc'))
s2 = pd.Series([4, 5, 6], index=list('bcd'))

# Arithmetic pairs values by index label; a label present in only one
# operand ('a' and 'd' here) yields NaN
print(s1.add(s2))
# a    NaN
# b    6.0
# c    8.0
# d    NaN

a    NaN
b    6.0
c    8.0
d    NaN
dtype: float64


Rank and Sort

In [11]:
df = pd.DataFrame({
    'A': list(range(1, 6)),
    'B': list(range(5, 0, -1)),
    'C': [2] * 5,
})

# Rows ordered by ascending values of column B (returns a new frame)
df.sort_values('B')

# Rows ordered by descending index label (returns a new frame)
df.sort_index(ascending=False)

# Rank of each A value, with the largest value ranked 1
df['A_rank'] = df['A'].rank(ascending=False)

loc and iloc indexing

In [13]:
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=list('abcdef'),
    columns='one two three four'.split(),
)

# .loc selects by label
df.loc['a']                          # the row labelled 'a'
df.loc[:, 'one']                     # every row of column 'one'
df.loc[['a', 'b'], ['one', 'two']]   # rows a, b crossed with columns one, two
df.loc['a':'c', 'two':'four']        # label slices on both axes

# .iloc selects by zero-based integer position
df.iloc[3]                           # row at position 3 (the 4th row)
df.iloc[:, 1]                        # column at position 1 (the 2nd column)
df.iloc[1:3, 0:2]                    # row positions 1-2, column positions 0-1
df.iloc[[1, 3], [0, 2]]              # row positions 1 and 3, column positions 0 and 2

Unnamed: 0,one,three
b,-0.693557,-0.718832
d,1.614604,-0.307616


Summary Statistics

In [14]:
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))

# Descriptive statistics
print(df.mean())                 # mean of each column
print(df.mean(axis='columns'))   # mean of each row
print(df.std())                  # standard deviation of each column
print(df.describe())             # count, mean, std, min, quartiles, max

# Pairwise relationships between columns
print(df.corr())
print(df.cov())

# Frequency of column A's values after grouping them into 5 bins
print(df['A'].value_counts(bins=5))

A   -0.364343
B    0.484523
C   -0.277751
D    0.112308
dtype: float64
0    0.200097
1   -0.402209
2   -0.560756
3   -0.350240
4    0.031318
5   -0.144040
6    0.448951
7    0.707982
8    0.307867
9   -0.352129
dtype: float64
A    1.084702
B    0.625824
C    1.083618
D    0.612491
dtype: float64
               A          B          C          D
count  10.000000  10.000000  10.000000  10.000000
mean   -0.364343   0.484523  -0.277751   0.112308
std     1.084702   0.625824   1.083618   0.612491
min    -1.996594  -0.564977  -2.177590  -0.704339
25%    -1.155231   0.070218  -0.664425  -0.354701
50%    -0.306740   0.536037  -0.149464   0.173742
75%     0.567558   0.875897   0.693065   0.237733
max     0.984941   1.537429   0.816677   1.254708
          A         B         C         D
A  1.000000 -0.069689 -0.067509  0.675459
B -0.069689  1.000000 -0.536479  0.200428
C -0.067509 -0.536479  1.000000 -0.284203
D  0.675459  0.200428 -0.284203  1.000000
          A         B         C         D
A

Handling Missing Data

In [15]:
# Small frame with gaps: A has one NaN, B has two, C is complete
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [10, 20, 30, 40]
})

# Detecting missing values
print(df.isna())                     # Element-wise True where a value is missing
print(df.isna().sum())               # Missing count per column

# Filling missing values
print(df.fillna(0))                  # Fill with 0
print(df.fillna(df.mean()))          # Fill with column mean
# Forward fill: DataFrame.ffill() is the supported spelling; the `method`
# keyword of fillna() is deprecated since pandas 2.1.
print(df.ffill())

# Dropping missing values
print(df.dropna())                   # Drop rows with any NA
print(df.dropna(how='all'))          # Drop rows with all NA
print(df.dropna(axis=1))             # Drop columns with any NA

       A      B      C
0  False  False  False
1  False   True  False
2   True   True  False
3  False  False  False
A    1
B    2
C    0
dtype: int64
     A    B   C
0  1.0  5.0  10
1  2.0  0.0  20
2  0.0  0.0  30
3  4.0  8.0  40
          A    B   C
0  1.000000  5.0  10
1  2.000000  6.5  20
2  2.333333  6.5  30
3  4.000000  8.0  40
     A    B   C
0  1.0  5.0  10
1  2.0  5.0  20
2  2.0  5.0  30
3  4.0  8.0  40
     A    B   C
0  1.0  5.0  10
3  4.0  8.0  40
     A    B   C
0  1.0  5.0  10
1  2.0  NaN  20
2  NaN  NaN  30
3  4.0  8.0  40
    C
0  10
1  20
2  30
3  40


Index Hierarchy

In [16]:
# Two parallel arrays: the outer level holds letters, the inner level numbers
arrays = [
    list('AABB'),
    [1, 2] * 2,
]
index = pd.MultiIndex.from_arrays(arrays, names=('letters', 'numbers'))
df = pd.DataFrame(
    np.random.randn(4, 2),
    index=index,
    columns=['data1', 'data2'],
)

# Selecting with a MultiIndex
print(df.loc['A'])                     # every row under outer label 'A'
print(df.loc[('A', 1)])                # the single row keyed by ('A', 1)
print(df.xs(key=1, level='numbers'))   # cross-section: rows whose 'numbers' level is 1

# stack() moves the column labels into the index; unstack() moves them back
stacked = df.stack()
unstacked = stacked.unstack()

            data1     data2
numbers                    
1       -0.035118  0.699491
2       -0.859225 -0.264934
data1   -0.035118
data2    0.699491
Name: (A, 1), dtype: float64
            data1     data2
letters                    
A       -0.035118  0.699491
B        0.568647  0.688901
