In [2]:
# tutorial link: https://pandas.pydata.org/docs/user_guide/10min.html
import numpy as np
import pandas as pd

In [5]:
# pandas offers two core data structures, each implemented by its own class:
#   - Series    : one-dimensional labeled data
#   - DataFrame : two-dimensional labeled data
# Build a Series from a plain list; the np.nan entry makes the dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [7]:
# Build a DatetimeIndex of six consecutive days starting on 2025-08-20.
# It serves as the row index of the DataFrame created from a NumPy array below.
dates = pd.date_range(start="20250820", periods=6)
print(dates)

DatetimeIndex(['2025-08-20', '2025-08-21', '2025-08-22', '2025-08-23',
               '2025-08-24', '2025-08-25'],
              dtype='datetime64[ns]', freq='D')


In [10]:
# Turn a 6x4 NumPy array of standard-normal samples into a DataFrame,
# indexed by `dates` and with columns labeled A-D.
# A seeded Generator is used so that "Restart Kernel -> Run All" reproduces the
# same numbers on every run (the unseeded np.random.randn call did not).
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)
print(df)

                   A         B         C         D
2025-08-20 -1.205903 -0.511511  0.651474 -2.123320
2025-08-21  0.280346 -0.972048  0.208373 -0.448050
2025-08-22  0.638818 -0.222277 -1.308022  0.708640
2025-08-23  0.242789  0.758159  0.769651  0.492938
2025-08-24  0.053046 -1.541530 -0.959791 -1.181696
2025-08-25  2.399643  0.332095 -0.217245  0.863972


In [17]:
# Build a DataFrame from a dictionary: each key becomes a column label and each
# value supplies that column's data (scalar values are repeated on every row).
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20250820"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "Copy": np.array(["bla", "bla", "bla", "bla"], dtype="str"),
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "foo",
        "G": 3,
    }
)
print(df2)

     A          B    C  D Copy      E    F  G
0  1.0 2025-08-20  1.0  3  bla   test  foo  3
1  1.0 2025-08-20  1.0  3  bla  train  foo  3
2  1.0 2025-08-20  1.0  3  bla   test  foo  3
3  1.0 2025-08-20  1.0  3  bla  train  foo  3


In [18]:
# Each column of a DataFrame carries its own dtype; `.dtypes` lists them per column.
print(df2.dtypes)

A             float64
B       datetime64[s]
C             float32
D               int32
Copy           object
E            category
F              object
G               int64
dtype: object


In [20]:
# Viewing data
# DataFrame.head() and DataFrame.tail() show the top and bottom rows of the frame.
# head() with no argument prints the first five rows here; tail(3) prints the last three.
print(df.head())
print(df.tail(3))

                   A         B         C         D
2025-08-20 -1.205903 -0.511511  0.651474 -2.123320
2025-08-21  0.280346 -0.972048  0.208373 -0.448050
2025-08-22  0.638818 -0.222277 -1.308022  0.708640
2025-08-23  0.242789  0.758159  0.769651  0.492938
2025-08-24  0.053046 -1.541530 -0.959791 -1.181696
                   A         B         C         D
2025-08-23  0.242789  0.758159  0.769651  0.492938
2025-08-24  0.053046 -1.541530 -0.959791 -1.181696
2025-08-25  2.399643  0.332095 -0.217245  0.863972


In [23]:
# Display the row labels (DataFrame.index) and the column labels (DataFrame.columns).
print(df.index)
print(df.columns)

DatetimeIndex(['2025-08-20', '2025-08-21', '2025-08-22', '2025-08-23',
               '2025-08-24', '2025-08-25'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [24]:
# DataFrame.to_numpy() returns a NumPy representation of the underlying data,
# without the index or column labels.
numpy_arr = df.to_numpy()
print(numpy_arr)

[[-1.20590322 -0.51151133  0.65147378 -2.1233197 ]
 [ 0.28034642 -0.97204846  0.20837329 -0.44805023]
 [ 0.63881809 -0.22227715 -1.30802215  0.70863977]
 [ 0.2427893   0.75815939  0.7696507   0.49293783]
 [ 0.05304599 -1.54153018 -0.95979128 -1.18169561]
 [ 2.39964288  0.33209536 -0.21724545  0.86397197]]


In [28]:
# A NumPy array has a single dtype for the whole array, whereas a pandas
# DataFrame has one dtype per column. DataFrame.to_numpy() therefore picks a
# NumPy dtype that can hold every column's dtype; when that common dtype is
# object, the conversion requires copying the data.

# df2 mixes several dtypes, so its array form falls back to dtype=object.
numpy_arr_from_df2 = df2.to_numpy()
print(numpy_arr_from_df2)
numpy_arr_from_df2  # the displayed repr ends with dtype=object

[[1.0 Timestamp('2025-08-20 00:00:00') 1.0 3 'bla' 'test' 'foo' 3]
 [1.0 Timestamp('2025-08-20 00:00:00') 1.0 3 'bla' 'train' 'foo' 3]
 [1.0 Timestamp('2025-08-20 00:00:00') 1.0 3 'bla' 'test' 'foo' 3]
 [1.0 Timestamp('2025-08-20 00:00:00') 1.0 3 'bla' 'train' 'foo' 3]]


array([[1.0, Timestamp('2025-08-20 00:00:00'), 1.0, 3, 'bla', 'test',
        'foo', 3],
       [1.0, Timestamp('2025-08-20 00:00:00'), 1.0, 3, 'bla', 'train',
        'foo', 3],
       [1.0, Timestamp('2025-08-20 00:00:00'), 1.0, 3, 'bla', 'test',
        'foo', 3],
       [1.0, Timestamp('2025-08-20 00:00:00'), 1.0, 3, 'bla', 'train',
        'foo', 3]], dtype=object)

In [30]:
# DataFrame.describe() shows a quick statistical summary of the data
# (count, mean, std, min, quartiles and max for each column).
print(df.describe())

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.401457 -0.359519 -0.142594 -0.281253
std    1.165670  0.841643  0.850590  1.193535
min   -1.205903 -1.541530 -1.308022 -2.123320
25%    0.100482 -0.856914 -0.774155 -0.998284
50%    0.261568 -0.366894 -0.004436  0.022444
75%    0.549200  0.193502  0.540699  0.654714
max    2.399643  0.758159  0.769651  0.863972


In [39]:
# Transposing the data: the column labels become the rows and the dates become the columns.
df.T

Unnamed: 0,2025-08-20,2025-08-21,2025-08-22,2025-08-23,2025-08-24,2025-08-25
A,-1.205903,0.280346,0.638818,0.242789,0.053046,2.399643
B,-0.511511,-0.972048,-0.222277,0.758159,-1.54153,0.332095
C,0.651474,0.208373,-1.308022,0.769651,-0.959791,-0.217245
D,-2.12332,-0.44805,0.70864,0.492938,-1.181696,0.863972


In [48]:
# DataFrame.sort_index() sorts by the labels along an axis.
df.sort_index(axis=0, ascending=True)  # axis=0 sorts by the row labels; axis=1 would sort by the column labels

Unnamed: 0,A,B,C,D
2025-08-20,-1.205903,-0.511511,0.651474,-2.12332
2025-08-21,0.280346,-0.972048,0.208373,-0.44805
2025-08-22,0.638818,-0.222277,-1.308022,0.70864
2025-08-23,0.242789,0.758159,0.769651,0.492938
2025-08-24,0.053046,-1.54153,-0.959791,-1.181696
2025-08-25,2.399643,0.332095,-0.217245,0.863972


In [49]:
# DataFrame.sort_values() sorts by values: here the rows are ordered by column "B", largest first.
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2025-08-23,0.242789,0.758159,0.769651,0.492938
2025-08-25,2.399643,0.332095,-0.217245,0.863972
2025-08-22,0.638818,-0.222277,-1.308022,0.70864
2025-08-20,-1.205903,-0.511511,0.651474,-2.12332
2025-08-21,0.280346,-0.972048,0.208373,-0.44805
2025-08-24,0.053046,-1.54153,-0.959791,-1.181696


In [51]:
# Getitem ([])
# For a DataFrame, passing a single label selects a column and yields a Series,
# equivalent to df.A.
df["A"]

2025-08-20   -1.205903
2025-08-21    0.280346
2025-08-22    0.638818
2025-08-23    0.242789
2025-08-24    0.053046
2025-08-25    2.399643
Freq: D, Name: A, dtype: float64

In [56]:
# For a DataFrame, passing a slice (:) selects matching rows.
# An integer slice is positional with an exclusive end: 1:3 returns the rows at positions 1 and 2.
df[1:3]

Unnamed: 0,A,B,C,D
2025-08-21,0.280346,-0.972048,0.208373,-0.44805
2025-08-22,0.638818,-0.222277,-1.308022,0.70864


In [57]:
# Slicing with date strings selects rows by index label; both endpoints are included.
df["20250820":"20250822"]

Unnamed: 0,A,B,C,D
2025-08-20,-1.205903,-0.511511,0.651474,-2.12332
2025-08-21,0.280346,-0.972048,0.208373,-0.44805
2025-08-22,0.638818,-0.222277,-1.308022,0.70864


In [58]:
# Selection by label
# Selecting the row whose index label is dates[0]; the result is a Series keyed by column name.
df.loc[dates[0]]

A   -1.205903
B   -0.511511
C    0.651474
D   -2.123320
Name: 2025-08-20 00:00:00, dtype: float64

In [59]:
# Selecting all rows (:) for a chosen list of column labels.
df.loc[:,["A", "C"]]

Unnamed: 0,A,C
2025-08-20,-1.205903,0.651474
2025-08-21,0.280346,0.208373
2025-08-22,0.638818,-1.308022
2025-08-23,0.242789,0.769651
2025-08-24,0.053046,-0.959791
2025-08-25,2.399643,-0.217245


In [60]:
# Selecting on both axes by label: a row-label slice (both endpoints included)
# combined with a list of column labels.
df.loc["20250820":"20250822", ["A", "B"]]

Unnamed: 0,A,B
2025-08-20,-1.205903,-0.511511
2025-08-21,0.280346,-0.972048
2025-08-22,0.638818,-0.222277


In [61]:
# Selecting a single row label together with a single column label returns a scalar.
df.loc[dates[0],"A"]

np.float64(-1.2059032158234453)

In [62]:
# DataFrame.at gives fast access to a single scalar, addressed by row label and column label.
df.at[dates[0],"A"]

np.float64(-1.2059032158234453)

In [64]:
# Selection by position
# An integer passed to .iloc selects the row at that position (0 is the first row).
df.iloc[0]

A   -1.205903
B   -0.511511
C    0.651474
D   -2.123320
Name: 2025-08-20 00:00:00, dtype: float64

In [66]:
# Integer slices on both axes: rows at positions 3 and 4, column at position 0 (end bounds are exclusive).
df.iloc[3:5,0:1]

Unnamed: 0,A
2025-08-23,0.242789
2025-08-24,0.053046


In [68]:
# Lists of integer positions: rows 1, 2 and 4, columns 2 and 3.
df.iloc[[1,2,4],[2,3]]

Unnamed: 0,C,D
2025-08-21,0.208373,-0.44805
2025-08-22,-1.308022,0.70864
2025-08-24,-0.959791,-1.181696


In [69]:
# Slicing rows by position while keeping every column (:).
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2025-08-21,0.280346,-0.972048,0.208373,-0.44805
2025-08-22,0.638818,-0.222277,-1.308022,0.70864


In [70]:
# Boolean indexing
# Selecting the rows where column A is greater than 0.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2025-08-21,0.280346,-0.972048,0.208373,-0.44805
2025-08-22,0.638818,-0.222277,-1.308022,0.70864
2025-08-23,0.242789,0.758159,0.769651,0.492938
2025-08-24,0.053046,-1.54153,-0.959791,-1.181696
2025-08-25,2.399643,0.332095,-0.217245,0.863972


In [71]:
# Selecting values from a DataFrame where a boolean condition is met;
# entries that fail the condition are left empty (missing) in the result.
df[df > 0]

Unnamed: 0,A,B,C,D
2025-08-20,,,0.651474,
2025-08-21,0.280346,,0.208373,
2025-08-22,0.638818,,,0.70864
2025-08-23,0.242789,0.758159,0.769651,0.492938
2025-08-24,0.053046,,,
2025-08-25,2.399643,0.332095,,0.863972


In [73]:
# Using the isin() method for filtering.
# Work on a copy of df and add a string column "E" to filter on.
# NOTE: this rebinds df2, replacing the mixed-dtype frame built earlier in the notebook.
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2025-08-20,-1.205903,-0.511511,0.651474,-2.12332,one
2025-08-21,0.280346,-0.972048,0.208373,-0.44805,one
2025-08-22,0.638818,-0.222277,-1.308022,0.70864,two
2025-08-23,0.242789,0.758159,0.769651,0.492938,three
2025-08-24,0.053046,-1.54153,-0.959791,-1.181696,four
2025-08-25,2.399643,0.332095,-0.217245,0.863972,three


In [76]:
# Keep only the rows whose "E" value is one of the listed values.
df2[df2["E"].isin(["two","three"])]

Unnamed: 0,A,B,C,D,E
2025-08-22,0.638818,-0.222277,-1.308022,0.70864,two
2025-08-23,0.242789,0.758159,0.769651,0.492938,three
2025-08-25,2.399643,0.332095,-0.217245,0.863972,three


In [80]:
# Setting
# Assigning a Series as a new column automatically aligns the data by index.
# s1 is indexed by six consecutive days starting 2025-08-20, so it is matched to df's rows by date.
s1=pd.Series([1,2,3,4,5,6], index=pd.date_range("20250820", periods=6))
df["F"]=s1
df

Unnamed: 0,A,B,C,D,F
2025-08-20,-1.205903,-0.511511,0.651474,-2.12332,1
2025-08-21,0.280346,-0.972048,0.208373,-0.44805,2
2025-08-22,0.638818,-0.222277,-1.308022,0.70864,3
2025-08-23,0.242789,0.758159,0.769651,0.492938,4
2025-08-24,0.053046,-1.54153,-0.959791,-1.181696,5
2025-08-25,2.399643,0.332095,-0.217245,0.863972,6


In [84]:
# Setting a value by label: row dates[0], column "A".
df.at[dates[0], "A"] = 0


In [85]:
# Setting a value by position: row 0, column 1.
df.iloc[0,1] = 2