In [6]:
# tutorial link: https://pandas.pydata.org/docs/user_guide/10min.html
import numpy as np
import pandas as pd

In [7]:
# pandas provides two core data structures, each backed by its own class:
#   * Series    - one-dimensional data
#   * DataFrame - two-dimensional data
# np.nan marks a missing entry in the Series below.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [8]:
# Build the datetime index used to label the rows of the DataFrame:
# date_range() yields six consecutive calendar days starting on 2025-08-20.
dates = pd.date_range(start="20250820", periods=6)
print(dates)

DatetimeIndex(['2025-08-20', '2025-08-21', '2025-08-22', '2025-08-23',
               '2025-08-24', '2025-08-25'],
              dtype='datetime64[ns]', freq='D')


In [9]:
# Build a 6x4 DataFrame from a NumPy array of random values, indexed by
# `dates` and with columns labelled A-D.
# NOTE: no random seed is set here, so the values change on every run.
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
print(df)

                   A         B         C         D
2025-08-20 -1.238847  1.001248  1.042154  1.156370
2025-08-21  1.047504 -0.247127 -1.294779 -0.161616
2025-08-22 -1.172611 -0.171024  1.455082  1.254268
2025-08-23 -1.182253 -0.206135 -0.973165  1.484013
2025-08-24 -2.117772 -0.723084 -2.158338  0.892467
2025-08-25  0.166884  0.019090  0.361152 -1.884947


In [10]:
# A DataFrame can also be built from a dictionary: each key becomes a column
# label and each value supplies that column's data. The scalar entries
# (A, B, F, G) are repeated for every row of the array-like entries (4 rows).
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20250820"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "Copy": np.array(["bla"] * 4, dtype="str"),
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "foo",
        "G": 3,
    }
)
print(df2)

     A          B    C  D Copy      E    F  G
0  1.0 2025-08-20  1.0  3  bla   test  foo  3
1  1.0 2025-08-20  1.0  3  bla  train  foo  3
2  1.0 2025-08-20  1.0  3  bla   test  foo  3
3  1.0 2025-08-20  1.0  3  bla  train  foo  3


In [11]:
# The columns of the resulting DataFrame have different dtypes
print(df2.dtypes)

A             float64
B       datetime64[s]
C             float32
D               int32
Copy           object
E            category
F              object
G               int64
dtype: object


In [12]:
# Viewing data
# DataFrame.head() and DataFrame.tail() show the top and bottom rows of the frame;
# head() with no argument shows the first 5 rows, tail(3) shows the last 3.
print(df.head())
print(df.tail(3))

                   A         B         C         D
2025-08-20 -1.238847  1.001248  1.042154  1.156370
2025-08-21  1.047504 -0.247127 -1.294779 -0.161616
2025-08-22 -1.172611 -0.171024  1.455082  1.254268
2025-08-23 -1.182253 -0.206135 -0.973165  1.484013
2025-08-24 -2.117772 -0.723084 -2.158338  0.892467
                   A         B         C         D
2025-08-23 -1.182253 -0.206135 -0.973165  1.484013
2025-08-24 -2.117772 -0.723084 -2.158338  0.892467
2025-08-25  0.166884  0.019090  0.361152 -1.884947


In [13]:
# Display the row labels (DataFrame.index) and the column labels (DataFrame.columns)
print(df.index)
print(df.columns)

DatetimeIndex(['2025-08-20', '2025-08-21', '2025-08-22', '2025-08-23',
               '2025-08-24', '2025-08-25'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [14]:
# DataFrame.to_numpy() returns a NumPy representation of the underlying data,
# without the index or column labels
numpy_arr = df.to_numpy()
print(numpy_arr)

[[-1.23884727  1.00124791  1.04215356  1.15637   ]
 [ 1.04750388 -0.24712725 -1.29477879 -0.16161627]
 [-1.17261083 -0.17102449  1.45508165  1.25426762]
 [-1.18225279 -0.20613548 -0.97316487  1.48401285]
 [-2.11777244 -0.72308379 -2.15833781  0.8924671 ]
 [ 0.16688415  0.01909007  0.36115157 -1.88494676]]


In [15]:
# A NumPy array has a single dtype for the whole array, whereas a pandas
# DataFrame has one dtype per column. DataFrame.to_numpy() therefore picks a
# NumPy dtype able to hold every column's dtype; when that common dtype is
# `object`, the conversion has to copy the data.

# df2 holds several different dtypes, so convert it once and reuse the result
# for both the printed and the rich display instead of converting a second time.
numpy_arr_from_df2 = df2.to_numpy()
print(numpy_arr_from_df2)
numpy_arr_from_df2  # notice the dtype is object

[[1.0 Timestamp('2025-08-20 00:00:00') 1.0 3 'bla' 'test' 'foo' 3]
 [1.0 Timestamp('2025-08-20 00:00:00') 1.0 3 'bla' 'train' 'foo' 3]
 [1.0 Timestamp('2025-08-20 00:00:00') 1.0 3 'bla' 'test' 'foo' 3]
 [1.0 Timestamp('2025-08-20 00:00:00') 1.0 3 'bla' 'train' 'foo' 3]]


array([[1.0, Timestamp('2025-08-20 00:00:00'), 1.0, 3, 'bla', 'test',
        'foo', 3],
       [1.0, Timestamp('2025-08-20 00:00:00'), 1.0, 3, 'bla', 'train',
        'foo', 3],
       [1.0, Timestamp('2025-08-20 00:00:00'), 1.0, 3, 'bla', 'test',
        'foo', 3],
       [1.0, Timestamp('2025-08-20 00:00:00'), 1.0, 3, 'bla', 'train',
        'foo', 3]], dtype=object)

In [16]:
# DataFrame.describe() shows a quick statistical summary of the data
print(df.describe())

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.749516 -0.054506 -0.261316  0.456759
std    1.144282  0.572835  1.428705  1.283475
min   -2.117772 -0.723084 -2.158338 -1.884947
25%   -1.224699 -0.236879 -1.214375  0.101905
50%   -1.177432 -0.188580 -0.306007  1.024419
75%   -0.167990 -0.028439  0.871903  1.229793
max    1.047504  1.001248  1.455082  1.484013


In [17]:
# Transposing the data: the row labels become the column labels and vice versa
df.T

Unnamed: 0,2025-08-20,2025-08-21,2025-08-22,2025-08-23,2025-08-24,2025-08-25
A,-1.238847,1.047504,-1.172611,-1.182253,-2.117772,0.166884
B,1.001248,-0.247127,-0.171024,-0.206135,-0.723084,0.01909
C,1.042154,-1.294779,1.455082,-0.973165,-2.158338,0.361152
D,1.15637,-0.161616,1.254268,1.484013,0.892467,-1.884947


In [18]:
# DataFrame.sort_index() sorts by the labels along an axis
df.sort_index(axis=0, ascending=True) # axis=0 refers to the rows, axis=1 to the columns

Unnamed: 0,A,B,C,D
2025-08-20,-1.238847,1.001248,1.042154,1.15637
2025-08-21,1.047504,-0.247127,-1.294779,-0.161616
2025-08-22,-1.172611,-0.171024,1.455082,1.254268
2025-08-23,-1.182253,-0.206135,-0.973165,1.484013
2025-08-24,-2.117772,-0.723084,-2.158338,0.892467
2025-08-25,0.166884,0.01909,0.361152,-1.884947


In [19]:
# DataFrame.sort_values() sorts by values; here by column B, largest first
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2025-08-20,-1.238847,1.001248,1.042154,1.15637
2025-08-25,0.166884,0.01909,0.361152,-1.884947
2025-08-22,-1.172611,-0.171024,1.455082,1.254268
2025-08-23,-1.182253,-0.206135,-0.973165,1.484013
2025-08-21,1.047504,-0.247127,-1.294779,-0.161616
2025-08-24,-2.117772,-0.723084,-2.158338,0.892467


In [20]:
# Getitem ([])
# For a DataFrame, passing a single label selects a column and yields a Series, equivalent to df.A
df["A"]

2025-08-20   -1.238847
2025-08-21    1.047504
2025-08-22   -1.172611
2025-08-23   -1.182253
2025-08-24   -2.117772
2025-08-25    0.166884
Freq: D, Name: A, dtype: float64

In [21]:
# For a DataFrame, passing a slice (:) selects the matching rows;
# an integer slice is positional and excludes the end position (rows 1 and 2 here)
df[1:3]

Unnamed: 0,A,B,C,D
2025-08-21,1.047504,-0.247127,-1.294779,-0.161616
2025-08-22,-1.172611,-0.171024,1.455082,1.254268


In [22]:
# Slicing by date labels selects the matching rows; both endpoints are included
df["20250820":"20250822"]

Unnamed: 0,A,B,C,D
2025-08-20,-1.238847,1.001248,1.042154,1.15637
2025-08-21,1.047504,-0.247127,-1.294779,-0.161616
2025-08-22,-1.172611,-0.171024,1.455082,1.254268


In [23]:
# Selection by label
# Selecting the row that matches a label returns that row as a Series
df.loc[dates[0]]

A   -1.238847
B    1.001248
C    1.042154
D    1.156370
Name: 2025-08-20 00:00:00, dtype: float64

In [24]:
# Selecting all rows (:) for a chosen list of column labels
df.loc[:,["A", "C"]]

Unnamed: 0,A,C
2025-08-20,-1.238847,1.042154
2025-08-21,1.047504,-1.294779
2025-08-22,-1.172611,1.455082
2025-08-23,-1.182253,-0.973165
2025-08-24,-2.117772,-2.158338
2025-08-25,0.166884,0.361152


In [25]:
# Selecting on both axes by label; for a label slice both endpoints are included
df.loc["20250820":"20250822", ["A", "B"]]

Unnamed: 0,A,B
2025-08-20,-1.238847,1.001248
2025-08-21,1.047504,-0.247127
2025-08-22,-1.172611,-0.171024


In [26]:
# Selecting a single row label and a single column label returns a scalar
df.loc[dates[0],"A"]

np.float64(-1.2388472744080081)

In [27]:
# DataFrame.at gives fast access to a single scalar by row and column label
df.at[dates[0],"A"]

np.float64(-1.2388472744080081)

In [28]:
# Selection by position
# A single integer selects the row at that position and returns it as a Series
df.iloc[0]

A   -1.238847
B    1.001248
C    1.042154
D    1.156370
Name: 2025-08-20 00:00:00, dtype: float64

In [29]:
# Integer slices select by position and exclude the end: rows 3-4, column 0
df.iloc[3:5,0:1]

Unnamed: 0,A
2025-08-23,-1.182253
2025-08-24,-2.117772


In [30]:
# Lists of integer positions pick out specific rows and columns
df.iloc[[1,2,4],[2,3]]

Unnamed: 0,C,D
2025-08-21,-1.294779,-0.161616
2025-08-22,1.455082,1.254268
2025-08-24,-2.158338,0.892467


In [31]:
# Slicing rows by position while keeping every column (:)
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2025-08-21,1.047504,-0.247127,-1.294779,-0.161616
2025-08-22,-1.172611,-0.171024,1.455082,1.254268


In [32]:
# Boolean indexing
# Selecting the rows where column A is greater than 0
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2025-08-21,1.047504,-0.247127,-1.294779,-0.161616
2025-08-25,0.166884,0.01909,0.361152,-1.884947


In [33]:
# Selecting values from a DataFrame where a boolean condition is met;
# entries that do not satisfy the condition come back as missing values
df[df > 0]

Unnamed: 0,A,B,C,D
2025-08-20,,1.001248,1.042154,1.15637
2025-08-21,1.047504,,,
2025-08-22,,,1.455082,1.254268
2025-08-23,,,,1.484013
2025-08-24,,,,0.892467
2025-08-25,0.166884,0.01909,0.361152,


In [34]:
# Using the isin() method for filtering
# NOTE: this rebinds df2 (earlier the mixed-dtype example frame) to a copy of
# df with an extra string column E, so the earlier df2 is no longer reachable.
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2025-08-20,-1.238847,1.001248,1.042154,1.15637,one
2025-08-21,1.047504,-0.247127,-1.294779,-0.161616,one
2025-08-22,-1.172611,-0.171024,1.455082,1.254268,two
2025-08-23,-1.182253,-0.206135,-0.973165,1.484013,three
2025-08-24,-2.117772,-0.723084,-2.158338,0.892467,four
2025-08-25,0.166884,0.01909,0.361152,-1.884947,three


In [35]:
# Keep only the rows whose E value is one of the listed strings
df2[df2["E"].isin(["two","three"])]

Unnamed: 0,A,B,C,D,E
2025-08-22,-1.172611,-0.171024,1.455082,1.254268,two
2025-08-23,-1.182253,-0.206135,-0.973165,1.484013,three
2025-08-25,0.166884,0.01909,0.361152,-1.884947,three


In [36]:
# Setting
# Assigning a Series as a new column aligns its values with the frame by index.
s1 = pd.Series(
    range(1, 7),
    index=pd.date_range(start="20250820", periods=6),
)
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2025-08-20,-1.238847,1.001248,1.042154,1.15637,1
2025-08-21,1.047504,-0.247127,-1.294779,-0.161616,2
2025-08-22,-1.172611,-0.171024,1.455082,1.254268,3
2025-08-23,-1.182253,-0.206135,-0.973165,1.484013,4
2025-08-24,-2.117772,-0.723084,-2.158338,0.892467,5
2025-08-25,0.166884,0.01909,0.361152,-1.884947,6


In [37]:
# Setting a value by label: the row labelled dates[0], column A
df.at[dates[0], "A"] = 0


In [38]:
# Setting a value by position: row 0, column 1 (column B)
df.iloc[0,1] = 2

In [50]:
# Setting a column by assigning a NumPy array.
# numpy is already imported as `np` in the notebook's first cell, and randn()
# already returns an ndarray, so no re-import or np.array() wrapper is needed.
# NOTE: no random seed is set here, so the values change on every run.
df["G"] = np.random.randn(len(df))

In [54]:
# Missing data
# reindex() lets you change, add or delete labels on a given axis. Here the
# row labels are kept as they are and a new column H is appended; it has no
# data, so its entries are missing.
df1 = df.reindex(index=dates, columns=[*df.columns, "H"])
df1

Unnamed: 0,A,B,C,D,F,G,H
2025-08-20,0.0,2.0,1.042154,1.15637,1,-2.222669,
2025-08-21,1.047504,-0.247127,-1.294779,-0.161616,2,0.053927,
2025-08-22,-1.172611,-0.171024,1.455082,1.254268,3,1.116849,
2025-08-23,-1.182253,-0.206135,-0.973165,1.484013,4,-1.524247,
2025-08-24,-2.117772,-0.723084,-2.158338,0.892467,5,-1.030409,
2025-08-25,0.166884,0.01909,0.361152,-1.884947,6,0.269081,


In [65]:
# The column added through reindex() holds no data, so every entry is np.nan.
# Selecting with a list of labels keeps the result two-dimensional (6 rows x 1 column).
df1_np = df1[["H"]].to_numpy()
df1_np

array([[nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan]])

SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (67846798.py, line 1)