In [2]:
import numpy as np
import pandas as pd

# Creating a Series by passing a list of values, letting pandas create a default RangeIndex:

In [3]:
# Build a Series from a plain Python list; the np.nan entry marks a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

# Creating a DataFrame by passing a NumPy array with a datetime index using date_range() and labeled columns:

In [15]:
# Six consecutive calendar days starting 2024-01-01 (daily frequency is the default).
dates = pd.date_range(start="20240101", periods=6)
dates

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

In [16]:
# Build a 6x4 frame of standard-normal draws, indexed by `dates`, with columns A-D.
# NOTE(review): no seed is set in the visible cells, so a fresh run will presumably
# produce different numbers than the outputs recorded in this notebook.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2024-01-01,-0.457591,1.279257,0.791115,-0.100397
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-04,-0.736864,0.848341,0.559525,-0.922302
2024-01-05,0.183718,0.877512,-0.556897,0.535526
2024-01-06,-0.494067,-0.03795,1.305826,0.248513


# Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values.

In [13]:
# Mixed-dtype frame built from a dict: each key becomes a column label.
# The scalar entries ("A", "B", "F") are repeated on every row; "C" is the only
# entry that carries its own index.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [14]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [17]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype        
---  ------  --------------  -----        
 0   A       4 non-null      float64      
 1   B       4 non-null      datetime64[s]
 2   C       4 non-null      float32      
 3   D       4 non-null      int32        
 4   E       4 non-null      category     
 5   F       4 non-null      object       
dtypes: category(1), datetime64[s](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


# Viewing data

In [18]:
df.head()

Unnamed: 0,A,B,C,D
2024-01-01,-0.457591,1.279257,0.791115,-0.100397
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-04,-0.736864,0.848341,0.559525,-0.922302
2024-01-05,0.183718,0.877512,-0.556897,0.535526


In [19]:
df.tail()

Unnamed: 0,A,B,C,D
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-04,-0.736864,0.848341,0.559525,-0.922302
2024-01-05,0.183718,0.877512,-0.556897,0.535526
2024-01-06,-0.494067,-0.03795,1.305826,0.248513


In [20]:
df.index

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
df2.index

Index([0, 1, 2, 3], dtype='int64')

In [22]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

# Return a NumPy representation of the underlying data with DataFrame.to_numpy() without the index or column labels:

In [24]:
df

Unnamed: 0,A,B,C,D
2024-01-01,-0.457591,1.279257,0.791115,-0.100397
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-04,-0.736864,0.848341,0.559525,-0.922302
2024-01-05,0.183718,0.877512,-0.556897,0.535526
2024-01-06,-0.494067,-0.03795,1.305826,0.248513


In [23]:
df.to_numpy()

array([[-0.4575911 ,  1.27925749,  0.79111465, -0.10039733],
       [-0.30183731,  0.14804498,  1.04645664, -0.71131704],
       [ 0.54355805,  0.96247915,  2.33962796,  0.94435925],
       [-0.73686407,  0.84834075,  0.55952513, -0.922302  ],
       [ 0.18371836,  0.87751227, -0.55689707,  0.53552648],
       [-0.4940665 , -0.03795031,  1.30582583,  0.2485133 ]])

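# NumPy arrays have one dtype for the entire array while pandas DataFrames have one dtype per column. When a DataFrame has mixed column dtypes, to_numpy() must fall back to a single dtype that can hold every value (here, object).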

In [25]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [26]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

# describe() shows a quick statistical summary of your data:

In [27]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.210514,0.679614,0.914276,-0.000936
std,0.479766,0.510819,0.949503,0.722087
min,-0.736864,-0.03795,-0.556897,-0.922302
25%,-0.484948,0.323119,0.617423,-0.558587
50%,-0.379714,0.862927,0.918786,0.074058
75%,0.062329,0.941237,1.240984,0.463773
max,0.543558,1.279257,2.339628,0.944359


# Transposing your data:

In [31]:
df

Unnamed: 0,A,B,C,D
2024-01-01,-0.457591,1.279257,0.791115,-0.100397
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-04,-0.736864,0.848341,0.559525,-0.922302
2024-01-05,0.183718,0.877512,-0.556897,0.535526
2024-01-06,-0.494067,-0.03795,1.305826,0.248513


In [28]:
df.T

Unnamed: 0,2024-01-01,2024-01-02,2024-01-03,2024-01-04,2024-01-05,2024-01-06
A,-0.457591,-0.301837,0.543558,-0.736864,0.183718,-0.494067
B,1.279257,0.148045,0.962479,0.848341,0.877512,-0.03795
C,0.791115,1.046457,2.339628,0.559525,-0.556897,1.305826
D,-0.100397,-0.711317,0.944359,-0.922302,0.535526,0.248513


# DataFrame.sort_index() sorts by an axis:

In [32]:
df

Unnamed: 0,A,B,C,D
2024-01-01,-0.457591,1.279257,0.791115,-0.100397
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-04,-0.736864,0.848341,0.559525,-0.922302
2024-01-05,0.183718,0.877512,-0.556897,0.535526
2024-01-06,-0.494067,-0.03795,1.305826,0.248513


In [33]:
df.sort_index(axis=1, ascending=False)
# axis=1 means the columns are reordered (sorted by column label)
# axis=0 means the rows are reordered (sorted by index label)

Unnamed: 0,D,C,B,A
2024-01-01,-0.100397,0.791115,1.279257,-0.457591
2024-01-02,-0.711317,1.046457,0.148045,-0.301837
2024-01-03,0.944359,2.339628,0.962479,0.543558
2024-01-04,-0.922302,0.559525,0.848341,-0.736864
2024-01-05,0.535526,-0.556897,0.877512,0.183718
2024-01-06,0.248513,1.305826,-0.03795,-0.494067


# DataFrame.sort_values() sorts by values:

In [38]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2024-01-06,-0.494067,-0.03795,1.305826,0.248513
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-04,-0.736864,0.848341,0.559525,-0.922302
2024-01-05,0.183718,0.877512,-0.556897,0.535526
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-01,-0.457591,1.279257,0.791115,-0.100397


# Get item ([])

In [40]:
df

Unnamed: 0,A,B,C,D
2024-01-01,-0.457591,1.279257,0.791115,-0.100397
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-04,-0.736864,0.848341,0.559525,-0.922302
2024-01-05,0.183718,0.877512,-0.556897,0.535526
2024-01-06,-0.494067,-0.03795,1.305826,0.248513


In [43]:
df["A"]

2024-01-01   -0.457591
2024-01-02   -0.301837
2024-01-03    0.543558
2024-01-04   -0.736864
2024-01-05    0.183718
2024-01-06   -0.494067
Freq: D, Name: A, dtype: float64

In [41]:
df[["A", "B"]]

Unnamed: 0,A,B
2024-01-01,-0.457591,1.279257
2024-01-02,-0.301837,0.148045
2024-01-03,0.543558,0.962479
2024-01-04,-0.736864,0.848341
2024-01-05,0.183718,0.877512
2024-01-06,-0.494067,-0.03795


# For a DataFrame, passing a slice : selects matching rows:

In [44]:
df[0:3]

Unnamed: 0,A,B,C,D
2024-01-01,-0.457591,1.279257,0.791115,-0.100397
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359


In [45]:
df["20240102":"20240104"]

Unnamed: 0,A,B,C,D
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-04,-0.736864,0.848341,0.559525,-0.922302


# To select both rows and columns by position, use DataFrame.iloc[row_range, column_range]:

In [46]:
df.iloc[0:3,0:3]

Unnamed: 0,A,B,C
2024-01-01,-0.457591,1.279257,0.791115
2024-01-02,-0.301837,0.148045,1.046457
2024-01-03,0.543558,0.962479,2.339628


In [48]:
df.iloc[:,0:3]
# a bare ':' in the row position selects all rows

Unnamed: 0,A,B,C
2024-01-01,-0.457591,1.279257,0.791115
2024-01-02,-0.301837,0.148045,1.046457
2024-01-03,0.543558,0.962479,2.339628
2024-01-04,-0.736864,0.848341,0.559525
2024-01-05,0.183718,0.877512,-0.556897
2024-01-06,-0.494067,-0.03795,1.305826


# Selecting all rows (:) with selected column labels:

In [51]:
df.loc[:,["B","C"]]
# pass a list of labels to pick whichever columns you want

Unnamed: 0,B,C
2024-01-01,1.279257,0.791115
2024-01-02,0.148045,1.046457
2024-01-03,0.962479,2.339628
2024-01-04,0.848341,0.559525
2024-01-05,0.877512,-0.556897
2024-01-06,-0.03795,1.305826


# Selection by label

In [47]:
df.loc[dates[0]]

A   -0.457591
B    1.279257
C    0.791115
D   -0.100397
Name: 2024-01-01 00:00:00, dtype: float64

In [54]:
df.iloc[3]
# iloc is positional: this returns the values of the row at integer position 3 (the fourth row)

A   -0.736864
B    0.848341
C    0.559525
D   -0.922302
Name: 2024-01-04 00:00:00, dtype: float64

# For label slicing, both endpoints are included:

In [61]:
df.loc["20240102":"20240104", ["A", "B"]]

Unnamed: 0,A,B
2024-01-02,-0.301837,0.148045
2024-01-03,0.543558,0.962479
2024-01-04,-0.736864,0.848341


# Selecting a single row and column label returns a scalar:

In [62]:
df.loc[dates[0], "A"]

np.float64(-0.4575911016388059)

# For getting fast access to a scalar (equivalent to the prior method):

In [63]:
df.at[dates[0], "A"]

np.float64(-0.4575911016388059)

# Selection by position

In [64]:
# Select via the position of the passed integers:
df.iloc[3]

A   -0.736864
B    0.848341
C    0.559525
D   -0.922302
Name: 2024-01-04 00:00:00, dtype: float64

# Integer slices act similarly to NumPy/Python:

In [65]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2024-01-04,-0.736864,0.848341
2024-01-05,0.183718,0.877512


# Lists of integer position locations:

In [66]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2024-01-02,-0.301837,1.046457
2024-01-03,0.543558,2.339628
2024-01-05,0.183718,-0.556897


# For slicing rows explicitly:

In [67]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2024-01-02,-0.301837,0.148045,1.046457,-0.711317
2024-01-03,0.543558,0.962479,2.339628,0.944359


# For slicing columns explicitly:

In [68]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2024-01-01,1.279257,0.791115
2024-01-02,0.148045,1.046457
2024-01-03,0.962479,2.339628
2024-01-04,0.848341,0.559525
2024-01-05,0.877512,-0.556897
2024-01-06,-0.03795,1.305826


# For getting a value explicitly:

In [69]:
df.iloc[1, 1]

np.float64(0.1480449822635994)

# For getting fast access to a scalar (equivalent to the prior method):

In [70]:
df.iat[1, 1]

np.float64(0.1480449822635994)

# Select rows where df.A is greater than 0.

In [71]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-05,0.183718,0.877512,-0.556897,0.535526


# Selecting values from a DataFrame where a boolean condition is met:

In [72]:
df[df > 0]

Unnamed: 0,A,B,C,D
2024-01-01,,1.279257,0.791115,
2024-01-02,,0.148045,1.046457,
2024-01-03,0.543558,0.962479,2.339628,0.944359
2024-01-04,,0.848341,0.559525,
2024-01-05,0.183718,0.877512,,0.535526
2024-01-06,,,1.305826,0.248513


# Using isin() method for filtering:

In [73]:
# NOTE: this rebinds df2. From here on it is a copy of `df` (the random 6x4 frame),
# no longer the dict-built mixed-dtype frame that df2 referred to earlier.
df2 = df.copy()

In [74]:
df2["E"] = ["one", "one", "two", "three", "four", "three"]

In [75]:
df2

Unnamed: 0,A,B,C,D,E
2024-01-01,-0.457591,1.279257,0.791115,-0.100397,one
2024-01-02,-0.301837,0.148045,1.046457,-0.711317,one
2024-01-03,0.543558,0.962479,2.339628,0.944359,two
2024-01-04,-0.736864,0.848341,0.559525,-0.922302,three
2024-01-05,0.183718,0.877512,-0.556897,0.535526,four
2024-01-06,-0.494067,-0.03795,1.305826,0.248513,three


In [76]:
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2024-01-03,0.543558,0.962479,2.339628,0.944359,two
2024-01-05,0.183718,0.877512,-0.556897,0.535526,four
