## This notebook is all about pandas library

In [1]:
# importing libraries
import pandas as pd
import numpy as np

- we can create object series

In [2]:
s = pd.Series([1, 3, np.nan, 5, 6, 7, 8])
s

0    1.0
1    3.0
2    NaN
3    5.0
4    6.0
5    7.0
6    8.0
dtype: float64

- we can also create date series

In [3]:
dates = pd.date_range("20230101", periods=17)
dates

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12',
               '2023-01-13', '2023-01-14', '2023-01-15', '2023-01-16',
               '2023-01-17'],
              dtype='datetime64[ns]', freq='D')

 - we can create dataframe from the dates series provided above

In [4]:
df = pd.DataFrame(np.random.randn(17, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2023-01-01,0.135794,-0.245075,0.268088,1.914399
2023-01-02,0.0177,0.983685,0.989183,0.369986
2023-01-03,0.285547,-0.253261,1.067338,1.597522
2023-01-04,0.138256,-0.348066,-0.710089,-0.647587
2023-01-05,-0.427182,0.022962,0.135902,-0.057623
2023-01-06,-2.373313,-0.442792,-0.730618,-0.323181
2023-01-07,-1.265459,0.22885,-0.250315,-0.263162
2023-01-08,-0.695152,1.46351,-0.838501,0.132633
2023-01-09,0.877663,-0.089026,0.478956,0.121681
2023-01-10,-0.865017,-0.671494,1.488175,-1.52206


 - we have created a dataframe using dictionary data structure

In [5]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20230101"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2023-01-01,1.0,3,test,foo
1,1.0,2023-01-01,1.0,3,train,foo
2,1.0,2023-01-01,1.0,3,test,foo
3,1.0,2023-01-01,1.0,3,train,foo


- this is how we can check datatypes of a dataframe

In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

- head() method will show starting 5 rows

In [7]:
df.head()

Unnamed: 0,A,B,C,D
2023-01-01,0.135794,-0.245075,0.268088,1.914399
2023-01-02,0.0177,0.983685,0.989183,0.369986
2023-01-03,0.285547,-0.253261,1.067338,1.597522
2023-01-04,0.138256,-0.348066,-0.710089,-0.647587
2023-01-05,-0.427182,0.022962,0.135902,-0.057623


- we can also specify ammount of rows we want to see

In [8]:
df.head(3)

Unnamed: 0,A,B,C,D
2023-01-01,0.135794,-0.245075,0.268088,1.914399
2023-01-02,0.0177,0.983685,0.989183,0.369986
2023-01-03,0.285547,-0.253261,1.067338,1.597522


- tail() method will show us the bottom 5 rows and to see specific rows we can pass the ammount as argument 

In [9]:
df.tail()

Unnamed: 0,A,B,C,D
2023-01-13,1.32407,0.142533,-0.012777,0.115649
2023-01-14,-1.258389,0.095269,0.227353,-0.651294
2023-01-15,2.011899,1.571684,0.491421,-0.73992
2023-01-16,0.260325,-0.086345,1.228034,1.382564
2023-01-17,-2.082148,1.599879,1.174911,1.007868


In [10]:
df.tail(3)

Unnamed: 0,A,B,C,D
2023-01-15,2.011899,1.571684,0.491421,-0.73992
2023-01-16,0.260325,-0.086345,1.228034,1.382564
2023-01-17,-2.082148,1.599879,1.174911,1.007868


- we can also check for index using index variable

In [11]:
print(df.index)
print(df2.index)

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12',
               '2023-01-13', '2023-01-14', '2023-01-15', '2023-01-16',
               '2023-01-17'],
              dtype='datetime64[ns]', freq='D')
Int64Index([0, 1, 2, 3], dtype='int64')


- to convert our dataframe to array we can use to_numpy() method

In [12]:
array = df2.to_numpy()
array

array([[1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

- we can check type using type() method

In [13]:
type(array)

numpy.ndarray

- we can check summary of or dataframe
  - using describe() method

In [14]:
df.describe()

Unnamed: 0,A,B,C,D
count,17.0,17.0,17.0,17.0
mean,-0.270974,0.038749,0.204972,0.201939
std,1.127029,1.036421,0.82547,0.925481
min,-2.373313,-2.795437,-1.40154,-1.52206
25%,-0.865017,-0.348066,-0.250315,-0.323181
50%,-0.285114,-0.086345,0.227353,0.115649
75%,0.260325,0.22885,0.989183,1.007868
max,2.011899,1.599879,1.488175,1.914399


- to transpose data
  - use T variable to transpose it

In [15]:
df2.T

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2023-01-01 00:00:00,2023-01-01 00:00:00,2023-01-01 00:00:00,2023-01-01 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


- we can sort on the basis of index
  - using sort_index() method

In [16]:
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2023-01-17,-2.082148,1.599879,1.174911,1.007868
2023-01-16,0.260325,-0.086345,1.228034,1.382564
2023-01-15,2.011899,1.571684,0.491421,-0.73992
2023-01-14,-1.258389,0.095269,0.227353,-0.651294
2023-01-13,1.32407,0.142533,-0.012777,0.115649
2023-01-12,-0.285114,-0.518139,-0.120991,1.08525
2023-01-11,-0.406034,-2.795437,-1.40154,-0.089764
2023-01-10,-0.865017,-0.671494,1.488175,-1.52206
2023-01-09,0.877663,-0.089026,0.478956,0.121681
2023-01-08,-0.695152,1.46351,-0.838501,0.132633


- we can also use sort_values() method to sort on basis of particular row

In [17]:
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2023-01-17,-2.082148,1.599879,1.174911,1.007868
2023-01-15,2.011899,1.571684,0.491421,-0.73992
2023-01-08,-0.695152,1.46351,-0.838501,0.132633
2023-01-02,0.0177,0.983685,0.989183,0.369986
2023-01-07,-1.265459,0.22885,-0.250315,-0.263162
2023-01-13,1.32407,0.142533,-0.012777,0.115649
2023-01-14,-1.258389,0.095269,0.227353,-0.651294
2023-01-05,-0.427182,0.022962,0.135902,-0.057623
2023-01-16,0.260325,-0.086345,1.228034,1.382564
2023-01-09,0.877663,-0.089026,0.478956,0.121681


In [18]:
df["B"]

2023-01-01   -0.245075
2023-01-02    0.983685
2023-01-03   -0.253261
2023-01-04   -0.348066
2023-01-05    0.022962
2023-01-06   -0.442792
2023-01-07    0.228850
2023-01-08    1.463510
2023-01-09   -0.089026
2023-01-10   -0.671494
2023-01-11   -2.795437
2023-01-12   -0.518139
2023-01-13    0.142533
2023-01-14    0.095269
2023-01-15    1.571684
2023-01-16   -0.086345
2023-01-17    1.599879
Freq: D, Name: B, dtype: float64

In [19]:
# row wise selection
df[0:3]

Unnamed: 0,A,B,C,D
2023-01-01,0.135794,-0.245075,0.268088,1.914399
2023-01-02,0.0177,0.983685,0.989183,0.369986
2023-01-03,0.285547,-0.253261,1.067338,1.597522


In [20]:
df.loc[dates[2]]

A    0.285547
B   -0.253261
C    1.067338
D    1.597522
Name: 2023-01-03 00:00:00, dtype: float64

In [21]:
df.loc[:, ["C", "A"]]

Unnamed: 0,C,A
2023-01-01,0.268088,0.135794
2023-01-02,0.989183,0.0177
2023-01-03,1.067338,0.285547
2023-01-04,-0.710089,0.138256
2023-01-05,0.135902,-0.427182
2023-01-06,-0.730618,-2.373313
2023-01-07,-0.250315,-1.265459
2023-01-08,-0.838501,-0.695152
2023-01-09,0.478956,0.877663
2023-01-10,1.488175,-0.865017


- the below code will display us 2023-01-04 to 2023-01-08 data with column A and B values

In [22]:
df.loc["20230104":"20230108", ["A","B"]]

Unnamed: 0,A,B
2023-01-04,0.138256,-0.348066
2023-01-05,-0.427182,0.022962
2023-01-06,-2.373313,-0.442792
2023-01-07,-1.265459,0.22885
2023-01-08,-0.695152,1.46351


- the below code will display us 2023-01-04 and 2023-01-08 data with column A and B values

In [23]:
df.loc[["20230104","20230108"], ["A","B"]]

Unnamed: 0,A,B
2023-01-04,0.138256,-0.348066
2023-01-08,-0.695152,1.46351


In [24]:
df.at[dates[0], "A"]

0.13579385570793656

In [25]:
df[df["C"] > 1.2]

Unnamed: 0,A,B,C,D
2023-01-10,-0.865017,-0.671494,1.488175,-1.52206
2023-01-16,0.260325,-0.086345,1.228034,1.382564


In [26]:
[df[["A", "C"]] > 1.2]

[                A      C
 2023-01-01  False  False
 2023-01-02  False  False
 2023-01-03  False  False
 2023-01-04  False  False
 2023-01-05  False  False
 2023-01-06  False  False
 2023-01-07  False  False
 2023-01-08  False  False
 2023-01-09  False  False
 2023-01-10  False   True
 2023-01-11  False  False
 2023-01-12  False  False
 2023-01-13   True  False
 2023-01-14  False  False
 2023-01-15   True  False
 2023-01-16  False   True
 2023-01-17  False  False]

- to copy a dataframe
  - use copy() method

In [27]:
copy_df = df.copy()

- we can add an aditional column by the way down below

In [28]:
copy_df["E"] = ["one", "one", "two", "three", "four", "three","one", "one", "two", "three", "four", "three","one", "one", "two", "three", "four"]
copy_df

Unnamed: 0,A,B,C,D,E
2023-01-01,0.135794,-0.245075,0.268088,1.914399,one
2023-01-02,0.0177,0.983685,0.989183,0.369986,one
2023-01-03,0.285547,-0.253261,1.067338,1.597522,two
2023-01-04,0.138256,-0.348066,-0.710089,-0.647587,three
2023-01-05,-0.427182,0.022962,0.135902,-0.057623,four
2023-01-06,-2.373313,-0.442792,-0.730618,-0.323181,three
2023-01-07,-1.265459,0.22885,-0.250315,-0.263162,one
2023-01-08,-0.695152,1.46351,-0.838501,0.132633,one
2023-01-09,0.877663,-0.089026,0.478956,0.121681,two
2023-01-10,-0.865017,-0.671494,1.488175,-1.52206,three


In [29]:
df4 = df[["A", "B", "C", "D"]].mean()
df4

A   -0.270974
B    0.038749
C    0.204972
D    0.201939
dtype: float64

- below is the way you can actually find the mean and add that to column

In [38]:
me =[]
for a in range(0, 17):
    new  = df.loc[dates[a]].mean()
    me.append(new)

copy_df["mean"] = me
copy_df

Unnamed: 0,A,B,C,D,E,mean
2023-01-01,0.135794,-0.245075,0.268088,1.914399,one,0.518302
2023-01-02,0.0177,0.983685,0.989183,0.369986,one,0.590139
2023-01-03,0.285547,-0.253261,1.067338,1.597522,two,0.674287
2023-01-04,0.138256,-0.348066,-0.710089,-0.647587,three,-0.391871
2023-01-05,-0.427182,0.022962,0.135902,-0.057623,four,-0.081485
2023-01-06,-2.373313,-0.442792,-0.730618,-0.323181,three,-0.967476
2023-01-07,-1.265459,0.22885,-0.250315,-0.263162,one,-0.387521
2023-01-08,-0.695152,1.46351,-0.838501,0.132633,one,0.015622
2023-01-09,0.877663,-0.089026,0.478956,0.121681,two,0.347318
2023-01-10,-0.865017,-0.671494,1.488175,-1.52206,three,-0.392599


In [39]:
del copy_df["E"]

In [40]:
copy_df

Unnamed: 0,A,B,C,D,mean
2023-01-01,0.135794,-0.245075,0.268088,1.914399,0.518302
2023-01-02,0.0177,0.983685,0.989183,0.369986,0.590139
2023-01-03,0.285547,-0.253261,1.067338,1.597522,0.674287
2023-01-04,0.138256,-0.348066,-0.710089,-0.647587,-0.391871
2023-01-05,-0.427182,0.022962,0.135902,-0.057623,-0.081485
2023-01-06,-2.373313,-0.442792,-0.730618,-0.323181,-0.967476
2023-01-07,-1.265459,0.22885,-0.250315,-0.263162,-0.387521
2023-01-08,-0.695152,1.46351,-0.838501,0.132633,0.015622
2023-01-09,0.877663,-0.089026,0.478956,0.121681,0.347318
2023-01-10,-0.865017,-0.671494,1.488175,-1.52206,-0.392599
