In [1]:
import numpy as np

import pandas as pd

# Series / DataFrame creation

In [2]:
pd.Series([1, 3, 5, np.nan, 6, 8])

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
df = pd.DataFrame(np.random.randint(0,16,size=(6,4)), index=dates, columns=list("ABCD"))

df

Unnamed: 0,A,B,C,D
2013-01-01,1,3,1,3
2013-01-02,2,10,9,10
2013-01-03,13,15,10,14
2013-01-04,4,6,12,2
2013-01-05,2,14,4,4
2013-01-06,9,8,7,7


In [15]:

df =pd.DataFrame ({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
})
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [16]:
df.dtypes

Name    object
Age      int64
City    object
dtype: object

In [17]:
df.head(2)

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles


In [18]:
df.tail(2)

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston


In [19]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [20]:
df.columns

Index(['Name', 'Age', 'City'], dtype='object')

In [21]:
df.to_numpy()

array([['Alice', 25, 'New York'],
       ['Bob', 30, 'Los Angeles'],
       ['Charlie', 35, 'Chicago'],
       ['David', 40, 'Houston']], dtype=object)

In [22]:
df.describe()

Unnamed: 0,Age
count,4.0
mean,32.5
std,6.454972
min,25.0
25%,28.75
50%,32.5
75%,36.25
max,40.0


In [23]:
df.T

Unnamed: 0,0,1,2,3
Name,Alice,Bob,Charlie,David
Age,25,30,35,40
City,New York,Los Angeles,Chicago,Houston


In [26]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,Name,City,Age
0,Alice,New York,25
1,Bob,Los Angeles,30
2,Charlie,Chicago,35
3,David,Houston,40


In [27]:
df.sort_values(by="Age")

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


# Selection

In [30]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [29]:
df['Age'][1]

30

In [31]:
#selects matching rows
df[0:2]

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles


In [47]:
df.iloc[1,:]

Name            Bob
Age              30
City    Los Angeles
Name: 1, dtype: object

In [51]:
df.iloc[[1,2],2]

1    Los Angeles
2        Chicago
Name: City, dtype: object

In [50]:
df.loc[2,['Age','City']]

Age          35
City    Chicago
Name: 2, dtype: object

# Boolean indexing

In [52]:
df['Age']>30

0    False
1    False
2     True
3     True
Name: Age, dtype: bool

In [53]:
df[df['Age']>30]

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston


In [54]:
df[df.Age.isin([25,30])]

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles


# Missing data

In [56]:
df['Sex']=pd.Series(['f','m',pd.NA, pd.NA])

In [57]:
df

Unnamed: 0,Name,Age,City,Sex
0,Alice,25,New York,f
1,Bob,30,Los Angeles,m
2,Charlie,35,Chicago,
3,David,40,Houston,


In [60]:
df.dropna()

Unnamed: 0,Name,Age,City,Sex
0,Alice,25,New York,f
1,Bob,30,Los Angeles,m


In [62]:
df.fillna('unknown')

Unnamed: 0,Name,Age,City,Sex
0,Alice,25,New York,f
1,Bob,30,Los Angeles,m
2,Charlie,35,Chicago,unknown
3,David,40,Houston,unknown


In [63]:
pd.isna(df)

Unnamed: 0,Name,Age,City,Sex
0,False,False,False,False
1,False,False,False,False
2,False,False,False,True
3,False,False,False,True


# Operations


In [68]:
df['Score']=pd.Series([3,4,5,6])
df

Unnamed: 0,Name,Age,City,Sex,Score
0,Alice,25,New York,f,3
1,Bob,30,Los Angeles,m,4
2,Charlie,35,Chicago,,5
3,David,40,Houston,,6


In [69]:
df.Age[df.Score>4].mean()

37.5

# User defined functions

In [72]:
df.Score.agg(lambda x: np.sin(x)* 5.6)

0    0.790272
1   -4.238094
2   -5.369976
3   -1.564727
Name: Score, dtype: float64

In [73]:
df.Age.value_counts()

Age
25    1
30    1
35    1
40    1
Name: count, dtype: int64

# Grouping

In [74]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,0.145258,-0.157543
1,bar,one,0.160555,0.773723
2,foo,two,-0.048364,-1.380843
3,bar,three,-1.345363,-1.181569
4,foo,two,1.510227,0.710195
5,bar,two,0.013663,-0.084232
6,foo,one,-0.626369,0.062082
7,foo,three,-0.397056,1.064143


In [75]:
df.groupby("A")[["C", "D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.171145,-0.492078
foo,0.583696,0.298033


# Categoricals


In [77]:
df = pd.DataFrame(
    {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
)
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [81]:
df["grade"] = df["raw_grade"].astype("category")


new_categories = ["very good", "good", "very bad"]

df["grade"] = df["grade"].cat.rename_categories(new_categories)
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad
