In [2]:
import pandas as pd
import numpy as np

In [3]:
l = list('abcd')
l

['a', 'b', 'c', 'd']

In [5]:
idx = pd.Index([1,2,3,4])
idx

Index([1, 2, 3, 4], dtype='int64')

In [6]:
idx[0]

np.int64(1)

In [7]:
idx[1:4]

Index([2, 3, 4], dtype='int64')

In [9]:
idx[[0,3]]

Index([1, 4], dtype='int64')

In [10]:
idx[idx %4 ==0]

Index([4], dtype='int64')

In [11]:
print(type(idx))

<class 'pandas.core.indexes.base.Index'>


In [13]:
# index objects are immuatable
idx[0] = 10

TypeError: Index does not support mutable operations

In [16]:
idx_1 = pd.Index(['a', 'b', 'c'])
idx_2 = pd.Index(['c', 'd', 'e'])

idx_1.intersection(idx_2)

Index(['c'], dtype='object')

In [18]:
idx_1.union(idx_2)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [19]:
pd.Index(range(2,10))

RangeIndex(start=2, stop=10, step=1)

In [25]:
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
s

a    10
b    20
c    30
d    40
dtype: int64

In [21]:
s['a']

np.int64(10)

In [27]:
s['a':'c']  # slicing includes the endpoint

a    10
b    20
c    30
dtype: int64

In [28]:
s['a':'d':2]

a    10
c    30
dtype: int64

In [31]:
s, s.iloc[2], s.loc['a']

(a    10
 b    20
 c    30
 d    40
 dtype: int64,
 np.int64(30),
 np.int64(10))

In [32]:
s.index, s.values, s.items

(Index(['a', 'b', 'c', 'd'], dtype='object'),
 array([10, 20, 30, 40]),
 <bound method Series.items of a    10
 b    20
 c    30
 d    40
 dtype: int64>)

In [33]:
capitals = {
    "ID": "Delhi",
    "UK": "London",
    "IR": "Dublin",
    "FR": "Paris"
}

In [34]:
s = pd.Series(capitals)

In [35]:
s

ID     Delhi
UK    London
IR    Dublin
FR     Paris
dtype: object

In [36]:
s.index

Index(['ID', 'UK', 'IR', 'FR'], dtype='object')

In [37]:
s.values

array(['Delhi', 'London', 'Dublin', 'Paris'], dtype=object)

In [38]:
s.items()

<zip at 0x123787340>

In [39]:
list(s.items())

[('ID', 'Delhi'), ('UK', 'London'), ('IR', 'Dublin'), ('FR', 'Paris')]

In [40]:
areas = pd.Series(['US', 'UK', 'Lon', 'Syd','FR', 'Paris'], 
                  index=['Country', 'Country', 'City', 'City', 'Country', 'City',])

In [42]:
areas

Country       US
Country       UK
City         Lon
City         Syd
Country       FR
City       Paris
dtype: object

In [43]:
areas['City']

City      Lon
City      Syd
City    Paris
dtype: object

In [45]:
areas['Country']

Country    US
Country    UK
Country    FR
dtype: object

In [46]:
areas['City'] = 'London'

In [47]:
areas

Country        US
Country        UK
City       London
City       London
Country        FR
City       London
dtype: object

In [48]:
areas = pd.Series(['US', 'UK', 'Lon', 'Syd','FR', 'Paris'], 
                  index=['Country', 'Country', 'City', 'City', 'Country', 'City',])

In [49]:
areas

Country       US
Country       UK
City         Lon
City         Syd
Country       FR
City       Paris
dtype: object

In [54]:
areas.iloc[0] = 'USA'

In [55]:
areas

Country      USA
Country       UK
City         Lon
City         Syd
Country       FR
City       Paris
dtype: object

In [57]:
s = pd.Series([10,20,30], index=[1,2,3])
s

1    10
2    20
3    30
dtype: int64

In [58]:
s.iloc[2], s.loc[2]

(np.int64(30), np.int64(20))

In [59]:
areas = pd.Series(['US', 'UK', 'Lon', 'Syd','FR', 'Paris'], 
                  index=['Country', 'Country', 'City', 'City', 'Country', 'City',])
areas

Country       US
Country       UK
City         Lon
City         Syd
Country       FR
City       Paris
dtype: object

In [61]:
areas[areas=='US']

Country    US
dtype: object

In [62]:
s = pd.Series([10,20,30], index=list('abc'), name='test')
s

a    10
b    20
c    30
Name: test, dtype: int64

In [65]:
s.index

Index(['a', 'b', 'c'], dtype='object')

In [70]:
cols = pd.Index(['ID', 'UK', 'FR', 'GR', 'ES', 'US', 'CN'])

In [81]:
fullName = pd.Series(['India', 'United Kingdom', 'France', 'Germany', 'Spain', 'United Sates', 'China'], index=columns, name='name')

In [82]:
population = pd.Series([200, 20, 30, 25, 40, 100, 400], index=columns, name='pop')

In [83]:
area = pd.Series([10, 20, 30, 45, 50, 600, 700], index=columns, name='area')

In [84]:
data = pd.DataFrame([fullName,population,area])

In [85]:
data

Unnamed: 0,ID,UK,FR,GR,ES,US,CN
name,India,United Kingdom,France,Germany,Spain,United Sates,China
pop,200,20,30,25,40,100,400
area,10,20,30,45,50,600,700


In [86]:
data.transpose()

Unnamed: 0,name,pop,area
ID,India,200,10
UK,United Kingdom,20,20
FR,France,30,30
GR,Germany,25,45
ES,Spain,40,50
US,United Sates,100,600
CN,China,400,700


In [92]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, name to area
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      3 non-null      object
 1   UK      3 non-null      object
 2   FR      3 non-null      object
 3   GR      3 non-null      object
 4   ES      3 non-null      object
 5   US      3 non-null      object
 6   CN      3 non-null      object
dtypes: object(7)
memory usage: 300.0+ bytes


In [94]:
counties = {
    'The Bronx': 'Bronx',
    'Brooklyn': 'Kings',
    'Manhattan': 'New York',
    'Queens': 'Queens',
    'Staten Island': 'Richmond'
}
populations = {
    # note how the keys are not necessarily in the same order
    'Manhattan': 1_628_706,
    'Queens': 2_253_858,
    'Staten Island': 476_143,
    'The Bronx': 1_418_207,
    'Brooklyn': 2_559_903
}
gdp = {
    'The Bronx': 42.695,
    'Brooklyn': 91.559,
    'Manhattan': 600.244,
    'Queens': 93.310,
    'Staten Island': 14.514
}
areas = {
    'The Bronx': 2.10,
    'Brooklyn': 70.82,
    'Manhattan': 22.83,
    'Queens': 108.53,
    'Staten Island': 58.37
}

d = {
    'county': counties,
    'population': populations,
    'gpd': gdp,
    'area': areas
}

new_york = pd.DataFrame(d)
new_york

Unnamed: 0,county,population,gpd,area
The Bronx,Bronx,1418207,42.695,2.1
Brooklyn,Kings,2559903,91.559,70.82
Manhattan,New York,1628706,600.244,22.83
Queens,Queens,2253858,93.31,108.53
Staten Island,Richmond,476143,14.514,58.37


In [95]:
new_york = pd.DataFrame([counties, populations, gdp, areas])
new_york

Unnamed: 0,The Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Bronx,Kings,New York,Queens,Richmond
1,1418207,2559903,1628706,2253858,476143
2,42.695,91.559,600.244,93.31,14.514
3,2.1,70.82,22.83,108.53,58.37


In [96]:
new_york.rename(index={0:"county", 1:"population", 2:"gdp", 3:"area"})

Unnamed: 0,The Bronx,Brooklyn,Manhattan,Queens,Staten Island
county,Bronx,Kings,New York,Queens,Richmond
population,1418207,2559903,1628706,2253858,476143
gdp,42.695,91.559,600.244,93.31,14.514
area,2.1,70.82,22.83,108.53,58.37


In [97]:
new_york

Unnamed: 0,The Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Bronx,Kings,New York,Queens,Richmond
1,1418207,2559903,1628706,2253858,476143
2,42.695,91.559,600.244,93.31,14.514
3,2.1,70.82,22.83,108.53,58.37


In [98]:
new_york = new_york.transpose()
new_york

Unnamed: 0,0,1,2,3
The Bronx,Bronx,1418207,42.695,2.1
Brooklyn,Kings,2559903,91.559,70.82
Manhattan,New York,1628706,600.244,22.83
Queens,Queens,2253858,93.31,108.53
Staten Island,Richmond,476143,14.514,58.37


In [99]:
new_york.rename(columns={0:"county", 1:"population", 2:"gdp", 3:"area"})

Unnamed: 0,county,population,gdp,area
The Bronx,Bronx,1418207,42.695,2.1
Brooklyn,Kings,2559903,91.559,70.82
Manhattan,New York,1628706,600.244,22.83
Queens,Queens,2253858,93.31,108.53
Staten Island,Richmond,476143,14.514,58.37


In [100]:
burroughs = ['The Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island']
counties = ['Bronx', 'Kings', 'New York', 'Queens', 'Richmond']
populations = [1_418_207, 2_559_903, 1_628_706, 2_253_858, 476_143]
gdp = [42.695, 91.559, 600.244, 93.310, 14.514]
areas = [42.10, 70.82, 22.83, 108.53, 58.37]

In [101]:
data = [burroughs, counties, populations, gdp, areas]
data

[['The Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
 ['Bronx', 'Kings', 'New York', 'Queens', 'Richmond'],
 [1418207, 2559903, 1628706, 2253858, 476143],
 [42.695, 91.559, 600.244, 93.31, 14.514],
 [42.1, 70.82, 22.83, 108.53, 58.37]]

In [103]:
new_york = pd.DataFrame(data=data, index=['burroughs', 'county', 'population', 'gdp', 'area'])
new_york

Unnamed: 0,0,1,2,3,4
burroughs,The Bronx,Brooklyn,Manhattan,Queens,Staten Island
county,Bronx,Kings,New York,Queens,Richmond
population,1418207,2559903,1628706,2253858,476143
gdp,42.695,91.559,600.244,93.31,14.514
area,42.1,70.82,22.83,108.53,58.37


In [104]:
new_york = new_york.transpose()
new_york

Unnamed: 0,burroughs,county,population,gdp,area
0,The Bronx,Bronx,1418207,42.695,42.1
1,Brooklyn,Kings,2559903,91.559,70.82
2,Manhattan,New York,1628706,600.244,22.83
3,Queens,Queens,2253858,93.31,108.53
4,Staten Island,Richmond,476143,14.514,58.37


In [105]:
new_york.set_index('burroughs')

Unnamed: 0_level_0,county,population,gdp,area
burroughs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Bronx,Bronx,1418207,42.695,42.1
Brooklyn,Kings,2559903,91.559,70.82
Manhattan,New York,1628706,600.244,22.83
Queens,Queens,2253858,93.31,108.53
Staten Island,Richmond,476143,14.514,58.37


In [106]:
new_york.set_index('county')

Unnamed: 0_level_0,burroughs,population,gdp,area
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bronx,The Bronx,1418207,42.695,42.1
Kings,Brooklyn,2559903,91.559,70.82
New York,Manhattan,1628706,600.244,22.83
Queens,Queens,2253858,93.31,108.53
Richmond,Staten Island,476143,14.514,58.37


In [107]:
new_york = new_york.set_index('burroughs')

In [108]:
new_york.index

Index(['The Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'], dtype='object', name='burroughs')

In [109]:
new_york.columns

Index(['county', 'population', 'gdp', 'area'], dtype='object')

In [110]:
new_df = new_york.drop(columns='county')
new_df

Unnamed: 0_level_0,population,gdp,area
burroughs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Bronx,1418207,42.695,42.1
Brooklyn,2559903,91.559,70.82
Manhattan,1628706,600.244,22.83
Queens,2253858,93.31,108.53
Staten Island,476143,14.514,58.37


In [111]:
new_df = new_df.drop(index=['Brooklyn', 'Queens'])
new_df

Unnamed: 0_level_0,population,gdp,area
burroughs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Bronx,1418207,42.695,42.1
Manhattan,1628706,600.244,22.83
Staten Island,476143,14.514,58.37


# selecting data from dataframes

In [112]:
arr = np.arange(9).reshape(3,3)
df = pd.DataFrame(
    arr,
    columns = ['c1', 'c2', 'c3'],
    index = ['r1', 'r2', 'r3']
)
df

Unnamed: 0,c1,c2,c3
r1,0,1,2
r2,3,4,5
r3,6,7,8


In [113]:
df.index

Index(['r1', 'r2', 'r3'], dtype='object')

In [114]:
type(df['c2'])

pandas.core.series.Series

In [115]:
df['c2']

r1    1
r2    4
r3    7
Name: c2, dtype: int64

In [117]:
df.values

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [119]:
df.iloc[1,2], df.loc['r2', 'c3']

(np.int64(5), np.int64(5))

In [121]:
df.loc['r1':'r2',:]

Unnamed: 0,c1,c2,c3
r1,0,1,2
r2,3,4,5


In [122]:
df.iloc[:,[0,2]]

Unnamed: 0,c1,c3
r1,0,2
r2,3,5
r3,6,8


# Missing Value

In [127]:
s = pd.Series([1,2,np.nan, None])
s

0    1.0
1    2.0
2    NaN
3    NaN
dtype: float64

In [130]:
s[~s.isnull()]

0    1.0
1    2.0
dtype: float64

In [132]:
a = np.array([1,2, np.nan, None])
a

array([1, 2, nan, None], dtype=object)

In [134]:
# numpy doesnt have this method
a.isnull()

AttributeError: 'numpy.ndarray' object has no attribute 'isnull'

In [135]:
s.dropna()

0    1.0
1    2.0
dtype: float64

In [139]:
s.fillna(method='ffill')

  s.fillna(method='ffill')


0    1.0
1    2.0
2    2.0
3    2.0
dtype: float64

In [149]:
s = pd.Series([1,2,3,None,13, 21, np.nan,42])
s

0     1.0
1     2.0
2     3.0
3     NaN
4    13.0
5    21.0
6     NaN
7    42.0
dtype: float64

In [150]:
s.interpolate(method='polynomial', order=2)

0     1.000000
1     2.000000
2     3.000000
3     6.754508
4    13.000000
5    21.000000
6    30.671290
7    42.000000
dtype: float64

In [152]:
s.interpolate(method='linear')

0     1.0
1     2.0
2     3.0
3     8.0
4    13.0
5    21.0
6    31.5
7    42.0
dtype: float64

In [153]:
d = {
    'col1': {'row1': 1, 'row2': 10, 'row3': 100, 'row4': 1000, 'row5': 10000},
    'col2': {'row1': 2, 'row2': None, 'row3': None, 'row4': 2000, 'row5': 20000},
    'col3': {'row1': 3, 'row2': 30, 'row3': 300, 'row4': None, 'row5': 40000},
    'col4': {'row1': 4, 'row2': 40, 'row3': 400, 'row4': 4000, 'row5': 40000}
}

df = pd.DataFrame(d)
df

Unnamed: 0,col1,col2,col3,col4
row1,1,2.0,3.0,4
row2,10,,30.0,40
row3,100,,300.0,400
row4,1000,2000.0,,4000
row5,10000,20000.0,40000.0,40000


In [154]:
df.isnull()

Unnamed: 0,col1,col2,col3,col4
row1,False,False,False,False
row2,False,True,False,False
row3,False,True,False,False
row4,False,False,True,False
row5,False,False,False,False


In [156]:
df.fillna(0)

Unnamed: 0,col1,col2,col3,col4
row1,1,2.0,3.0,4
row2,10,0.0,30.0,40
row3,100,0.0,300.0,400
row4,1000,2000.0,0.0,4000
row5,10000,20000.0,40000.0,40000


In [157]:
df.fillna(method='ffill', axis=0)

  df.fillna(method='ffill', axis=0)


Unnamed: 0,col1,col2,col3,col4
row1,1,2.0,3.0,4
row2,10,2.0,30.0,40
row3,100,2.0,300.0,400
row4,1000,2000.0,300.0,4000
row5,10000,20000.0,40000.0,40000


In [158]:
df.fillna(method='ffill', axis=1)

  df.fillna(method='ffill', axis=1)


Unnamed: 0,col1,col2,col3,col4
row1,1.0,2.0,3.0,4.0
row2,10.0,10.0,30.0,40.0
row3,100.0,100.0,300.0,400.0
row4,1000.0,2000.0,2000.0,4000.0
row5,10000.0,20000.0,40000.0,40000.0


In [159]:
df

Unnamed: 0,col1,col2,col3,col4
row1,1,2.0,3.0,4
row2,10,,30.0,40
row3,100,,300.0,400
row4,1000,2000.0,,4000
row5,10000,20000.0,40000.0,40000


In [160]:
df.dropna(axis=0)

Unnamed: 0,col1,col2,col3,col4
row1,1,2.0,3.0,4
row5,10000,20000.0,40000.0,40000


In [161]:
df.dropna(axis=1)

Unnamed: 0,col1,col4
row1,1,4
row2,10,40
row3,100,400
row4,1000,4000
row5,10000,40000
