In [1]:
import numpy as np
import pandas as pd

# Replacing Values

In [2]:
# Load the clothing table (Type / Color / Size columns); the bare name on the
# last line renders the frame as the cell output.
clothing_csv = 'Data/clothing.csv'
df = pd.read_csv(clothing_csv)
df

Unnamed: 0,Type,Color,Size
0,Tshirt,Black,M
1,Socks,Red,L
2,Sweater,Blue,S
3,Sweater,Bluex,M
4,Tshirt,White,M
5,Sweater,Carmine,XL
6,Socks,Red,L
7,Tshirt,Black,M
8,Socks,Brown,S


In [3]:
# Normalise the non-standard colour labels in one pass.
# A flat mapping handed to ``replace`` swaps every key for its value wherever
# it occurs in the frame. The result is reassigned rather than mutated with
# ``inplace=True``, so the statement stays chainable and the cell does not
# silently alter an object that an earlier cell already displayed.
color_fixes = {'Carmine': 'Red', 'Bluex': 'Blue'}
df = df.replace(color_fixes)
df

Unnamed: 0,Type,Color,Size
0,Tshirt,Black,M
1,Socks,Red,L
2,Sweater,Blue,S
3,Sweater,Blue,M
4,Tshirt,White,M
5,Sweater,Red,XL
6,Socks,Red,L
7,Tshirt,Black,M
8,Socks,Brown,S


# Removing Duplicates

In [4]:
# Boolean mask marking rows that repeat an earlier row across all columns;
# the first occurrence of each row stays False.
df.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8    False
dtype: bool

In [5]:
# Preview the frame with repeated rows removed, keeping each first occurrence.
# The result is only displayed -- `df` itself is left untouched here.
df.drop_duplicates(keep='first')

Unnamed: 0,Type,Color,Size
0,Tshirt,Black,M
1,Socks,Red,L
2,Sweater,Blue,S
3,Sweater,Blue,M
4,Tshirt,White,M
5,Sweater,Red,XL
8,Socks,Brown,S


In [6]:
# Drop repeated rows for real, this time keeping the *last* occurrence of each
# duplicate group (so the earlier copies are the ones that disappear).
# Reassigning instead of using ``inplace=True`` keeps the statement chainable
# and makes the data lineage explicit.
df = df.drop_duplicates(keep='last')
df

Unnamed: 0,Type,Color,Size
2,Sweater,Blue,S
3,Sweater,Blue,M
4,Tshirt,White,M
5,Sweater,Red,XL
6,Socks,Red,L
7,Tshirt,Black,M
8,Socks,Brown,S


# Renaming Axis Indexes

In [7]:
# Load the books table with pandas' default integer index; `df` now refers to
# this new dataset rather than the clothing frame used above.
books_csv = 'Data/books.csv'
df = pd.read_csv(books_csv)
df

Unnamed: 0,ID,Title,Author,PublicationDate
0,001276A,The Rise of the Falcon,John Admiral,25-Apr-2018
1,023125B,Controlled mind,Robert Greens,28-Aug-2016
2,005556E,Only love remains,Greta Blooming,17-Feb-2015


In [8]:
# Re-read the same file, this time promoting the first CSV column to the row
# index instead of letting pandas generate a 0..n-1 index.
books_csv = 'Data/books.csv'
df = pd.read_csv(books_csv, index_col=0)
df

Unnamed: 0_level_0,Title,Author,PublicationDate
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
001276A,The Rise of the Falcon,John Admiral,25-Apr-2018
023125B,Controlled mind,Robert Greens,28-Aug-2016
005556E,Only love remains,Greta Blooming,17-Feb-2015


In [9]:
# Rename a single column through an old-name -> new-name mapping; columns not
# listed in the mapping are left as they are.
# The renamed frame is reassigned rather than mutated with ``inplace=True`` so
# the statement stays chainable and the transformation is explicit.
df = df.rename(columns={'PublicationDate': 'Publication'})
df

Unnamed: 0_level_0,Title,Author,Publication
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
001276A,The Rise of the Falcon,John Admiral,25-Apr-2018
023125B,Controlled mind,Robert Greens,28-Aug-2016
005556E,Only love remains,Greta Blooming,17-Feb-2015


In [10]:
# ``rename`` also accepts callables: each index label is passed through
# ``str.lower`` and each column label through ``str.upper``.
# Reassign the result instead of using ``inplace=True`` so the statement stays
# chainable and the transformation is explicit.
df = df.rename(index=str.lower, columns=str.upper)
df

Unnamed: 0_level_0,TITLE,AUTHOR,PUBLICATION
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
001276a,The Rise of the Falcon,John Admiral,25-Apr-2018
023125b,Controlled mind,Robert Greens,28-Aug-2016
005556e,Only love remains,Greta Blooming,17-Feb-2015


In [11]:
def limiter(x):
    """Return the first five characters of the label ``x``."""
    return x[:5]


# Apply the truncation to every index label and install the result as the new
# index of the frame.
df.index = df.index.map(limiter)
df

Unnamed: 0_level_0,TITLE,AUTHOR,PUBLICATION
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00127,The Rise of the Falcon,John Admiral,25-Apr-2018
02312,Controlled mind,Robert Greens,28-Aug-2016
00555,Only love remains,Greta Blooming,17-Feb-2015


# Sparse DataFrames

In [12]:
# Build a mostly-empty 100x100 frame to demonstrate sparse storage.
# A seeded Generator makes the sample -- and therefore the density and memory
# figures computed from it in the following cells -- identical on every
# Restart & Run All.
rng = np.random.default_rng(42)
arr = rng.standard_normal(10000)
# Blank out every draw below 1 so that only the upper tail of the normal
# sample keeps a value; everything else becomes NaN.
arr[arr < 1] = np.nan
df = pd.DataFrame(arr.reshape(100, 100))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,,,1.093739,1.503808,,,1.161024,,1.202064,...,,,,,,,1.157932,,,
1,,1.288538,,,,,,,,1.067415,...,,2.319867,,,,,,,1.106264,1.375355
2,1.512870,,1.025546,,,1.416279,,,,,...,,,,,,,,,,
3,,,,,,,,,,1.061472,...,,,,,,1.496095,,1.335997,,
4,1.365769,1.486298,1.308738,,,1.115126,,1.057677,,,...,,1.726923,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,,,1.290535,,1.258029,...,1.331731,1.030543,,,1.335026,,1.419259,,1.185087,
96,1.590125,1.043417,1.150501,,,,,,,,...,1.003366,,,,1.568530,,,,,
97,1.446547,,,,,,,,,,...,,,,,,,,,,3.217726
98,1.017807,,,,1.472115,,1.401377,,,,...,1.099525,,,,,,,,,


In [13]:
# Total footprint of the dense frame (every column plus the index).
# memory_usage() reports bytes, so dividing by 1e3 yields kilobytes -- the
# figure is labelled kB to match the scaling.
'{:0.2f} kB'.format(df.memory_usage().sum() / 1e3)

'80.13 kB'

In [15]:
# Convert every column to a sparse float dtype whose fill value is NaN, so the
# NaN cells no longer need to be stored individually.
nan_sparse_float = pd.SparseDtype("float", fill_value=np.nan)
sdf = df.astype(nan_sparse_float)

In [16]:
# Share of cells that hold an explicitly stored value rather than the NaN fill
# value chosen when `sdf` was created.
sdf.sparse.density

0.1649

In [17]:
# Footprint of the sparse frame, for comparison with the dense figure above.
# memory_usage() reports bytes, so dividing by 1e3 yields kilobytes -- the
# figure is labelled kB to match the scaling.
'{:0.2f} kB'.format(sdf.memory_usage().sum() / 1e3)

'19.92 kB'