# Data manipulation notes

# Indexing and slicing

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df=sns.load_dataset('tips')

In [3]:
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [4]:
df.index

RangeIndex(start=0, stop=244, step=1)

make column as index

In [5]:
df_ind=df.set_index('sex')

removing an index

In [6]:
df_ind.reset_index()

Unnamed: 0,sex,total_bill,tip,smoker,day,time,size
0,Female,16.99,1.01,No,Sun,Dinner,2
1,Male,10.34,1.66,No,Sun,Dinner,3
2,Male,21.01,3.50,No,Sun,Dinner,3
3,Male,23.68,3.31,No,Sun,Dinner,2
4,Female,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,Male,29.03,5.92,No,Sat,Dinner,3
240,Female,27.18,2.00,Yes,Sat,Dinner,2
241,Male,22.67,2.00,Yes,Sat,Dinner,2
242,Male,17.82,1.75,No,Sat,Dinner,2


droping an index

In [7]:
df_ind.reset_index(drop=True)

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.50,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3
240,27.18,2.00,Yes,Sat,Dinner,2
241,22.67,2.00,Yes,Sat,Dinner,2
242,17.82,1.75,No,Sat,Dinner,2


# indexing makes subsetting simpler

In [8]:
df[df['sex'].isin(['Male'])]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.00,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
236,12.60,1.00,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
239,29.03,5.92,Male,No,Sat,Dinner,3
241,22.67,2.00,Male,Yes,Sat,Dinner,2


# index dont need to be unique

In [9]:
df_ind=df.set_index('sex')

# subsetting duplicated indexed value

In [10]:
df_ind.loc['Female']

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,16.99,1.01,No,Sun,Dinner,2
Female,24.59,3.61,No,Sun,Dinner,4
Female,35.26,5.00,No,Sun,Dinner,4
Female,14.83,3.02,No,Sun,Dinner,2
Female,10.33,1.67,No,Sun,Dinner,3
...,...,...,...,...,...,...
Female,10.09,2.00,Yes,Fri,Lunch,2
Female,22.12,2.88,Yes,Sat,Dinner,2
Female,35.83,4.67,No,Sat,Dinner,3
Female,27.18,2.00,Yes,Sat,Dinner,2


# Multi level index

In [11]:
df_ind2=df.set_index(['sex','smoker'])

# subsetting inner levels with a list of tuples

In [12]:
df_ind2.loc[[('Male','Yes'),('Female','Yes')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,day,time,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Male,Yes,38.01,3.00,Sat,Dinner,4
Male,Yes,11.24,1.76,Sat,Dinner,2
Male,Yes,20.29,3.21,Sat,Dinner,2
Male,Yes,13.81,2.00,Sat,Dinner,2
Male,Yes,11.02,1.98,Sat,Dinner,2
...,...,...,...,...,...,...
Female,Yes,13.42,3.48,Fri,Lunch,2
Female,Yes,16.27,2.50,Fri,Lunch,2
Female,Yes,10.09,2.00,Fri,Lunch,2
Female,Yes,22.12,2.88,Sat,Dinner,2


# sorting by index value

In [13]:
df_ind2.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,day,time,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Male,Yes,38.01,3.00,Sat,Dinner,4
Male,Yes,11.24,1.76,Sat,Dinner,2
Male,Yes,20.29,3.21,Sat,Dinner,2
Male,Yes,13.81,2.00,Sat,Dinner,2
Male,Yes,11.02,1.98,Sat,Dinner,2
...,...,...,...,...,...,...
Female,No,13.39,2.61,Sun,Dinner,2
Female,No,16.21,2.00,Sun,Dinner,3
Female,No,15.98,3.00,Fri,Lunch,3
Female,No,35.83,4.67,Sat,Dinner,3


# controlling sort index

In [14]:
df_ind2.sort_index(level=['smoker','sex'],ascending=[True,False])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,day,time,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,Yes,3.07,1.00,Sat,Dinner,1
Female,Yes,26.86,3.14,Sat,Dinner,2
Female,Yes,25.28,5.00,Sat,Dinner,2
Female,Yes,5.75,1.00,Fri,Dinner,2
Female,Yes,16.32,4.30,Fri,Dinner,2
...,...,...,...,...,...,...
Male,No,11.61,3.39,Sat,Dinner,2
Male,No,10.77,1.47,Sat,Dinner,2
Male,No,10.07,1.25,Sat,Dinner,2
Male,No,29.03,5.92,Sat,Dinner,3


# slicing list

In [15]:
names=['deeeba','saad','ali','mariam','azka','bilal']

In [16]:
names[2:5]

['ali', 'mariam', 'azka']

In [17]:
names[:]

['deeeba', 'saad', 'ali', 'mariam', 'azka', 'bilal']


# sort the index before u slice

In [18]:
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [19]:
df1=df.set_index(['sex','smoker']).sort_index()

In [20]:
df1.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,day,time,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Male,Yes,38.01,3.0,Sat,Dinner,4
Male,Yes,11.24,1.76,Sat,Dinner,2
Male,Yes,20.29,3.21,Sat,Dinner,2


In [21]:
df.loc['Yes':'No']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size


In [22]:
#column slicing

In [23]:
df.loc[1:5,'sex':'day']

Unnamed: 0,sex,smoker,day
1,Male,No,Sun
2,Male,No,Sun
3,Male,No,Sun
4,Female,No,Sun
5,Male,No,Sun


# slice twice

In [30]:
df_ind2.loc[('Female','Yes'),'total_bill':'day']

  return self._getitem_tuple(key)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,day
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,Yes,3.07,1.0,Sat
Female,Yes,26.86,3.14,Sat
Female,Yes,25.28,5.0,Sat
Female,Yes,5.75,1.0,Fri
Female,Yes,16.32,4.3,Fri
Female,Yes,11.35,2.5,Fri
Female,Yes,15.38,3.0,Fri
Female,Yes,44.3,2.5,Sat
Female,Yes,22.42,3.48,Sat
Female,Yes,14.31,4.0,Sat


# subsetting by rows/columns numbers

In [40]:
df.iloc[1:3,2:4]

Unnamed: 0,sex,smoker
1,Male,No
2,Male,No


In [44]:
df.mean(axis=0)

total_bill    19.785943
tip            2.998279
size           2.569672
dtype: float64