In [3]:
# How to import pandas

import pandas as pd

# Pandas is a Python package providing fast, flexible, and expressive data structures designed 
# to make working with “relational” or “labelled” data both easy and intuitive. 
# Also, it aims to be the fundamental high-level building block for doing practical, 
# real-world data analysis in Python. 

In [9]:
# Load the Iris data set straight from the UCI repository.
# NOTE: read_csv treats the first line of the file as the header by default, so the
# first record ends up as the column labels here (visible in the printed frame).
# Passing header=None would keep that line as a data row instead.
IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
fileObj = pd.read_csv(IRIS_URL)

print(type(fileObj))
print(fileObj)

# Work on an independent copy so that later modifications never touch fileObj.
df = fileObj.copy()
print(df)

<class 'pandas.core.frame.DataFrame'>
     5.1  3.5  1.4  0.2     Iris-setosa
0    4.9  3.0  1.4  0.2     Iris-setosa
1    4.7  3.2  1.3  0.2     Iris-setosa
2    4.6  3.1  1.5  0.2     Iris-setosa
3    5.0  3.6  1.4  0.2     Iris-setosa
4    5.4  3.9  1.7  0.4     Iris-setosa
..   ...  ...  ...  ...             ...
144  6.7  3.0  5.2  2.3  Iris-virginica
145  6.3  2.5  5.0  1.9  Iris-virginica
146  6.5  3.0  5.2  2.0  Iris-virginica
147  6.2  3.4  5.4  2.3  Iris-virginica
148  5.9  3.0  5.1  1.8  Iris-virginica

[149 rows x 5 columns]
     5.1  3.5  1.4  0.2     Iris-setosa
0    4.9  3.0  1.4  0.2     Iris-setosa
1    4.7  3.2  1.3  0.2     Iris-setosa
2    4.6  3.1  1.5  0.2     Iris-setosa
3    5.0  3.6  1.4  0.2     Iris-setosa
4    5.4  3.9  1.7  0.4     Iris-setosa
..   ...  ...  ...  ...             ...
144  6.7  3.0  5.2  2.3  Iris-virginica
145  6.3  2.5  5.0  1.9  Iris-virginica
146  6.5  3.0  5.2  2.0  Iris-virginica
147  6.2  3.4  5.4  2.3  Iris-virginica
148  5.9  3.0  5.1

In [31]:
# DataFrame.head(n) returns the first n rows; n defaults to 5.
print(df.head(n=5))

# Ask for a specific number of rows, e.g. the first 10.
print(df.head(n=10))

   5.1  3.5  1.4  0.2  Iris-setosa
0  4.9  3.0  1.4  0.2  Iris-setosa
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
3  5.0  3.6  1.4  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
   5.1  3.5  1.4  0.2  Iris-setosa
0  4.9  3.0  1.4  0.2  Iris-setosa
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
3  5.0  3.6  1.4  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
5  4.6  3.4  1.4  0.3  Iris-setosa
6  5.0  3.4  1.5  0.2  Iris-setosa
7  4.4  2.9  1.4  0.2  Iris-setosa
8  4.9  3.1  1.5  0.1  Iris-setosa
9  5.4  3.7  1.5  0.2  Iris-setosa


In [14]:
# Inspect the current column labels.
print(df.columns)

# Replace all labels at once; the list needs one entry per column.
# (sl/sw/pl/pw presumably stand for sepal/petal length/width - confirm against
# the data set description.)
new_labels = ['sl', 'sw', 'pl', 'pw', 'flower_type']
df.columns = new_labels

Index(['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'], dtype='object')


In [34]:
# Size and per-column types of the frame.
frame_shape = df.shape      # (rows, columns) tuple
print(frame_shape)
column_types = df.dtypes    # one dtype per column
print(column_types)

(149, 5)
sl             float64
sw             float64
pl             float64
pw             float64
flower_type     object
dtype: object


In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary = df.describe()
print(summary)

               sl          sw          pl          pw
count  149.000000  149.000000  149.000000  149.000000
mean     5.848322    3.051007    3.774497    1.205369
std      0.828594    0.433499    1.759651    0.761292
min      4.300000    2.000000    1.000000    0.100000
25%      5.100000    2.800000    1.600000    0.300000
50%      5.800000    3.000000    4.400000    1.300000
75%      6.400000    3.300000    5.100000    1.800000
max      7.900000    4.400000    6.900000    2.500000


In [22]:
# A single column can be selected either as an attribute (df.column_name) or
# with brackets (df["column_name"]); both forms give back a pandas Series.
sl_by_attribute = df.sl
sl_by_key = df['sl']
print(sl_by_attribute)
print(sl_by_key)
print(type(sl_by_attribute))

0      4.9
1      4.7
2      4.6
3      5.0
4      5.4
      ... 
144    6.7
145    6.3
146    6.5
147    6.2
148    5.9
Name: sl, Length: 149, dtype: float64
0      4.9
1      4.7
2      4.6
3      5.0
4      5.4
      ... 
144    6.7
145    6.3
146    6.5
147    6.2
148    5.9
Name: sl, Length: 149, dtype: float64
<class 'pandas.core.series.Series'>


In [24]:
# Missing-value check: isnull() flags every cell that holds a null value.
null_mask = df.isnull()
print(null_mask)
# Summing the boolean mask column-wise counts the nulls in each column.
print(null_mask.sum())

        sl     sw     pl     pw  flower_type
0    False  False  False  False        False
1    False  False  False  False        False
2    False  False  False  False        False
3    False  False  False  False        False
4    False  False  False  False        False
..     ...    ...    ...    ...          ...
144  False  False  False  False        False
145  False  False  False  False        False
146  False  False  False  False        False
147  False  False  False  False        False
148  False  False  False  False        False

[149 rows x 5 columns]
sl             0
sw             0
pl             0
pw             0
flower_type    0
dtype: int64


In [26]:
# Position-based selection with iloc: rows 1-3 and columns 2-3
# (the end position of each slice is exclusive).
row_span = slice(1, 4)
col_span = slice(2, 4)
print(df.iloc[row_span, col_span])

    pl   pw
1  1.3  0.2
2  1.5  0.2
3  1.4  0.2


In [27]:
# Quick look at the first five rows of the frame.
first_rows = df.head()
print(first_rows)

    sl   sw   pl   pw  flower_type
0  4.9  3.0  1.4  0.2  Iris-setosa
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
3  5.0  3.6  1.4  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa


In [10]:
# Remove a row by its index label.
# drop() takes labels (not positions) and returns a new frame; the original
# stays untouched unless inplace=True is given.
a = df.drop(index=0)
print(df.head())
print(a.head())

# Same removal, but applied directly to df itself.
df.drop(index=0, inplace=True)

   5.1  3.5  1.4  0.2  Iris-setosa
0  4.9  3.0  1.4  0.2  Iris-setosa
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
3  5.0  3.6  1.4  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
   5.1  3.5  1.4  0.2  Iris-setosa
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
3  5.0  3.6  1.4  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
5  4.6  3.4  1.4  0.3  Iris-setosa


In [11]:
# Show the row labels (the index) of the frame.
row_labels = df.index
print(row_labels)

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            139, 140, 141, 142, 143, 144, 145, 146, 147, 148],
           dtype='int64', length=148)


In [12]:
# Drop by position: translate the position into its label first, then drop that label.
label_at_position_2 = df.index[2]
df.drop(index=label_at_position_2, inplace=True)
print(df.head())

# Several positions can be removed at once by indexing the index with a list:
# df.drop(df.index[[idx1, idx2]], inplace=True)

   5.1  3.5  1.4  0.2  Iris-setosa
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
5  4.6  3.4  1.4  0.3  Iris-setosa
6  5.0  3.4  1.5  0.2  Iris-setosa


In [17]:
# Boolean filtering: the comparison gives a True/False Series aligned with the rows,
# and indexing the frame with it keeps only the rows where sl is greater than 5.
sl_above_5 = df.sl > 5
df[sl_above_5]

Unnamed: 0,sl,sw,pl,pw,flower_type
4,5.4,3.9,1.7,0.4,Iris-setosa
9,5.4,3.7,1.5,0.2,Iris-setosa
13,5.8,4.0,1.2,0.2,Iris-setosa
14,5.7,4.4,1.5,0.4,Iris-setosa
15,5.4,3.9,1.3,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [25]:
# Label-based access: df.loc[label] returns the row stored under that index label.
print(df.head())
row_with_label_2 = df.loc[2]
print(row_with_label_2)

# Assigning to a label that is not in the index appends a new row:
#   df.loc[label] = [one value per column]
new_row = [1, 2, 3, 4, 'Iris-setosa']
df.loc[0] = new_row
print(df.head())
print(df.loc[0])

    sl   sw   pl   pw  flower_type
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
5  4.6  3.4  1.4  0.3  Iris-setosa
6  5.0  3.4  1.5  0.2  Iris-setosa
sl                     4.6
sw                     3.1
pl                     1.5
pw                     0.2
flower_type    Iris-setosa
Name: 2, dtype: object
    sl   sw   pl   pw  flower_type
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
5  4.6  3.4  1.4  0.3  Iris-setosa
6  5.0  3.4  1.5  0.2  Iris-setosa
sl                       1
sw                       2
pl                       3
pw                       4
flower_type    Iris-setosa
Name: 0, dtype: object


In [27]:
# Show the last rows of the frame (tail defaults to 5 rows).
df.tail(n=5)

Unnamed: 0,sl,sw,pl,pw,flower_type
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica
148,5.9,3.0,5.1,1.8,Iris-virginica
0,1.0,2.0,3.0,4.0,Iris-setosa


In [29]:
# .values exposes the data as an array of rows, without the column labels or the index.
rows = df.values
print(rows)

[[4.7 3.2 1.3 0.2 'Iris-setosa']
 [4.6 3.1 1.5 0.2 'Iris-setosa']
 [5.4 3.9 1.7 0.4 'Iris-setosa']
 [4.6 3.4 1.4 0.3 'Iris-setosa']
 [5.0 3.4 1.5 0.2 'Iris-setosa']
 [4.4 2.9 1.4 0.2 'Iris-setosa']
 [4.9 3.1 1.5 0.1 'Iris-setosa']
 [5.4 3.7 1.5 0.2 'Iris-setosa']
 [4.8 3.4 1.6 0.2 'Iris-setosa']
 [4.8 3.0 1.4 0.1 'Iris-setosa']
 [4.3 3.0 1.1 0.1 'Iris-setosa']
 [5.8 4.0 1.2 0.2 'Iris-setosa']
 [5.7 4.4 1.5 0.4 'Iris-setosa']
 [5.4 3.9 1.3 0.4 'Iris-setosa']
 [5.1 3.5 1.4 0.3 'Iris-setosa']
 [5.7 3.8 1.7 0.3 'Iris-setosa']
 [5.1 3.8 1.5 0.3 'Iris-setosa']
 [5.4 3.4 1.7 0.2 'Iris-setosa']
 [5.1 3.7 1.5 0.4 'Iris-setosa']
 [4.6 3.6 1.0 0.2 'Iris-setosa']
 [5.1 3.3 1.7 0.5 'Iris-setosa']
 [4.8 3.4 1.9 0.2 'Iris-setosa']
 [5.0 3.0 1.6 0.2 'Iris-setosa']
 [5.0 3.4 1.6 0.4 'Iris-setosa']
 [5.2 3.5 1.5 0.2 'Iris-setosa']
 [5.2 3.4 1.4 0.2 'Iris-setosa']
 [4.7 3.2 1.6 0.2 'Iris-setosa']
 [4.8 3.1 1.6 0.2 'Iris-setosa']
 [5.4 3.4 1.5 0.4 'Iris-setosa']
 [5.2 4.1 1.5 0.1 'Iris-setosa']
 [5.5 4.2 

In [33]:
# How often does one particular value occur in a column?
# Here the column is flower_type and the value is Iris-setosa.
type_counts = df['flower_type'].value_counts()
type_counts['Iris-setosa']

48

In [44]:
# Number of occurrences of every distinct value in flower_type.
df.flower_type.value_counts()

Iris-versicolor    50
Iris-virginica     50
Iris-setosa        48
Name: flower_type, dtype: int64

In [45]:
# Renumber the row labels 0..n-1.
# drop=True discards the old labels instead of keeping them as an extra column.
print(df)
df.reset_index(inplace=True, drop=True)
print(df)

      sl   sw   pl   pw     flower_type
1    4.7  3.2  1.3  0.2     Iris-setosa
2    4.6  3.1  1.5  0.2     Iris-setosa
4    5.4  3.9  1.7  0.4     Iris-setosa
5    4.6  3.4  1.4  0.3     Iris-setosa
6    5.0  3.4  1.5  0.2     Iris-setosa
..   ...  ...  ...  ...             ...
145  6.3  2.5  5.0  1.9  Iris-virginica
146  6.5  3.0  5.2  2.0  Iris-virginica
147  6.2  3.4  5.4  2.3  Iris-virginica
148  5.9  3.0  5.1  1.8  Iris-virginica
0    1.0  2.0  3.0  4.0     Iris-setosa

[148 rows x 5 columns]
      sl   sw   pl   pw     flower_type
0    4.7  3.2  1.3  0.2     Iris-setosa
1    4.6  3.1  1.5  0.2     Iris-setosa
2    5.4  3.9  1.7  0.4     Iris-setosa
3    4.6  3.4  1.4  0.3     Iris-setosa
4    5.0  3.4  1.5  0.2     Iris-setosa
..   ...  ...  ...  ...             ...
143  6.3  2.5  5.0  1.9  Iris-virginica
144  6.5  3.0  5.2  2.0  Iris-virginica
145  6.2  3.4  5.4  2.3  Iris-virginica
146  5.9  3.0  5.1  1.8  Iris-virginica
147  1.0  2.0  3.0  4.0     Iris-setosa

[148 rows x 5 c

In [46]:
# Delete a column: drop() removes column labels when they are passed via columns=
# (equivalent to giving the label together with axis=1).
print(df)
df.drop(columns='sl', inplace=True)
print(df)

      sl   sw   pl   pw     flower_type
0    4.7  3.2  1.3  0.2     Iris-setosa
1    4.6  3.1  1.5  0.2     Iris-setosa
2    5.4  3.9  1.7  0.4     Iris-setosa
3    4.6  3.4  1.4  0.3     Iris-setosa
4    5.0  3.4  1.5  0.2     Iris-setosa
..   ...  ...  ...  ...             ...
143  6.3  2.5  5.0  1.9  Iris-virginica
144  6.5  3.0  5.2  2.0  Iris-virginica
145  6.2  3.4  5.4  2.3  Iris-virginica
146  5.9  3.0  5.1  1.8  Iris-virginica
147  1.0  2.0  3.0  4.0     Iris-setosa

[148 rows x 5 columns]
      sw   pl   pw     flower_type
0    3.2  1.3  0.2     Iris-setosa
1    3.1  1.5  0.2     Iris-setosa
2    3.9  1.7  0.4     Iris-setosa
3    3.4  1.4  0.3     Iris-setosa
4    3.4  1.5  0.2     Iris-setosa
..   ...  ...  ...             ...
143  2.5  5.0  1.9  Iris-virginica
144  3.0  5.2  2.0  Iris-virginica
145  3.4  5.4  2.3  Iris-virginica
146  3.0  5.1  1.8  Iris-virginica
147  2.0  3.0  4.0     Iris-setosa

[148 rows x 4 columns]


In [47]:
# A column can also be removed with Python's del statement, which modifies df directly.
stats_before = df.describe()
print(stats_before)
del df['sw']
stats_after = df.describe()
print(stats_after)

               sw          pl          pw
count  148.000000  148.000000  148.000000
mean     3.040541    3.801351    1.237838
std      0.441062    1.744729    0.788558
min      2.000000    1.000000    0.100000
25%      2.800000    1.600000    0.300000
50%      3.000000    4.400000    1.300000
75%      3.300000    5.100000    1.800000
max      4.400000    6.900000    4.000000
               pl          pw
count  148.000000  148.000000
mean     3.801351    1.237838
std      1.744729    0.788558
min      1.000000    0.100000
25%      1.600000    0.300000
50%      4.400000    1.300000
75%      5.100000    1.800000
max      6.900000    4.000000


In [58]:
# Start again from a fresh copy of the loaded data and re-apply the short column names.
df = fileObj.copy()
df.columns = ['sl', 'sw', 'pl', 'pw', 'flower_type']

In [59]:
# Adding columns: assigning to a label that does not exist yet creates the column.
print(df.head())
# Derived column: element-wise difference of two existing columns.
pl_minus_pw = df.pl - df.pw
df['new_coluumn'] = pl_minus_pw
print(df.head())
# A scalar value is repeated for every row.
df['new'] = 1
print(df.head())

    sl   sw   pl   pw  flower_type
0  4.9  3.0  1.4  0.2  Iris-setosa
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
3  5.0  3.6  1.4  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
    sl   sw   pl   pw  flower_type  new_coluumn
0  4.9  3.0  1.4  0.2  Iris-setosa          1.2
1  4.7  3.2  1.3  0.2  Iris-setosa          1.1
2  4.6  3.1  1.5  0.2  Iris-setosa          1.3
3  5.0  3.6  1.4  0.2  Iris-setosa          1.2
4  5.4  3.9  1.7  0.4  Iris-setosa          1.3
    sl   sw   pl   pw  flower_type  new_coluumn  new
0  4.9  3.0  1.4  0.2  Iris-setosa          1.2    1
1  4.7  3.2  1.3  0.2  Iris-setosa          1.1    1
2  4.6  3.1  1.5  0.2  Iris-setosa          1.3    1
3  5.0  3.6  1.4  0.2  Iris-setosa          1.2    1
4  5.4  3.9  1.7  0.4  Iris-setosa          1.3    1


In [60]:
# Handling missing values (NaN)
import numpy as np

# Blank out rows 1-3 of the column at position 2 so there is missing data to work with.
rows_to_blank = slice(1, 4)
df.iloc[rows_to_blank, 2:3] = np.nan
print(df.describe())

# dropna() removes every row that contains a NaN. Without inplace=True it returns
# a new frame; with it, df itself is modified.
df.dropna(inplace=True)
print(df.head())
# Renumber the index so that no gaps remain after the removal.
df.reset_index(inplace=True, drop=True)

               sl          sw          pl          pw  new_coluumn    new
count  149.000000  149.000000  146.000000  149.000000   149.000000  149.0
mean     5.848322    3.051007    3.823288    1.205369     2.569128    1.0
std      0.828594    0.433499    1.743917    0.761292     1.047707    0.0
min      4.300000    2.000000    1.000000    0.100000     0.800000    1.0
25%      5.100000    2.800000    1.600000    0.300000     1.400000    1.0
50%      5.800000    3.000000    4.400000    1.300000     2.900000    1.0
75%      6.400000    3.300000    5.100000    1.800000     3.300000    1.0
max      7.900000    4.400000    6.900000    2.500000     4.700000    1.0
    sl   sw   pl   pw  flower_type  new_coluumn  new
0  4.9  3.0  1.4  0.2  Iris-setosa          1.2    1
4  5.4  3.9  1.7  0.4  Iris-setosa          1.3    1
5  4.6  3.4  1.4  0.3  Iris-setosa          1.1    1
6  5.0  3.4  1.5  0.2  Iris-setosa          1.3    1
7  4.4  2.9  1.4  0.2  Iris-setosa          1.2    1


In [63]:
# Fill NaN with the column mean.
# Blank out rows 1-3 of the column at position 2 ('pl') so there is something to fill.
df.iloc[1:4, 2:3] = np.nan
print(df.head())
# Assign the filled column back instead of calling fillna(inplace=True) on df.pl:
# an in-place call on a column pulled out of the frame may act on a temporary
# object, in which case the fill never reaches df (recent pandas versions warn
# about this chained pattern, and with Copy-on-Write it has no effect at all).
df['pl'] = df['pl'].fillna(df['pl'].mean())
print(df.head())

    sl   sw   pl   pw  flower_type  new_coluumn  new
0  4.9  3.0  1.4  0.2  Iris-setosa          1.2    1
1  5.4  3.9  NaN  0.4  Iris-setosa          1.3    1
2  4.6  3.4  NaN  0.3  Iris-setosa          1.1    1
3  5.0  3.4  NaN  0.2  Iris-setosa          1.3    1
4  4.4  2.9  1.4  0.2  Iris-setosa          1.2    1
    sl   sw        pl   pw  flower_type  new_coluumn  new
0  4.9  3.0  1.400000  0.2  Iris-setosa          1.2    1
1  5.4  3.9  3.871329  0.4  Iris-setosa          1.3    1
2  4.6  3.4  3.871329  0.3  Iris-setosa          1.1    1
3  5.0  3.4  3.871329  0.2  Iris-setosa          1.3    1
4  4.4  2.9  1.400000  0.2  Iris-setosa          1.2    1


In [65]:
# Handle string-based data.
# Add a Gender column that is 'Female' everywhere, then mark the first 10 rows 'Male'.
df['Gender'] = 'Female'
# Resolve the column position by name so the assignment does not depend on how
# many columns happen to precede Gender.
gender_pos = df.columns.get_loc('Gender')
df.iloc[0:10, gender_pos] = 'Male'
print(df.head())
print(df.tail())

def f(str):
    """Encode a gender label as an integer: 'Male' -> 0, anything else -> 1."""
    # NOTE: the parameter name shadows the built-in str; it is kept so that the
    # function signature stays unchanged.
    return 0 if str == 'Male' else 1
    
# apply() calls f once for every entry of the Gender column; the results are
# stored as the new Sex column.
gender_codes = df['Gender'].apply(f)
df['Sex'] = gender_codes
print(df.head())
print(df.tail())


    sl   sw        pl   pw  flower_type  new_coluumn  new Gender
0  4.9  3.0  1.400000  0.2  Iris-setosa          1.2    1   Male
1  5.4  3.9  3.871329  0.4  Iris-setosa          1.3    1   Male
2  4.6  3.4  3.871329  0.3  Iris-setosa          1.1    1   Male
3  5.0  3.4  3.871329  0.2  Iris-setosa          1.3    1   Male
4  4.4  2.9  1.400000  0.2  Iris-setosa          1.2    1   Male
      sl   sw   pl   pw     flower_type  new_coluumn  new  Gender
141  6.7  3.0  5.2  2.3  Iris-virginica          2.9    1  Female
142  6.3  2.5  5.0  1.9  Iris-virginica          3.1    1  Female
143  6.5  3.0  5.2  2.0  Iris-virginica          3.2    1  Female
144  6.2  3.4  5.4  2.3  Iris-virginica          3.1    1  Female
145  5.9  3.0  5.1  1.8  Iris-virginica          3.3    1  Female
    sl   sw        pl   pw  flower_type  new_coluumn  new Gender  Sex
0  4.9  3.0  1.400000  0.2  Iris-setosa          1.2    1   Male    0
1  5.4  3.9  3.871329  0.4  Iris-setosa          1.3    1   Male    0
2  4