In [1]:
import numpy as np

In [2]:
import pandas as pd

### Object Creation

#### A DataFrame
A dataframe represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type. A dataframe has both a row and column index and can be thought of as a dict of Series, all sharing the same index.
A column in a dataframe can be retrieved as a Series.
The columns of the resulting DataFrame have different dtypes.

In [3]:
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_df = pd.read_csv(csv_url, header = None)

In [115]:
# The columns of the resulting DataFrame have different dtypes.
# NOTE(review): the output below lists named columns, but at this point the frame has been
# loaded with header=None; the names are only assigned by the later read_csv(names=...) cell,
# so this output presumably comes from an out-of-order run — confirm by re-running top to bottom.
iris_df.dtypes

Sepal_Length    float64
Sepal_Width     float64
Petal_length    float64
Petal_Width     float64
Species          object
dtype: object

### Viewing Data
You can view the top and bottom rows of the DataFrame, as well as its index and column names.

In [4]:

iris_df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Column labels for the headerless iris.data file.
# 'Petal_length' is deliberately spelled with a lower-case 'l': the other cells in this
# notebook (and their recorded outputs) address the column by exactly that label, so any
# other spelling makes them fail with a KeyError/AttributeError on a fresh run.
col_names = ['Sepal_Length', 'Sepal_Width', 'Petal_length', 'Petal_Width', 'Species']

# read in the dataset from the UCI Machine Learning Repository link and specify column names to use
# save as iris_df
iris_df = pd.read_csv(csv_url, names=col_names)

In [6]:
iris_df.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [117]:
# View the index of the DataFrame
iris_df.index

RangeIndex(start=0, stop=150, step=1)

In [118]:
# View the columns of the DataFrame
iris_df.columns

Index(['Sepal_Length', 'Sepal_Width', 'Petal_length', 'Petal_Width',
       'Species'],
      dtype='object')

In [120]:
# sorting by an axis
iris_df.sort_index(axis=1, ascending=False).head(10)

Unnamed: 0,Species,Sepal_Width,Sepal_Length,Petal_length,Petal_Width
0,Iris-setosa,3.5,5.1,1.4,0.2
1,Iris-setosa,3.0,4.9,1.4,0.2
2,Iris-setosa,3.2,4.7,1.3,0.2
3,Iris-setosa,3.1,4.6,1.5,0.2
4,Iris-setosa,3.6,5.0,1.4,0.2
5,Iris-setosa,3.9,5.4,1.7,0.4
6,Iris-setosa,3.4,4.6,1.4,0.3
7,Iris-setosa,3.4,5.0,1.5,0.2
8,Iris-setosa,2.9,4.4,1.4,0.2
9,Iris-setosa,3.1,4.9,1.5,0.1


In [122]:
# sorting by values
iris_df.sort_values(by='Petal_Width').head(10)

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
32,5.2,4.1,1.5,0.1,Iris-setosa
13,4.3,3.0,1.1,0.1,Iris-setosa
37,4.9,3.1,1.5,0.1,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa
12,4.8,3.0,1.4,0.1,Iris-setosa
34,4.9,3.1,1.5,0.1,Iris-setosa
0,5.1,3.5,1.4,0.2,Iris-setosa
27,5.2,3.5,1.5,0.2,Iris-setosa
28,5.2,3.4,1.4,0.2,Iris-setosa
29,4.7,3.2,1.6,0.2,Iris-setosa


In [7]:
iris_df.describe()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### Selection

#### Getting
You can select a single column, which yields a Series.
Selecting via [] slices the rows.

It is recommended to use the optimized pandas data access methods, .at, .iat, .loc and .iloc.

In [None]:
# Selecting a single column, which yields a Series, equivalent to df.Sepal_Length

# can use labels in the index to select values or a set of values
iris_df['Sepal_Length'].head()


In [123]:
# Selecting via [], which slices the rows.
# Looking here at the first 5 rows
iris_df[0:5]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


#### Selection by Label

In [126]:
# Selecting on a multi-axis by label
iris_df.loc[0:10, ['Sepal_Length', 'Petal_length']]

Unnamed: 0,Sepal_Length,Petal_length
0,5.1,1.4
1,4.9,1.4
2,4.7,1.3
3,4.6,1.5
4,5.0,1.4
5,5.4,1.7
6,4.6,1.4
7,5.0,1.5
8,4.4,1.4
9,4.9,1.5


In [128]:
# reduction in the dimensions of the returned object
iris_df.loc[0, ['Sepal_Length', 'Petal_length']]

Sepal_Length    5.1
Petal_length    1.4
Name: 0, dtype: object

In [130]:
# a scalar value
iris_df.loc[0, 'Petal_length']


1.4

#### Selection by position

In [138]:
# Selection by position
iris_df.iloc[0:3, 0:4]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [140]:
# getting a scalar value by position using iat
iris_df.iat[0,0]

5.1

#### Boolean Indexing
A single column's values can be used to select data.

In [9]:
# can filter with boolean expressions
iris_df[iris_df['Sepal_Length'] > 7]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
102,7.1,3.0,5.9,2.1,Iris-virginica
105,7.6,3.0,6.6,2.1,Iris-virginica
107,7.3,2.9,6.3,1.8,Iris-virginica
109,7.2,3.6,6.1,2.5,Iris-virginica
117,7.7,3.8,6.7,2.2,Iris-virginica
118,7.7,2.6,6.9,2.3,Iris-virginica
122,7.7,2.8,6.7,2.0,Iris-virginica
125,7.2,3.2,6.0,1.8,Iris-virginica
129,7.2,3.0,5.8,1.6,Iris-virginica
130,7.4,2.8,6.1,1.9,Iris-virginica


In [145]:
# Selecting values from a DataFrame where a boolean condition is met
iris_df[iris_df > 2].head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,5.1,3.5,,,Iris-setosa
1,4.9,3.0,,,Iris-setosa
2,4.7,3.2,,,Iris-setosa
3,4.6,3.1,,,Iris-setosa
4,5.0,3.6,,,Iris-setosa


In [10]:
iris_df[iris_df['Sepal_Length'] > 7]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
102,7.1,3.0,5.9,2.1,Iris-virginica
105,7.6,3.0,6.6,2.1,Iris-virginica
107,7.3,2.9,6.3,1.8,Iris-virginica
109,7.2,3.6,6.1,2.5,Iris-virginica
117,7.7,3.8,6.7,2.2,Iris-virginica
118,7.7,2.6,6.9,2.3,Iris-virginica
122,7.7,2.8,6.7,2.0,Iris-virginica
125,7.2,3.2,6.0,1.8,Iris-virginica
129,7.2,3.0,5.8,1.6,Iris-virginica
130,7.4,2.8,6.1,1.9,Iris-virginica


In [147]:
# using the isin method for filtering. 
iris_df[iris_df['Species'].isin(['Iris-virginica'])].head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
100,6.3,3.3,6.0,2.5,Iris-virginica
101,5.8,2.7,5.1,1.9,Iris-virginica
102,7.1,3.0,5.9,2.1,Iris-virginica
103,6.3,2.9,5.6,1.8,Iris-virginica
104,6.5,3.0,5.8,2.2,Iris-virginica


### Setting
Setting a new column automatically aligns the data by the indexes.


## Missing data
pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations. 


In [11]:
# check for missing data
# can use the isnull() function to check for missing values
# gets the boolean mask where values are nan
# NOTE: only the last expression in a cell is displayed, so the isnull() result on the
# next line is computed and discarded; the table shown below is the notnull() mask.
pd.isnull(iris_df).head()
pd.notnull(iris_df).head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,True,True,True,True,True


In [12]:
# DataFrames (like Series) also have these as instance methods

iris_df.isnull().head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False


In [13]:
# also can use the notnull() function
iris_df.notnull().head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,True,True,True,True,True


In [14]:
# retrieve a column of the dataframe using a dict-like notation
iris_df['Petal_length'].head()

0    1.4
1    1.4
2    1.3
3    1.5
4    1.4
Name: Petal_length, dtype: float64

In [15]:
# retrieve a column of data by attribute
iris_df.Sepal_Length.head()

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: Sepal_Length, dtype: float64

Rows can be retrieved by label with the `loc` attribute (use `iloc` to retrieve them by integer position).

In [16]:
# as a series using the loc attribute
iris_df.loc[0]

Sepal_Length            5.1
Sepal_Width             3.5
Petal_length            1.4
Petal_Width             0.2
Species         Iris-setosa
Name: 0, dtype: object

In [17]:
# as a DataFrame using the loc attribute
iris_df.loc[[0]]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa


Columns can be modified by assignment; assigning to a column that does not exist yet adds it as a new column.
When assigning a list or array to a column, the value's length must match the length of the DataFrame.
When assigning a Series, its labels are realigned exactly to the DataFrame's index, with missing values inserted where needed.
Columns can be deleted using the `del` keyword.


In [18]:
iris_df['obs']= np.arange(1,151)
iris_df.head()



Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species,obs
0,5.1,3.5,1.4,0.2,Iris-setosa,1
1,4.9,3.0,1.4,0.2,Iris-setosa,2
2,4.7,3.2,1.3,0.2,Iris-setosa,3
3,4.6,3.1,1.5,0.2,Iris-setosa,4
4,5.0,3.6,1.4,0.2,Iris-setosa,5


In [19]:
iris_df.tail()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species,obs
145,6.7,3.0,5.2,2.3,Iris-virginica,146
146,6.3,2.5,5.0,1.9,Iris-virginica,147
147,6.5,3.0,5.2,2.0,Iris-virginica,148
148,6.2,3.4,5.4,2.3,Iris-virginica,149
149,5.9,3.0,5.1,1.8,Iris-virginica,150


In [20]:
iris_df['new'] = iris_df.Species == 'Iris-setosa' 

In [21]:
iris_df.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species,obs,new
0,5.1,3.5,1.4,0.2,Iris-setosa,1,True
1,4.9,3.0,1.4,0.2,Iris-setosa,2,True
2,4.7,3.2,1.3,0.2,Iris-setosa,3,True
3,4.6,3.1,1.5,0.2,Iris-setosa,4,True
4,5.0,3.6,1.4,0.2,Iris-setosa,5,True


In [22]:
del iris_df['new']
del iris_df['obs']
iris_df.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [23]:
# could transpose the DataFrame

iris_df.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
Sepal_Length,5.1,4.9,4.7,4.6,5,5.4,4.6,5,4.4,4.9,...,6.7,6.9,5.8,6.8,6.7,6.7,6.3,6.5,6.2,5.9
Sepal_Width,3.5,3,3.2,3.1,3.6,3.9,3.4,3.4,2.9,3.1,...,3.1,3.1,2.7,3.2,3.3,3,2.5,3,3.4,3
Petal_length,1.4,1.4,1.3,1.5,1.4,1.7,1.4,1.5,1.4,1.5,...,5.6,5.1,5.1,5.9,5.7,5.2,5,5.2,5.4,5.1
Petal_Width,0.2,0.2,0.2,0.2,0.2,0.4,0.3,0.2,0.2,0.1,...,2.4,2.3,1.9,2.3,2.5,2.3,1.9,2,2.3,1.8
Species,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,...,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica


In [150]:
iris_df[iris_df.Sepal_Width > 3.5].head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
10,5.4,3.7,1.5,0.2,Iris-setosa
14,5.8,4.0,1.2,0.2,Iris-setosa
15,5.7,4.4,1.5,0.4,Iris-setosa


In [151]:
# sorting by values
iris_df.sort_values(by='Petal_Width').head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
32,5.2,4.1,1.5,0.1,Iris-setosa
13,4.3,3.0,1.1,0.1,Iris-setosa
37,4.9,3.1,1.5,0.1,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa
12,4.8,3.0,1.4,0.1,Iris-setosa


In [25]:
iris_df.describe()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [72]:
iris_df[iris_df.Sepal_Length > 6.4].head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
50,7.0,3.2,4.7,1.4,Iris-versicolor
52,6.9,3.1,4.9,1.5,Iris-versicolor
54,6.5,2.8,4.6,1.5,Iris-versicolor
58,6.6,2.9,4.6,1.3,Iris-versicolor
65,6.7,3.1,4.4,1.4,Iris-versicolor


In [27]:
labels = pd.Index(np.arange(1,151))
labels

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            141, 142, 143, 144, 145, 146, 147, 148, 149, 150],
           dtype='int64', length=150)

In [28]:
# reindex conforms the frame to the new labels 1..150; it does not shift or renumber rows.
# Existing labels keep their data, label 0 is dropped, and label 150 (absent from the
# original 0..149 index) is added as a row of missing values.
iris1 = iris_df.reindex(np.arange(1,151))

In [29]:
iris1.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa


In [30]:
iris_df.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


`reindex` can alter either the row index, the column index or both.

### Indexing, Selection and Filtering


In [31]:
iris_df['Sepal_Length'].head()

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: Sepal_Length, dtype: float64

In [32]:
iris_df[0:5]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [33]:
iris_df[0:3]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa


In [34]:
iris_df[iris_df['Sepal_Width'] > 4]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
15,5.7,4.4,1.5,0.4,Iris-setosa
32,5.2,4.1,1.5,0.1,Iris-setosa
33,5.5,4.2,1.4,0.2,Iris-setosa


In [35]:
iris_df[iris_df > 8].head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
0,,,,,Iris-setosa
1,,,,,Iris-setosa
2,,,,,Iris-setosa
3,,,,,Iris-setosa
4,,,,,Iris-setosa


In [36]:
# selecting  with loc and iloc
iris_df.loc[0]

Sepal_Length            5.1
Sepal_Width             3.5
Petal_length            1.4
Petal_Width             0.2
Species         Iris-setosa
Name: 0, dtype: object

In [37]:
# first row, only Petal_Length column
iris_df.loc[0,['Petal_length']]

Petal_length    1.4
Name: 0, dtype: object

In [38]:
# using iloc to get first row and first 2 columns
iris_df.iloc[0,0:2]

Sepal_Length    5.1
Sepal_Width     3.5
Name: 0, dtype: object

In [39]:
# row 10
iris_df.iloc[10]

Sepal_Length            5.4
Sepal_Width             3.7
Petal_length            1.5
Petal_Width             0.2
Species         Iris-setosa
Name: 10, dtype: object

In [40]:
iris_df.iloc[:5, :3]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4


In [41]:
# all rows, first 3 columns, filtering for Petal_length > 6
iris_df.iloc[:, :3][iris_df.Petal_length > 6]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length
105,7.6,3.0,6.6
107,7.3,2.9,6.3
109,7.2,3.6,6.1
117,7.7,3.8,6.7
118,7.7,2.6,6.9
122,7.7,2.8,6.7
130,7.4,2.8,6.1
131,7.9,3.8,6.4
135,7.7,3.0,6.1


###  Summary Statistics

In [45]:
iris_summary = iris_df.describe()

In [49]:
iris_summary

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [60]:
# summary stats for Sepal-Length variable
iris_summary.Sepal_Length

count    150.000000
mean       5.843333
std        0.828066
min        4.300000
25%        5.100000
50%        5.800000
75%        6.400000
max        7.900000
Name: Sepal_Length, dtype: float64

In [68]:
# Seeing if I am looking at observations in the data that have measurements greater than the mean
iris_df[iris_df.Sepal_Length > 5.84333].count()

Sepal_Length    70
Sepal_Width     70
Petal_length    70
Petal_Width     70
Species         70
dtype: int64

In [54]:
# See if there is a way to pull the mean for a specific variable such as Sepal_Length or Sepal_Width from the summary statistics 
# I could just calculate the mean directly and use this

In [85]:
# Mean of every measurement column. numeric_only=True skips the string Species column;
# newer pandas releases raise on it instead of silently dropping it.
iris_df.mean(numeric_only=True)

Sepal_Length    5.843333
Sepal_Width     3.054000
Petal_length    3.758667
Petal_Width     1.198667
dtype: float64

In [86]:
iris_df.Sepal_Length.mean()

5.843333333333335

In [87]:
# how many observations have Sepal Length greater than the average Sepal Length?
iris_df[iris_df.Sepal_Length > iris_df.Sepal_Length.mean()].count()

Sepal_Length    70
Sepal_Width     70
Petal_length    70
Petal_Width     70
Species         70
dtype: int64

In [89]:
# how many observations have Petal Length greater than the average Petal Length?
iris_df[iris_df.Petal_length > iris_df.Petal_length.mean()].count()

Sepal_Length    93
Sepal_Width     93
Petal_length    93
Petal_Width     93
Species         93
dtype: int64

### Correlation and Covariance

Correlation and covariance statistics are computed from pairs of arguments.

The DataFrame has a `corr` method and a `cov` method that return the full correlation or covariance matrix, respectively, as a DataFrame.

In [90]:
# Pairwise correlation matrix of the measurement columns. The string Species column is
# excluded explicitly: newer pandas releases raise on non-numeric columns instead of
# silently dropping them, and select_dtypes behaves the same on old and new versions.
iris_df.select_dtypes(include=[np.number]).corr()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width
Sepal_Length,1.0,-0.109369,0.871754,0.817954
Sepal_Width,-0.109369,1.0,-0.420516,-0.356544
Petal_length,0.871754,-0.420516,1.0,0.962757
Petal_Width,0.817954,-0.356544,0.962757,1.0


In [91]:
# Pairwise covariance matrix of the measurement columns. The string Species column is
# excluded explicitly: newer pandas releases raise on non-numeric columns instead of
# silently dropping them, and select_dtypes behaves the same on old and new versions.
iris_df.select_dtypes(include=[np.number]).cov()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width
Sepal_Length,0.685694,-0.039268,1.273682,0.516904
Sepal_Width,-0.039268,0.188004,-0.321713,-0.117981
Petal_length,1.273682,-0.321713,3.113179,1.296387
Petal_Width,0.516904,-0.117981,1.296387,0.582414


### Data Transformation

In [97]:
# checking for duplicated rows.
# The DataFrame method `duplicated` returns a boolean Series indicating whether each row is a duplicate or not.
# Here I am using it to see if the exact same measurements have been observed for different observations.
# By default only the second and later occurrences are flagged, so the first occurrence of each
# repeated row is not listed: rows 34 and 37 repeat row 9 (Iris-setosa), and row 142 repeats
# row 101 (Iris-virginica). Passing keep=False would flag every copy, including the first.

iris_df[iris_df.duplicated()]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
34,4.9,3.1,1.5,0.1,Iris-setosa
37,4.9,3.1,1.5,0.1,Iris-setosa
142,5.8,2.7,5.1,1.9,Iris-virginica


In [99]:
iris_df[iris_df.duplicated()]

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_length,Petal_Width,Species
34,4.9,3.1,1.5,0.1,Iris-setosa
37,4.9,3.1,1.5,0.1,Iris-setosa
142,5.8,2.7,5.1,1.9,Iris-virginica


### Detecting and Filtering outliers

In [106]:
# For the rows whose Sepal_Length exceeds the column mean, report per row whether any
# value is truthy. The mean is computed rather than hard-coded, and the axis is passed
# by keyword because newer pandas releases no longer accept it positionally.
iris_df[iris_df.Sepal_Length > iris_df.Sepal_Length.mean()].any(axis=1)

50     True
51     True
52     True
54     True
56     True
58     True
61     True
62     True
63     True
65     True
68     True
70     True
71     True
72     True
73     True
74     True
75     True
76     True
77     True
78     True
83     True
85     True
86     True
87     True
91     True
97     True
100    True
102    True
103    True
104    True
       ... 
118    True
119    True
120    True
122    True
123    True
124    True
125    True
126    True
127    True
128    True
129    True
130    True
131    True
132    True
133    True
134    True
135    True
136    True
137    True
138    True
139    True
140    True
141    True
143    True
144    True
145    True
146    True
147    True
148    True
149    True
Length: 70, dtype: bool