In [2]:
import pandas as pd

In [3]:
# Reading in the csv
# A dataframe is a 2-dimensional data structure (data represented in rows and columns)

df = pd.read_csv('./datasets/iris.csv')
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [4]:
# check the number of rows and columns
df.shape

(150, 5)

In [5]:
# check the datatypes of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  149 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       149 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [6]:
# note that the rows are truncated
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [7]:
# we can set the maximum number of rows we want to see
pd.set_option('display.max_rows', 20)

In [8]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [9]:
# creating a dataframe from a dictionary
# A dataframe can be thought of as a dictionary but with much more additional functionality
employees = {
    'name': ['Sally', 'Micheal', 'Jim'],
    'age': [24, 26, 32],
    'job': ['ATRM', 'Regional Manager', 'Salesman']
}

In [10]:
employees['age']

[24, 26, 32]

In [11]:
employees_df = pd.DataFrame(employees)
employees_df

Unnamed: 0,name,age,job
0,Sally,24,ATRM
1,Micheal,26,Regional Manager
2,Jim,32,Salesman


In [12]:
# Following is the object of series. (series of rows of a single column)
employees_df['name']

0      Sally
1    Micheal
2        Jim
Name: name, dtype: object

In [13]:
# Accessing multiple columns. We simply pass in a LIST of column names
employees_df[['name', 'job']]

Unnamed: 0,name,job
0,Sally,ATRM
1,Micheal,Regional Manager
2,Jim,Salesman


In [14]:
# see all the column names
df.columns

Index(['sepal.length', 'sepal.width', 'petal.length', 'petal.width',
       'variety'],
      dtype='object')

In [15]:
df.columns[0]

'sepal.length'

In [16]:
# in order to select rows we can use 'iloc' and 'loc'

# ILOC allows us to access rows by integer valued locations
df.iloc[0]

sepal.length       NaN
sepal.width        3.5
petal.length       1.4
petal.width        0.2
variety         Setosa
Name: 0, dtype: object

In [17]:
# Accessing multiple rows. We simply pass in a LIST of integer valued locations
df.iloc[[0, 1]]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa


In [18]:
# Setting manual indices for rows
employees_df.index = ['S', 'M', 'J']
employees_df

Unnamed: 0,name,age,job
S,Sally,24,ATRM
M,Micheal,26,Regional Manager
J,Jim,32,Salesman


In [20]:
# Accessing the row through manual indices
employees_df.loc['M']

name             Micheal
age                   26
job     Regional Manager
Name: M, dtype: object

In [23]:
# Pre-defined lists
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr =  [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]

# Create dictionary my_dict with three key:value pairs: my_dict
my_dict = {
    'country': names,
    'drives_right': dr,
    'cars_per_cap': cpc
}

# Build a DataFrame cars from my_dict: cars
cars = pd.DataFrame(my_dict)

# Print cars
print(cars)

         country  drives_right  cars_per_cap
0  United States          True           809
1      Australia         False           731
2          Japan         False           588
3          India         False            18
4         Russia          True           200
5        Morocco          True            70
6          Egypt          True            45


In [24]:
# Slicing in a dataframe
cars[1:4]

Unnamed: 0,country,drives_right,cars_per_cap
1,Australia,False,731
2,Japan,False,588
3,India,False,18
