### Creating a Pandas Dataframe

* [From a list of tuples](#create-first)
* [From a dictionary](#create-second)
* [Loading a CSV file](#create-third)
* [Built-in datasets](#create-fourth)


In [1]:
# This is the customary way of importing pandas
import pandas as pd


#### (a) From a list of tuples <a class="anchor" id="create-first"></a>

In [29]:
name = ['Bob','Jessica','Mary','John','Mel']
age = [16, 35, 77, 57, 23]

people = list ( zip(name,age))
people

[('Bob', 16), ('Jessica', 35), ('Mary', 77), ('John', 57), ('Mel', 23)]

In [38]:
df = pd.DataFrame(data=people, columns=['Name','Age'])

In [31]:
df

Unnamed: 0,Name,Age
0,Bob,16
1,Jessica,35
2,Mary,77
3,John,57
4,Mel,23


#### (b) From a dictionary <a class="anchor" id="create-second"></a>

In [26]:
population_dict = { 'Country': [ 'China', 'India', 'United States', 'Indonesia' ],
                    'Population' : [1415045928, 1354051854, 326766748, 266794980] }

for k,v in population_dict.items():
    print (k,v)

Country ['China', 'India', 'United States', 'Indonesia']
Population [1415045928, 1354051854, 326766748, 266794980]


In [27]:
df = pd.DataFrame(population_dict)

In [28]:
df.head()

Unnamed: 0,Country,Population
0,China,1415045928
1,India,1354051854
2,United States,326766748
3,Indonesia,266794980


#### (c) From a CSV file <a class="anchor" id="create-third"></a>

In [45]:
# The option sep="," is used to indicate field separators
df = pd.read_csv('misc/population.csv',sep=",") # The file name can be replaced with a URL
df.head()

Unnamed: 0,Country Name,Country Code,Year,Value
0,Arab World,ARB,1960,92490932.0
1,Arab World,ARB,1961,95044497.0
2,Arab World,ARB,1962,97682294.0
3,Arab World,ARB,1963,100411076.0
4,Arab World,ARB,1964,103239902.0


#### (d) Built-in datasets <a class="anchor" id="create-fourth"></a>

**In-class exercise** Packages like sklearn and seaborn come with practice datasets. Create a pandas dataframe from iris dataset below.

In [46]:
from sklearn.datasets import load_iris
import pandas as pd

In [47]:
iris = load_iris()

In [48]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [49]:
type(iris.data)

numpy.ndarray

### Meta-data about the dataframe

In [51]:
# The number of rows and columns
df.shape

(14885, 4)

In [52]:
df.columns

Index(['Country Name', 'Country Code', 'Year', 'Value'], dtype='object')

In [53]:
# This command is useful to see the datatypes of columns and also check if there there are any NULL objects.
# This dataframe has none.  
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14885 entries, 0 to 14884
Data columns (total 4 columns):
Country Name    14885 non-null object
Country Code    14885 non-null object
Year            14885 non-null int64
Value           14885 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 465.2+ KB


In [54]:
# Summary of numerical solumns
df.describe()

Unnamed: 0,Year,Value
count,14885.0,14885.0
mean,1988.068928,204797200.0
std,16.456023,675923500.0
min,1960.0,4279.0
25%,1974.0,912417.0
50%,1988.0,6299909.0
75%,2002.0,41773000.0
max,2016.0,7442136000.0


In [55]:
# How are the rows indexed? By default, pandas enumerates the rows when a csv file is loaded.
df.index

RangeIndex(start=0, stop=14885, step=1)

### Accessing rows and columns


In [56]:
# See the first few rows
df.head(3)

# Similar commands: 
# df.tail()
# df.sample()

Unnamed: 0,Country Name,Country Code,Year,Value
0,Arab World,ARB,1960,92490932.0
1,Arab World,ARB,1961,95044497.0
2,Arab World,ARB,1962,97682294.0


**In-class exercise.** Randomly sample 20 distinct countries from the list of countries in the table.

In [63]:
import numpy as np
df.describe(include=np.object)

Unnamed: 0,Country Name,Country Code
count,14885,14885
unique,263,263
top,Ghana,FRO
freq,57,57


In [57]:
# Select columns by their names
df[ ['Country Name', 'Value'] ].head() # Notice the double [[]].

Unnamed: 0,Country Name,Value
0,Arab World,92490932.0
1,Arab World,95044497.0
2,Arab World,97682294.0
3,Arab World,100411076.0
4,Arab World,103239902.0


In [None]:
# Get rows by their indices. This is similar slicing in lists. iloc means "integer location"

df.iloc[0:4]


In [None]:
# The general for of iloc is df.iloc[ row_indexer , col_indexer].

df.iloc[ 0:4, [1,2] ]

In [None]:
# iloc does not support label-based access. In this case we must drop use loc.
# df.iloc[ 0:1, ['Year','Value']  ] <-----Invalid


#### Set the index

In [None]:
df.set_index(keys=['Country Name'],inplace=True)

In [None]:
df.index

In [None]:
# loc is used of label-based access. Format: df.loc[ labeled_row_indexer, labeled_col_indexer]
# Both row-indexer and column indexer have to be label and cannot have numbers.

df.loc[ 'India':'Indonesia',  ['Value'] ].sample(5)


*Remark* Another way to access elements is using df.ix. We will not discuss it as it will become deprecated in the upcoming Pandas version.

#### Boolean indexing

In [2]:
df = pd.read_csv('misc/population.csv')

In [3]:
select_condition_1 = df['Value']>1000000000
# select_condition1 is now a boolean mask

In [4]:
df[select_condition_1].sample(3)

Unnamed: 0,Country Name,Country Code,Year,Value
250,East Asia & Pacific,EAS,1982,1606486000.0
1467,Low & middle income,LMY,2002,5192002000.0
259,East Asia & Pacific,EAS,1991,1845102000.0


In [5]:
selection_condition_2 = df['Country Name'] == 'China'

In [6]:
# We select rows that satisfy both conditions. 
# Boolean index can be made from a combination of 
# logical operators of AND &, OR |, NOT ~.

df[select_condition_1 & selection_condition_2 ].sample(5)


Unnamed: 0,Country Name,Country Code,Year,Value
4993,China,CHN,1994,1191835000.0
4988,China,CHN,1989,1118650000.0
5008,China,CHN,2009,1331260000.0
5000,China,CHN,2001,1271850000.0
5002,China,CHN,2003,1288400000.0


In [7]:
# isin is a useful operator when building a boolean index
selection_condition_3 = df['Country Name'].isin( ['China','India'] )
df[selection_condition_3].sample(5)

Unnamed: 0,Country Name,Country Code,Year,Value
7669,India,IND,1996,978893200.0
7660,India,IND,1987,816792700.0
4996,China,CHN,1997,1230075000.0
7649,India,IND,1976,635771700.0
7661,India,IND,1988,834489300.0


In [8]:
# where is useful when we want to retain the shape of the original table.
# The values that dont match the selection critieria are set to NaN
df.where(df['Year']>2011).shape


(14885, 4)

**In-class exercise**. Retreive all the countries whose GDP is more than three standard deviations from the average.

In [16]:
df.columns

Index(['Country Name', 'Country Code', 'Year', 'Value'], dtype='object')

In [96]:
df[abs(df.Value-df.Value.mean())>3*df.Value.std()]['Country Name'].shape





(342,)

In [25]:
df.dtypes

Country Name     object
Country Code     object
Year              int64
Value           float64
dtype: object

### Modifying data

* [Adding rows](#modify-first)
* [Adding columns](#modify-second)
* [Sorting](#modify-third)

#### Adding rows <a class="anchor" id="modify-first"></a>

In [106]:
# We use a dataset containing two sets of marks for a few students

import pandas as pd

df = pd.read_csv('misc/studentmarks2.csv', sep=",", header=None)

In [107]:
df.columns = ['Name', 'Marks1', 'Marks2']

In [108]:
df

Unnamed: 0,Name,Marks1,Marks2
0,priya,25,25
1,sandesh,20,45
2,adil,30,30
3,ranjan,40,25
4,shubha,20,15
5,james,15,34
6,himanshu,20,20
7,aryan,37,20
8,soumya,40,15
9,vikram,45,30


In [109]:
df.columns

Index(['Name', 'Marks1', 'Marks2'], dtype='object')

In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 3 columns):
Name      11 non-null object
Marks1    11 non-null int64
Marks2    11 non-null int64
dtypes: int64(2), object(1)
memory usage: 344.0+ bytes


In [111]:
# Add two students to the existing data. We first make a dataframe out of the new data and then append
# to the old dataframe

new_data = pd.DataFrame( [ ['ram',30,40],['sana',25,60] ], columns=['Name','Marks1','Marks2'] )
df.append(new_data)


Unnamed: 0,Name,Marks1,Marks2
0,priya,25,25
1,sandesh,20,45
2,adil,30,30
3,ranjan,40,25
4,shubha,20,15
5,james,15,34
6,himanshu,20,20
7,aryan,37,20
8,soumya,40,15
9,vikram,45,30


In [112]:
# Notice that there is a common index for two students. There are two ways to avoid this situation:
# Either use ignore_index=True option when appending or reset the index as shown below
df.iloc[1]

Name      sandesh
Marks1         20
Marks2         45
Name: 1, dtype: object

In [113]:
df.reset_index # This command remove the newly added rows

# We append the row the right way by choosing to ignore the index. 
# This method does not change df.

df.append(new_data,ignore_index=True)

Unnamed: 0,Name,Marks1,Marks2
0,priya,25,25
1,sandesh,20,45
2,adil,30,30
3,ranjan,40,25
4,shubha,20,15
5,james,15,34
6,himanshu,20,20
7,aryan,37,20
8,soumya,40,15
9,vikram,45,30


#### Adding columns <a class="anchor" id="modify-second"></a>

In [114]:
#Add a column explicitly

df['Grade'] = ['Fourth','Fourth','Third',"Third","Third","Second","Second","Second","Third","Second","Second" ]


In [115]:
df.shape

(11, 4)

In [116]:
df

Unnamed: 0,Name,Marks1,Marks2,Grade
0,priya,25,25,Fourth
1,sandesh,20,45,Fourth
2,adil,30,30,Third
3,ranjan,40,25,Third
4,shubha,20,15,Third
5,james,15,34,Second
6,himanshu,20,20,Second
7,aryan,37,20,Second
8,soumya,40,15,Third
9,vikram,45,30,Second


In [117]:
# Adding a derived column

df['Total'] = df['Marks1'] + df['Marks2']

In [118]:
df

Unnamed: 0,Name,Marks1,Marks2,Grade,Total
0,priya,25,25,Fourth,50
1,sandesh,20,45,Fourth,65
2,adil,30,30,Third,60
3,ranjan,40,25,Third,65
4,shubha,20,15,Third,35
5,james,15,34,Second,49
6,himanshu,20,20,Second,40
7,aryan,37,20,Second,57
8,soumya,40,15,Third,55
9,vikram,45,30,Second,75


#### Rearranging columns

In [119]:
df = df[ ['Name','Grade','Marks1','Marks2','Total'] ]

#### Sorting <a class="anchor" id="modify-third"></a>

In [124]:
# Sorting based on a list of columns is easy. This however does not modify the dataframe.
# In order to modify the table use the option inplace=True.

df.sort_values(by=['Total','Marks1'],ascending=[False,True],inplace=True)


In [132]:
df.reset_index(inplace=True)

In [133]:
df.head()

Unnamed: 0,index,Name,Grade,Marks1,Marks2,Total
0,9,vikram,Second,45,30,75
1,1,sandesh,Fourth,20,45,65
2,3,ranjan,Third,40,25,65
3,2,adil,Third,30,30,60
4,10,asha,Second,30,30,60


In [134]:
del df['index']
df.head()

Unnamed: 0,Name,Grade,Marks1,Marks2,Total
0,vikram,Second,45,30,75
1,sandesh,Fourth,20,45,65
2,ranjan,Third,40,25,65
3,adil,Third,30,30,60
4,asha,Second,30,30,60


In [135]:
df.drop(0,inplace=True)
df.head()

Unnamed: 0,Name,Grade,Marks1,Marks2,Total
1,sandesh,Fourth,20,45,65
2,ranjan,Third,40,25,65
3,adil,Third,30,30,60
4,asha,Second,30,30,60
5,aryan,Second,37,20,57
