pandas module for analysing data

In [3]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [4]:
# Let's create a Series: a one-dimensional array of data together with its labels (the index).
# No index is passed here, so pandas assigns the default integer labels 0..3.

obj = Series(data=[3, 6, 9, 12])

# Show
obj

0     3
1     6
2     9
3    12
dtype: int64

In [6]:
# Let's show the values: the data of the Series as a NumPy array, without the labels
obj.values

array([ 3,  6,  9, 12], dtype=int64)

In [7]:
# Let's show the index (the labels); since none was given, it is the default integer range
obj.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
# Now let's create a Series with an explicit index: one label per value

# WW2 casualties, labelled by country
ww2_cas = Series(
    data=[8700000, 4300000, 3000000, 2100000, 400000],
    index=['USSR', 'Germany', 'China', 'Japan', 'USA'],
)

In [9]:
# Show the labelled Series
ww2_cas

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [10]:
# Use an index label to select the matching value from the Series
ww2_cas.loc['USA']

400000

In [12]:
# Print, for each country in index order, whether its casualties exceed 4 million.
# Iterating over the comparison result yields only the True/False flags, not the country labels.
for exceeds_4m in ww2_cas.gt(4000000):
    print(exceeds_4m)
    

True
True
False
False
False


In [13]:
# Comparing a Series to a number is an array operation that yields a boolean mask;
# the mask can be used directly as a selector.

# Keep only the entries with casualties greater than 4 million
ww2_cas.loc[ww2_cas > 4000000]

USSR       8700000
Germany    4300000
dtype: int64

In [14]:
# A Series can be treated like an ordered dictionary keyed by its index labels

# Check if USA is one of the labels of the Series
'USA' in ww2_cas.index

True

In [15]:
# A Series can be converted into a Python dictionary (index label -> value)
ww2_dict = ww2_cas.to_dict()

# Show
ww2_dict


{'USSR': 8700000,
 'Germany': 4300000,
 'China': 3000000,
 'Japan': 2100000,
 'USA': 400000}

In [16]:
# A dictionary can be converted back into a Series: the keys become the index labels
ww2_series = Series(data=ww2_dict)
ww2_series

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [17]:
# When a dictionary is passed to Series, the index takes the dict keys in order.
# An explicit list of labels can be supplied instead; note that 'Argentina' is not a key of ww2_dict.
countries = [
    'China',
    'Germany',
    'Japan',
    'USA',
    'USSR',
    'Argentina',
]

In [19]:
# Let's build a Series from the dict again, this time with an explicit index.
# Values are matched to the labels in `countries`; 'Argentina' has no entry in the
# dict, so its value is NaN (and the dtype becomes float64).
obj2 = Series(data=ww2_dict, index=countries)
obj2

China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA           400000.0
USSR         8700000.0
Argentina          NaN
dtype: float64

In [20]:
# We can use isnull and notnull to find missing data
# (method form shown here; pd.isnull(obj2) is the equivalent function form)
obj2.isnull()

China        False
Germany      False
Japan        False
USA          False
USSR         False
Argentina     True
dtype: bool

In [21]:
# The opposite check: True wherever a value is present
# (method form shown here; pd.notnull(obj2) is the equivalent function form)
obj2.notnull()

China         True
Germany       True
Japan         True
USA           True
USSR          True
Argentina    False
dtype: bool

In [22]:
# Let's see the ww2 Series again
ww2_series

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [23]:
# Let's check our Series that includes Argentina again
obj2

China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA           400000.0
USSR         8700000.0
Argentina          NaN
dtype: float64

In [24]:
# Now we can add the two Series: pandas automatically aligns the data by index label.
# A label whose value is missing on either side (Argentina) gives NaN in the result.
# Equivalent to: ww2_series + obj2
ww2_series.add(obj2)

Argentina           NaN
China         6000000.0
Germany       8600000.0
Japan         4200000.0
USA            800000.0
USSR         17400000.0
dtype: float64

In [27]:
# We can give a Series a name; it is shown in the output next to the dtype
obj2.name = "World War 2 Casualties"
obj2

China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA           400000.0
USSR         8700000.0
Argentina          NaN
Name: World War 2 Casualties, dtype: float64

In [29]:
# We can also name the index; the name is shown as a header above the labels
obj2.index.name = 'Countries'
obj2

Countries
China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA           400000.0
USSR         8700000.0
Argentina          NaN
Name: World War 2 Casualties, dtype: float64

In [None]:
#Next we'll learn DataFrames!
