# Practice 4 - Pandas

In [159]:
import numpy as np
import pandas as pd

A **Series** is a pandas dataframe that incorperates aspects from dictionaries and NumPy arrays to allow for lookups, and data alignment based operations

In [160]:
# Series from dictionary
countries = {'USA': 331002651, 'India': 1380004385, 'China': 1439323776, 'Brazil': 212559417}
dict_series = pd.Series(countries)
print("Series from dictionary:\n", dict_series)

Series from dictionary:
 USA        331002651
India     1380004385
China     1439323776
Brazil     212559417
dtype: int64


In [161]:
# Series from two lists
evolution = ['Natural Selection', 'Mutation', 'Genetic Drift', 'Gene Flow']
years = [1859, 1900, 1930, 1950]
list_series = pd.Series(years,evolution)
print(f"Series from two lists:\n{list_series}")

Series from two lists:
Natural Selection    1859
Mutation             1900
Genetic Drift        1930
Gene Flow            1950
dtype: int64


In [162]:
# Element-wise operations
growth_rates = pd.Series([0.7, 1.0, 0.5, 0.9], ['USA', 'India', 'China', 'Brazil'])
projected_pop = dict_series * (1 + growth_rates)
print("Projected populations after one year:\n", projected_pop)

Projected populations after one year:
 USA       5.627045e+08
India     2.760009e+09
China     2.158986e+09
Brazil    4.038629e+08
dtype: float64


In [163]:
# Mismatching indices
more_rates = pd.concat([growth_rates, pd.Series([0.8], ['Russia'])])
projected_pop_mismatch = dict_series * (1 + more_rates)
print("Projected populations with mismatched indices:\n", projected_pop_mismatch)

Projected populations with mismatched indices:
 Brazil    4.038629e+08
China     2.158986e+09
India     2.760009e+09
Russia             NaN
USA       5.627045e+08
dtype: float64


In [164]:
# Testing gemini autocompletion
additional_data = pd.Series([100, 200, 300], index=['A', 'B', 'C'])
print("Additional data series:\n", additional_data)

Additional data series:
 A    100
B    200
C    300
dtype: int64


In [165]:
# Without specification of index column
country_list = pd.Series(['US', 'UK', 'Canada', 'Australia', 'Lebanon'])
print(country_list)
print('Automatic indexing starts at 0 and increments by 1 for each value')
for idx, val in country_list.items():
    print("{0}\t{1}".format(idx,val))

0           US
1           UK
2       Canada
3    Australia
4      Lebanon
dtype: object
Automatic indexing starts at 0 and increments by 1 for each value
0	US
1	UK
2	Canada
3	Australia
4	Lebanon


In [166]:
# Indexing
print("By label:")
print(dict_series['India'])
print('By position:')
print(dict_series.iloc[1])

By label:
1380004385
By position:
1380004385


In [167]:
# Adding and removing elements
dict_series['Russia'] = 145934462
print("After adding Russia:")
for idx, val in dict_series.items():
    print("{0}\t{1}".format(idx,val))
print("")

dict_series = dict_series.drop('China')
print("After removing China:")
for idx, val in dict_series.items():
    print("{0}\t{1}".format(idx,format(val,".2E")))

# Population in millions
print()
pop_millions = projected_pop_mismatch / 1e6

for country, pop in pop_millions.items():
    if pop >= 1000:
        print("{0}\t{1:.2f} billion".format(country, pop/1000))
    elif pop == 'NaN':
        print('Missing data')
    else:
        print("{0}\t{1:.2f} million".format(country, pop))

After adding Russia:
USA	331002651
India	1380004385
China	1439323776
Brazil	212559417
Russia	145934462

After removing China:
USA	3.31E+08
India	1.38E+09
Brazil	2.13E+08
Russia	1.46E+08

Brazil	403.86 million
China	2.16 billion
India	2.76 billion
Russia	nan million
USA	562.70 million


In [168]:
# Adding a value
pop_millions['Canada'] = 38.01
print(pop_millions)

Brazil     403.862892
China     2158.985664
India     2760.008770
Russia            NaN
USA        562.704507
Canada      38.010000
dtype: float64


In [169]:
# Boolean operators
over_billion = pop_millions > 1000
print(over_billion)

Brazil    False
China      True
India      True
Russia    False
USA       False
Canada    False
dtype: bool


In [170]:
# Mismatched indices
series1 = pd.Series([1, 2, 3, 4], ['London', 'HongKong', 'Shanghai', 'Shenzhen'])
series2 = pd.Series([5, 6, 7, 8], ['London', 'Shenzhen', 'NewYork', 'Delhi'])
power = series1 ** series2
print(power)

Delhi          NaN
HongKong       NaN
London         1.0
NewYork        NaN
Shanghai       NaN
Shenzhen    4096.0
dtype: float64


In [171]:
# Series comprehensions
square_primes = pd.Series([x**2 for x in [2, 3, 5, 7, 11]], index=['two', 'three', 'five', 'seven', 'eleven'])
print(square_primes)
print()
print('Reversed order:')
print(square_primes[::-1])

two         4
three       9
five       25
seven      49
eleven    121
dtype: int64

Reversed order:
eleven    121
seven      49
five       25
three       9
two         4
dtype: int64


In [173]:
# In method
print('India' in power)
print('Delhi' in power)

False
True


## Series methods

In [None]:
# Check for mismatches (null values)