# 1D Data - NumPy and Pandas

# <center> Pandas Series vs NumPy Arrays </center>

## NumPy Arrays

NumPy arrays are somewhat similar to Python lists. Below are some of the similarities and differences between the two:

<img src = ../figures/sim_diff_table.png />

NumPy Arrays also contain features that can be of great help in analyzing data:
   - Vectorized operations can be used on NumPy arrays.
   - Index arrays can be used to access specific slices or elements of a NumPy array.
   - Slices of a NumPy array point to the original array. Hence, any modification to a slice is reflected in the original array.
   

In [1]:
# Import NumPy and Pandas
import numpy as np
import pandas as pd

# Path prefix (directory plus file-name stem) for figure files
# NOTE(review): not referenced by any of the cells shown here -- confirm it is still needed
fig_prefix = '../figures/2015-09-08-as-numpy-pandas-lab-'

### Test Codes for NumPy Arrays

In [2]:
# Declaring 1D NumPy arrays
# Each array is created by passing a Python list literal to np.array()
# The four arrays are used position-by-position: element i of each data array is paired with countries[i]

# First 20 countries with employment data
# (the name `countries` is rebound to a plain Python list in the Pandas Series section)
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

# Female school completion rate in 2007 for those 20 countries
female_completion = np.array([
    97.35583,  104.62379,  103.02998,   95.14321,  103.69019,
    98.49185,  100.88828,   95.43974,   92.11484,   91.54804,
    95.98029,   98.22902,   96.12179,  119.28105,   97.84627,
    29.07386,   38.41644,   90.70509,   51.7478 ,   95.45072
])

# Male school completion rate in 2007 for those 20 countries
male_completion = np.array([
     95.47622,  100.66476,   99.7926 ,   91.48936,  103.22096,
     97.80458,  103.81398,   88.11736,   93.55611,   87.76347,
    102.45714,   98.73953,   92.22388,  115.3892 ,   98.70502,
     37.00692,   45.39401,   91.22084,   62.42028,   90.66958
])

In [3]:
# Accessing elements
# Elements can be accessed individually through indexing
# Indexing is zero-based: index 0 is the first element and index len(array) - 1 is the last

if False:                  # Change to True to run this block
    print countries[0]     # Prints the 1st element
    print countries[3]     # Prints the 4th element
    print countries[19]    # Prints the last (20th) element

In [4]:
# Slicing
# A range of elements can be accessed through slicing
# As with Python lists, a slice includes the start index but excludes the stop index

if False:                  # Change to True to run this block
    print countries[0:3]   # Prints the 1st to 3rd element (indices 0, 1, 2)
    print countries[:3]    # Same as above: an omitted start defaults to 0
    print countries[17:]   # Prints the 18th to last element: an omitted stop runs to the end
    print countries[:]     # Prints the whole array

In [5]:
# Element types
# Every NumPy array has a single dtype shared by all of its elements

if False:                                                          # Change to True to run this block
    #?countries.dtype                                              # Displays the docstring for dtype 

    print countries.dtype                                          # Array of Strings with a max string len of 22
    print employment.dtype                                         # Array of float64
    print np.array([0, 1, 2, 3]).dtype                             # Array of the platform's default integer (int32 or int64)
    print np.array([1.0, 1.5, 2.0, 2.5]).dtype                     # Array of float64
    print np.array([True, False, True]).dtype                      # Array of boolean (bool)
    print np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype           # Array of Strings with a max string len of 2

In [6]:
# Looping
# Traversing the array is almost the same as going through a Python list

# Iterate directly over the elements
if False:                                                         # Change to True to run the code
    for country in countries:
        print 'Examining country {}'.format(country)

# Iterate by position so that two parallel arrays can be read together
if False:                                                         # Change to True to run the code
    for i in range(len(countries)):
        country = countries[i]
        country_employment = employment[i]                        # Value at the same position as the country
        print 'Country {} has employment {}'.format(country,
                country_employment)        # End of the print statement started on the previous line

In [7]:
# NumPy functions
# A few of the aggregate methods available on a NumPy array

if False:                       # Change to True to run this block
    print employment.mean()     # Computes the mean of the employment array
    print employment.std()      # Computes the standard deviation (population form, ddof=0 by default)
    print employment.max()      # Finds the maximum value in the employment array
    print employment.sum()      # Computes the sum of all values in the employment array

In [8]:
# Finding the maximum value for the employment data and returning with country

#?np.argmax()                                         # Displays the docstring of np.argmax() function

def max_employment(countries, employments):
    '''
    Returns a tuple (country, employment) for the country with the maximum employment.

    Function arguments:
    countries   -- numpy array of countries
    employments -- numpy array of employments, in the same order as countries

    If the maximum value occurs more than once, the first occurrence is used
    (np.argmax returns the index of the first maximum).
    '''

    # Search the `employments` argument rather than the notebook-level
    # `employment` variable, so the result always matches the data passed in.
    max_index = np.argmax(employments)                # Index of the maximum value in the array
    return (countries[max_index], employments[max_index])

In [9]:
# Print the (country, employment) tuple for the country with the maximum employment
print max_employment(countries, employment)

('Angola', 75.699996949999999)


### Vectorized Operations

NumPy arrays can do vector operations such as vector addition, subtraction, scalar multiplication, etc.

In [10]:
# Arithmetic operations between 2 NumPy arrays
# Each operator is applied element by element to arrays of the same length
if False:                                # Change to True to run this block
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([1, 2, 1, 2, 2])

    print a + b                          # Element-wise sum
    print a - b                          # Element-wise difference
    print a * b                          # Element-wise product (not a dot product)
    print a / b                          # Element-wise division; NOTE(review): integer arrays floor under Python 2
    print a ** b                         # Element-wise power

# Arithmetic operations between a NumPy array and a single number
# The single number is applied to every element of the array
if False:                               # Change to True to run this block
    a = np.array([1, 2, 3, 4])
    b = 2

    print a + b                         # Adds 2 to every element
    print a - b                         # Subtracts 2 from every element
    print a * b                         # Multiplies every element by 2
    print a / b                         # Divides every element by 2; NOTE(review): floors under Python 2
    print a ** b                        # Squares every element

In [11]:
# Logical operations with NumPy Arrays
# &, | and ~ act element by element on boolean arrays
if False:                               # Change to True to run this block
    a = np.array([True, True, False, False])
    b = np.array([True, False, True, False])
    c = np.array([0, 1, 2, True])       # Mixing ints with a bool gives an integer array (True is stored as 1)

    print a & b                         # Element-wise AND
    print a | b                         # Element-wise OR
    print ~a                            # Element-wise NOT

    print a & True                      # AND with a single boolean leaves a unchanged
    print a & False                     # AND with False gives all False

    print a | True                      # OR with True gives all True
    print a | False                     # OR with a single False leaves a unchanged

    print c.dtype                       # Integer dtype, not bool
    print a | c                         # bool | int: a is treated as 0/1 and the result is an integer array

In [12]:
# Comparison operations between 2 NumPy Arrays
# Each comparison is done element by element and yields a boolean array
if False:                              # Change to True to run this block
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([5, 4, 3, 2, 1])

    print a > b                        # True where a's element is larger
    print a >= b                       # True where a's element is larger or equal
    print a < b                        # True where a's element is smaller
    print a <= b                       # True where a's element is smaller or equal
    print a == b                       # True where the elements are equal
    print a != b                       # True where the elements differ

# Comparison operations between a NumPy array and a single number
# Every element of the array is compared against the same number
if False:                             # Change to True to run this block
    a = np.array([1, 2, 3, 4])
    b = 2

    print a > b                       # True for elements greater than 2
    print a >= b                      # True for elements greater than or equal to 2
    print a < b                       # True for elements less than 2
    print a <= b                      # True for elements less than or equal to 2
    print a == b                      # True for elements equal to 2
    print a != b                      # True for elements not equal to 2

### Computing the overall completion rate for each country

In [13]:
# Computing the overall completion rate (i.e female and male completion rates combined) for every country

def overall_completion_rate(female_completion, male_completion):
    '''
    Combines the female and male completion rates into one overall rate per country.

    The result is the plain element-wise average of the two inputs, which
    amounts to assuming that half of the population is male and half is female.

    Keyword arguments:
    female_completion -- the numpy array of female completion rates
    male_completion   -- the numpy array of male completion rates
    '''

    combined_rates = female_completion + male_completion
    # A floating-point divisor keeps the average fractional even for integer rates
    return combined_rates / 2.0

In [14]:
# Compute the overall completion rate for each country (one value per position in the input arrays)
completion_rate = overall_completion_rate(female_completion, male_completion)
print completion_rate

[  96.416025  102.644275  101.41129    93.316285  103.455575   98.148215
  102.35113    91.77855    92.835475   89.655755   99.218715   98.484275
   94.172835  117.335125   98.275645   33.04039    41.905225   90.962965
   57.08404    93.06015 ]


### Standardizing the data

In [15]:
# Standardizing or normalizing the employment rates for the given 20 countries
def standardize_data(values):
    '''
    Converts each value to its standardized score (z-score): the number of
    standard deviations the value lies away from the mean of all values.

    Function Argument:
    values -- numpy array that contains the values

    Returns an array with the same length as values.
    '''
    deviations = values - np.mean(values)       # Signed distance of each value from the mean
    return deviations / np.std(values)          # Expressed in units of the standard deviation

In [16]:
# Standardize the employment rates for the countries
# Positive results are above the mean employment, negative results are below it
employment_standardize = standardize_data(employment)
print employment_standardize

[-0.31965231 -0.780123   -0.87650077  1.82207181 -0.03051941 -1.99019768
  0.30144772 -0.16973184  0.23719615  0.84758731  0.18365304  1.00821665
  0.87971351 -0.56595055 -1.07996476 -0.20185762  1.38301845 -0.03051941
  1.2545153  -1.87240259]


### Index Arrays

In [17]:
# Using index arrays
# A boolean array of the same length selects the elements at the positions that are True
if False:                                               # Change to True to display this block
    a = np.array([1, 2, 3, 4])
    b = np.array([True, True, False, False])

    print a[b]                                          # Only prints the values of a whose index is True in b
    print a[np.array([True, False, True, False])]       # Same technique with the mask written inline (a different mask, so different values)

### Vectorized Operations

In [18]:
# Creating the index array using vectorized operations
if False:                                              # Change to True to display this block
    a = np.array([1, 2, 3, 2, 1])                      # Sample data
    b = (a >= 2)                                       # Vector operation with a comparison operator gives a boolean array

    print b                                            # The boolean index array itself
    print a[b]                                         # Values of a at the positions where b is True
    print a[a >= 2]                                    # Same result in one step: values of a that are greater than or equal to 2

In [19]:
# Creating the index array using vectorized operations on another array
# The mask is computed from b but applied to a, so the arrays are matched by position
if False:                                              # Change to True to display this block
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([1, 2, 3, 2, 1])

    print b == 2                                       # True at the positions where b equals 2
    print a[b == 2]                                    # Values of a at those same positions

In [20]:
# Display the countries with male or female completion rate greater than or equal to 100
# Each comparison yields a boolean array; | combines them element-wise into a single index array
print countries[((male_completion>=100) | (female_completion>=100))]

['Albania' 'Algeria' 'Argentina' 'Australia' 'Bahrain' 'Belarus']


### Modifying Slices

In [21]:
# Modifying the original array through slices
if False:                                               # Change to True to display this block
    a = np.array([1, 2, 3, 4, 5])
    b = a[2:]                                           # Assign a slice to a variable b
    b[:] = 0                                            # Modify b's elements in place

    print a                                             # Modifications are reflected in a

# Modifying an array with Not-in-place modification
if False:                                               # Change to True to display this block
    a = np.array([1, 2, 3, 4, 5])
    b = a[2:]                                           # b starts out as a slice of a
    b = b * 2                                           # Assigns b to new array thus not pointing to a anymore

    print b                                             # The doubled values
    print a                                             # a is unchanged

# Modifying an array with In-place modification
if False:                                               # Change to True to display this block
    a = np.array([1, 2, 3, 4, 5])
    b = a[2:]                                           # b is a slice of a
    b *= 2                                              # Carries the operation in-place thus reflecting it in a

    print a                                             # The last three elements of a are doubled

## Pandas Series

A Pandas Series is similar to a NumPy array, but with extra functionality.

Some similarities of Pandas Series with NumPy Arrays:
   - Accessing elements through indexing or slicing
   - Looping or traversing through the series
   - Convenient functions such as s.mean(), s.max(), etc.
   - Vectorized Operations
   - Like NumPy arrays, a Pandas Series is implemented in C and is therefore much faster.
   
Some advantages of the Pandas Series are:
   - Extra functionality or functions (**e.g.** s.describe() - displays some statistics about the series)
   - Enables the use of series indexes
   - Values are matched up through index rather than position


### Test Codes for Pandas Series

In [22]:
# Python Lists
# 20 countries
# NOTE: this rebinds `countries`, previously a NumPy array holding a different set of
# 20 countries (that one starts with 'Afghanistan' and has no 'Andorra'), to a Python list
countries = ['Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
             'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
             'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
             'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia']

# Life expectancy in 2007 for those 20 countries
life_expectancy_values = [74.7,  75. ,  83.4,  57.6,  74.6,  75.4,  72.3,  81.5,  80.2,
                          70.3,  72.1,  76.4,  68.1,  75.2,  69.8,  79.4,  70.8,  62.7,
                          67.3,  70.6]

# GDP in 2007 for those 20 countries
gdp_values = [ 1681.61390973,   2155.48523109,  21495.80508273,    562.98768478,
              13495.1274663 ,   9388.68852258,   1424.19056199,  24765.54890176,
              27036.48733192,   1945.63754911,  21721.61840978,  13373.21993972,
                483.97086804,   9783.98417323,   2253.46411147,  25034.66692293,
               3680.91642923,    366.04496652,   1175.92638695,   1132.21387981]

In [23]:
# Declaring 1D Pandas Series from the Python lists
# No index is passed, so each Series gets the default integer index 0, 1, 2, ...
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)

In [24]:
# Accessing elements through indexing and slicing
# With the default integer index, label 0 is also position 0
if False:                                     # Change to True to run this block
    print life_expectancy[0]                  # Prints the 1st element
    print gdp[3:6]                            # Prints the 4th to 6th element (stop index excluded)

In [25]:
# Looping
# Iterating over a Series yields its values (not its index labels)
if False:                                     # Change to True to run this block
    for country_life_expectancy in life_expectancy:
        print 'Examining life expectancy {}'.format(country_life_expectancy)

In [26]:
# Pandas functions
# Series offers the same kind of aggregate methods as NumPy arrays
if False:                                     # Change to True to run this block
    print life_expectancy.mean()              # Prints the mean of the data
    print life_expectancy.std()               # Prints the standard deviation (sample form, ddof=1 by default, unlike NumPy)
    print gdp.max()                           # Prints the maximum value in the data
    print gdp.sum()                           # Prints the sum of the values in the data

In [27]:
# Vectorized operations and index arrays
if False:                                     # Change to True to run this block
    a = pd.Series([1, 2, 3, 4])
    b = pd.Series([1, 2, 1, 2])

    print a + b                               # Series addition
    print a * b                               # Series multiplication with another series
    print a * 2                               # Series multiplication with a single number
    print a >= 3                              # Using a comparison operator on a series gives a boolean series
    print a[a >= 3]                           # Using index arrays to print values greater than or equal to 3

In [28]:
# Function that determines if two variables are correlated just by position of values from the mean
# Positive Correlation if the first element of the return value tuple is significantly larger than the second
# Negative Correlation if the second element of the return value tuple is significantly larger than the first
# No Correlation if the return values of the tuple are roughly equal
def variable_correlation(variable1, variable2):
    '''
    Counts how often variable1 and variable2 lie on the same side of their
    own means, and how often they lie on opposite sides.

    A value exactly equal to the mean is counted as being above it.

    Returns a tuple (num_same_direction, num_different_direction).
    '''

    # After standardizing, a z-score >= 0 means the value is at or above the mean
    # Multiplying the boolean result by 1 converts it to ints: 0-False, 1-True
    above_mean1 = (standardize_data(variable1) >= 0) * 1
    above_mean2 = (standardize_data(variable2) >= 0) * 1

    # Per data point: 0 = both below the mean, 1 = one above and one below, 2 = both above
    above_count = above_mean1 + above_mean2

    num_different_direction = len(above_count[above_count == 1])
    num_same_direction = (len(above_count[above_count == 0]) +
                          len(above_count[above_count == 2]))

    return (num_same_direction, num_different_direction)

In [29]:
# Determining the correlation between a country's gdp and life expectancy
# The result is the tuple (num_same_direction, num_different_direction)
variable_correlation(gdp, life_expectancy)

(17, 3)

### Series Index

In [48]:
# Declaring a Pandas Series with a series index

# Re-assigning employment (a NumPy array so far) to a Pandas Series with countries as index
# NOTE(review): the employment values were entered in the order of the NumPy `countries`
# array (Afghanistan, Albania, Algeria, ...), but `countries` now refers to the Python list
# from the Pandas section (Albania, Algeria, Andorra, ...). The values therefore appear to be
# labelled with the wrong countries (e.g. 55.7 is paired with Afghanistan in the NumPy
# section but is labelled Albania here) -- verify the pairing.
employment = pd.Series(employment, index=countries)  # Values keep their positions; labels come from countries
print employment

Albania                55.700001
Algeria                51.400002
Andorra                50.500000
Angola                 75.699997
Antigua and Barbuda    58.400002
Argentina              40.099998
Armenia                61.500000
Australia              57.099998
Austria                60.900002
Azerbaijan             66.599998
Bahamas                60.400002
Bahrain                68.099998
Bangladesh             66.900002
Barbados               53.400002
Belarus                48.599998
Belgium                56.799999
Belize                 71.599998
Benin                  58.400002
Bhutan                 70.400002
Bolivia                41.200001
dtype: float64


In [49]:
# Before we could access a series through indexing
# NOTE(review): with string labels, [0] falls back to positional access; newer pandas
# versions deprecate that fallback in favour of .iloc[0] -- confirm against the pandas version in use
print employment[0]                          # Right terminology is: 'Access the employment at position 0'

# New approach to access series
# Using the Pandas .loc indexer (label-based; used with square brackets, not called like a function)
print employment.loc['Bhutan']               # Access the employment for index label 'Bhutan'

# Using the Pandas .iloc indexer (position-based)
print employment.iloc[0]                     # Access the employment at the given position

55.70000076
70.40000153
55.70000076


In [58]:
# Finding the maximum value in the employment data
#?employment.argmax                          # Displays the docstring for pd.argmax()
def max_employment(employment):
    '''
    Returns the maximum employment with the corresponding country in tuple (max_country, employment.loc[max_country])

    Return argument:
    max_country -- country (index label) with the maximum employment rate.

    Function argument:
    employment -- pandas Series with countries as index

    If the maximum value occurs more than once, the first occurrence is used.
    '''

    # idxmax() returns the index *label* of the maximum value. Series.argmax()
    # returns the integer *position* in current pandas versions, which cannot be
    # passed to the label-based .loc lookup below when the index holds country names.
    max_country = employment.idxmax()

    return (max_country, employment.loc[max_country])

In [66]:
# Addition when indexes are the same
# Values with the same label are added: a=11, b=22, c=33, d=44
if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print s1 + s2

# Indexes have same elements in a different order
# Values are still matched by label, not by position: a=1+30, b=2+10, c=3+40, d=4+20
if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
    print s1 + s2

# Indexes overlap, but do not have exactly the same elements
# Labels present in both series are added (c=13, d=24); labels present in only one give NaN
if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
    print s1 + s2

# Indexes do not overlap
# No label is shared, so every entry of the result is NaN
if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
    print s1 + s2

# Hence operations on Pandas Series are by index not by position