## Introduction to Dataset

In [5]:
import numpy as np
# Load the CSV with genfromtxt's default dtype (float): every field that
# cannot be parsed as a number comes back as nan.
world_alcohol = np.genfromtxt('world_alcohol.csv', delimiter=',')
print(world_alcohol[:2])
world_alcohol_type = type(world_alcohol)
print(world_alcohol_type)
# ⚠️ 'nan' means 'not a number'
# ⚠️ 'na' means 'not available'
# ⚠️ a NumPy array holds elements of one single data type
# ⚠️ remember how NumPy opens a file: 'genfromtxt' together with 'delimiter'

[[   nan    nan    nan    nan    nan]
 [ 1986.    nan    nan    nan     0.]]
<class 'numpy.ndarray'>


In [8]:
# Re-read the file, this time forcing every field to a Unicode string.
world_alcohol = np.genfromtxt(
    'world_alcohol.csv',
    delimiter=',',
    dtype='U75',
    skip_header=0,
)
print(world_alcohol[:2])
# ⚠️ by default genfromtxt parses every field as a float; we can specify the data type ourselves
# ⚠️ the 'dtype' parameter customizes the data type ('U75' = Unicode strings of up to 75 characters)
# ⚠️ skip_header: the number of lines to skip at the beginning of the file
#    (0 keeps the header row as the first row of the array)

[['Year' 'WHO region' 'Country' 'Beverage Types' 'Display Value']
 ['1986' 'Western Pacific' 'Viet Nam' 'Wine' '0']]


## Indexing Arrays

In [12]:
# Python lists are indexed one level at a time
list_of_lists = [[5, 10, 15], [20, 25, 30]]
first_item = list_of_lists[0]
first_item[2]

# ...which is the same as chaining the two lookups
list_of_lists[0][2]

# A NumPy array takes both indices inside a single pair of square brackets:
# matrix[row, column]
matrix = np.array(list_of_lists)  # same values as the nested list above
matrix[1, 2]

30

## Slicing Arrays

In [16]:
# ⚠️ a 2-D array takes one slice per dimension: array[rows, columns]
# every row, fifth column only
world_alcohol[:, 4]
# every row, first two columns
world_alcohol[:, :2]
# first 10 rows, first column only
world_alcohol[:10, 0]
# first 10 rows, every column
# (being the last expression, this is the one the cell displays)
world_alcohol[:10]

array([['Year', 'WHO region', 'Country', 'Beverage Types', 'Display Value'],
       ['1986', 'Western Pacific', 'Viet Nam', 'Wine', '0'],
       ['1986', 'Americas', 'Uruguay', 'Other', '0.5'],
       ['1985', 'Africa', "Cte d'Ivoire", 'Wine', '1.62'],
       ['1986', 'Americas', 'Colombia', 'Beer', '4.27'],
       ['1987', 'Americas', 'Saint Kitts and Nevis', 'Beer', '1.98'],
       ['1987', 'Americas', 'Guatemala', 'Other', '0'],
       ['1987', 'Africa', 'Mauritius', 'Wine', '0.13'],
       ['1985', 'Africa', 'Angola', 'Spirits', '0.39'],
       ['1986', 'Americas', 'Antigua and Barbuda', 'Spirits', '1.55']], 
      dtype='<U75')

## Array Comparisons Selecting Certain Elements

In [18]:
# ⚠️ Comparing an array with a single value compares that value against every
#    element and yields an array of True / False
# ⚠️ Such a Boolean vector can then be used to select elements of an array

# Boolean vector: True for every row whose country (third column) is Algeria
is_algeria = world_alcohol[:, 2] == 'Algeria'
# Keep only the matching rows, with all of their columns
country_algeria = world_alcohol[is_algeria, :]
country_algeria
# ⚠️ we filter rows only, so the column index is ':' (all columns);
#    this is a little different from filtering a DataFrame in pandas

array([['1984', 'Africa', 'Algeria', 'Spirits', '0.01'],
       ['1987', 'Africa', 'Algeria', 'Beer', '0.17'],
       ['1987', 'Africa', 'Algeria', 'Spirits', '0.01'],
       ['1986', 'Africa', 'Algeria', 'Wine', '0.1'],
       ['1984', 'Africa', 'Algeria', 'Other', '0'],
       ['1989', 'Africa', 'Algeria', 'Beer', '0.16'],
       ['1989', 'Africa', 'Algeria', 'Spirits', '0.01'],
       ['1989', 'Africa', 'Algeria', 'Wine', '0.23'],
       ['1986', 'Africa', 'Algeria', 'Spirits', '0.01'],
       ['1984', 'Africa', 'Algeria', 'Wine', '0.12'],
       ['1985', 'Africa', 'Algeria', 'Beer', '0.19'],
       ['1985', 'Africa', 'Algeria', 'Other', '0'],
       ['1986', 'Africa', 'Algeria', 'Beer', '0.18'],
       ['1985', 'Africa', 'Algeria', 'Wine', '0.11'],
       ['1986', 'Africa', 'Algeria', 'Other', '0'],
       ['1989', 'Africa', 'Algeria', 'Other', '0'],
       ['1987', 'Africa', 'Algeria', 'Other', '0'],
       ['1984', 'Africa', 'Algeria', 'Beer', '0.2'],
       ['1985', 'Africa', 'A

## Array Comparisons with Multiple Conditions

In [29]:
# Select the rows that satisfy both conditions: year 1986 AND country Algeria
year_is_1986 = world_alcohol[:, 0] == '1986'
country_is_algeria = world_alcohol[:, 2] == 'Algeria'
# Element-wise AND of the two Boolean vectors
is_algeria_and_1986 = year_is_1986 & country_is_algeria
rows_with_algeria_and_1986 = world_alcohol[is_algeria_and_1986, :]
# ⚠️ when the comparisons are written inline, wrap each one in parentheses:
#    '&' binds more tightly than '=='

## Array Comparisons Replacing Values

In [36]:
# ⚠️ Pattern: first locate the cells with a Boolean vector, then assign through it

# The relabelling examples run on a copy, so that world_alcohol keeps its
# original year and beverage labels for the later cells that still filter on
# them (for instance on the year '1986').
world_alcohol_relabeled = world_alcohol.copy()

# Replace 1986 in the first column with the string 2014
rows = (world_alcohol_relabeled[:, 0] == '1986')
world_alcohol_relabeled[rows, 0] = '2014'
# Replace Wine in the fourth column with the string Grog
rows = (world_alcohol_relabeled[:, 3] == 'Wine')
world_alcohol_relabeled[rows, 3] = 'Grog'

# Replacing Empty Strings
# This clean-up is applied to world_alcohol itself: the fifth column is later
# converted with astype(float), which cannot parse an empty string.
# The array holds strings, so the fill value is written as the string '0'.
rows = (world_alcohol[:, 4] == '')
world_alcohol[rows, 4] = '0'
# ⚠️ equivalent spelling (the first index picks the column, the second applies
#    the condition); it writes through because a basic slice is a view
world_alcohol[:, 4][rows] = '0'

## Converting Data Types with astype()

In [44]:
# Convert the fifth column to the float data type.
# The header row has to go first: its 'Display Value' label cannot be parsed
# as a float. The strip is guarded so that re-running this cell does not drop
# one more data row each time.
if world_alcohol.shape[0] > 0 and world_alcohol[0, 0] == 'Year':
    world_alcohol = world_alcohol[1:, :]
alcohol_consumption = world_alcohol[:, 4].astype(float)

# Total: Python's built-in sum() ...
total_alcohol = sum(alcohol_consumption)
# ... is equivalent to the array's own sum() method (up to floating-point
# rounding, and the method is the idiomatic choice for arrays)
total_alcohol = alcohol_consumption.sum()

# Average: dividing the total by the number of values ...
average_alcohol = total_alcohol / len(alcohol_consumption)
# ... is equivalent to the array's mean() method
average_alcohol = alcohol_consumption.mean()

## Total Canada_1986 Alcohol Consumption

In [48]:
# Rows for Canada in 1986.
# Assumes the year column of world_alcohol still holds its original '1986'
# labels, i.e. that they have not been overwritten in place.
year_is_1986 = world_alcohol[:, 0] == '1986'
country_is_canada = world_alcohol[:, 2] == 'Canada'
conditions = year_is_1986 & country_is_canada
# Boolean-mask indexing returns a copy, so the edits below stay local to
# canada_1986
canada_1986 = world_alcohol[conditions, :]

# Fill empty consumption values with 0 so they can be converted to float
is_empty = canada_1986[:, 4] == ''
canada_1986[is_empty, 4] = 0
# equivalent spelling: slice out the column first, then apply the condition
canada_1986[:, 4][canada_1986[:, 4] == ''] = 0

# Convert the column to float before summing it up
canada_values = canada_1986[:, 4].astype(float)
total_canada_drinking = canada_values.sum()