# NumPy
NumPy arrays are more efficient to operate on than lists. We can create arrays with numpy.array(), which may be single- or multidimensional. These are built for homogeneous numeric data.

## Creating & Iterating through Arrays

#### Declaring NumPy array

In [5]:
import numpy as np
numbers = np.array([2,3,5,7,11])
numbers

array([ 2,  3,  5,  7, 11])

In [7]:
type(numbers)

numpy.ndarray

In [11]:
np.array([[1,2,3],[4,5,6]])

array([[1, 2, 3],
       [4, 5, 6]])

#### Array Dimensions
Determine an np array's dimensions with:
1. ndim (number of dimensions)
2. shape (length of each dimension, as a tuple)
3. size (total number of elements)

In [27]:
integers = np.array([[1,2,3],[4,5,6]])
floats = np.array([0.0,0.1,0.2,0.3,0.4])

print(integers.ndim)
print(floats.ndim)

print(integers.shape)
print(floats.shape)

print(integers.size)

2
1
(2, 3)
(5,)
6


#### Iterating Through Arrays

In [30]:
for row in integers:
    for column in row:
        print(column,end=' ')
    print()

1 2 3 
4 5 6 


In [32]:
for i in integers.flat:
    print(i,end=' ')

1 2 3 4 5 6 

#### Zeros, Ones & Full Arrays
We can create pre-filled arrays full of zeros, ones, or a specified value.

In [37]:
print(np.zeros(5))

[0. 0. 0. 0. 0.]


In [41]:
print(np.ones((2,4),dtype=int))

[[1 1 1 1]
 [1 1 1 1]]


In [43]:
print(np.full((3,5),13))

[[13 13 13 13 13]
 [13 13 13 13 13]
 [13 13 13 13 13]]


#### Creating Integer & Float Ranges

In [54]:
# arange(end) - integers from 0 up to, but not including, end
np.arange(5)

array([0, 1, 2, 3, 4])

In [56]:
# arange(begin,end)
np.arange(5,10)

array([5, 6, 7, 8, 9])

In [59]:
# linspace(begin,end,number of evenly spaced values) - end is included
np.linspace(0,1,5)

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [71]:
# reshape(rows,columns)
np.arange(1,21).reshape(4,5)

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20]])

## Working with Arrays

#### Arithmetic Operations with Arrays

In [82]:
numbers = np.array([1,2,3,4,5])

# Multiplication (element wise - scalar)
print(numbers * 2)

# Exponentiation (element wise - scalar)
print(numbers ** 3)

[ 2  4  6  8 10]
[  1   8  27  64 125]


In [86]:
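# updating arrays with incremented value
# NOTE: += modifies the array in place, so re-running this cell adds 10 again
# (the output below appears to come from a second run: 1..5 -> 11..15 -> 21..25)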
numbers += 10
print(numbers)

[21 22 23 24 25]


In [88]:
# Arithmetic Between Arrays
numbers_2 = [2,4,5,6,7]
numbers * numbers_2

array([ 42,  88, 115, 144, 175])

#### Universal Functions
Perform element-wise operations:
1. MATH: add, subtract, multiply, divide, remainder, exp, log, sqrt, power
2. TRIGONOMETRY: sin, cos, tan, hypot, arcsin, arccos, arctan
3. COMPARISON: greater, greater_equal, less, less_equal, not_equal, logical_and, logical_or, logical_xor, logical_not, minimum, maximum

In [131]:
numbers = np.array([1,4,9,16,25,36])
numbers_2 = np.arange(1,7) * 10

# sqrt on arrays
print(np.sqrt(numbers))

# adding arrays - same as numbers + numbers_2
print(np.add(numbers,numbers_2))

# multiplying arrays - same as numbers * numbers_2
print(np.multiply(numbers,numbers_2))

[1. 2. 3. 4. 5. 6.]
[11 24 39 56 75 96]
[  10   80  270  640 1250 2160]


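#### Comparisons with Arrays
Note: judging by the execution counts and the 5-element results, the outputs below were produced with the earlier 5-element `numbers` ([21 22 23 24 25]) and `numbers_2` ([2, 4, 5, 6, 7]), not the 6-element arrays defined in the Universal Functions cell above.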

In [91]:
numbers == numbers_2

array([False, False, False, False, False])

In [93]:
numbers >= numbers_2

array([ True,  True,  True,  True,  True])

In [95]:
numbers != numbers_2

array([ True,  True,  True,  True,  True])

In [97]:
numbers < numbers_2

array([False, False, False, False, False])

#### NumPy Array Calculations

In [7]:
grades = np.array([50,84,90,100,87])

# sum
print(grades.sum())

# min
print(grades.min())

# max
print(grades.max())

# mean
print(grades.mean())

# std
print(grades.std())

# variance
print(grades.var())

411
50
100
82.2
16.975276139138355
288.15999999999997


In [135]:
# axis = 0 ~ performs calculation on all row values for each COLUMN
# axis = 1 ~ performs calculation on all column values for each ROW

grades = np.array([[87,96,70],
                  [100,87,90],
                  [94,77,90],
                  [100,81,82]])

# by column
print(grades.mean(axis=0))

# by row
print(grades.mean(axis=1))

[95.25 85.25 83.  ]
[84.33333333 92.33333333 87.         87.66666667]


#### Array Slicing

In [155]:
grades = np.array([[87,96,70],
                  [100,87,90],
                  [94,77,90],
                  [100,81,82]])

# array[row, column]

# selecting a row
print(grades[1],end='\n\n')

# slicing rows
print(grades[0:2],end='\n\n')

# selecting multiple specific rows with a list of row indices
print(grades[[1,3]],end='\n\n')

# selecting a column (all rows, column 0)
print(grades[:,0],end='\n\n')

[100  87  90]

[[ 87  96  70]
 [100  87  90]]

[[100  87  90]
 [100  81  82]]

[ 87 100  94 100]



#### Views (Shallow Copies)
Views act somewhat like pointers (C++) in that they "see" data held by another object rather than having their own copy of it. The view and the original are two separate array objects, but they share the same underlying data.
- changes made to one DO affect the other

In [177]:
numbers = np.array([1,4,9,16,25,36])

numbers_2 = numbers.view()

numbers *= 10

print(numbers)
print(numbers_2)

print(id(numbers))
print(id(numbers_2))

[ 10  40  90 160 250 360]
[ 10  40  90 160 250 360]
2377961062544
2377961062448


#### Deep Copies
Deep copies give each array its own independent copy of the data.
- changes made to one do NOT affect the other

In [175]:
numbers = np.array([1,4,9,16,25,36])

numbers_2 = numbers.copy()

numbers *= 10

print(numbers)
print(numbers_2)

[ 10  40  90 160 250 360]
[ 1  4  9 16 25 36]


#### Transpose

In [181]:
# transpose with .T
grades = np.array([[87,96,70],
                  [100,87,90],
                  [94,77,90],
                  [100,81,82]])

print(grades.T)

[[ 87 100  94 100]
 [ 96  87  77  81]
 [ 70  90  90  82]]


#### Horizontal / Vertical Stacking

In [184]:
grades = np.array([[100,96,70],
                 [100,87,90]])

grades2 = np.array([[94,77,90],
                    [100,81,82]])

In [186]:
# Horizontal Stacking
np.hstack((grades,grades2))

array([[100,  96,  70,  94,  77,  90],
       [100,  87,  90, 100,  81,  82]])

In [190]:
# Vertical Stacking
np.vstack((grades,grades2))

array([[100,  96,  70],
       [100,  87,  90],
       [ 94,  77,  90],
       [100,  81,  82]])

# Pandas
Unlike NumPy, Pandas is built for mixed data types, customized indexing, missing data, data that's not structured consistently, etc.

## Series

#### Creating a Series

In [11]:
import pandas as pd

grades = pd.Series([87,100,94])
grades

0     87
1    100
2     94
dtype: int64

#### Creating a Series with Custom Indices
Dictionary-style declarations

In [13]:
grades = pd.Series([87,100,94],index=['Wally','Eva','Sam'])
print(grades,end='\n\n')

grades = pd.Series({'Wally':87, 'Eva':100, 'Sam':94})
print(grades)

Wally     87
Eva      100
Sam       94
dtype: int64

Wally     87
Eva      100
Sam       94
dtype: int64


#### Indexing Series with Custom Indices

In [251]:
grades['Eva']

100

#### Series of Strings

In [15]:
hardware = pd.Series(['Hammer', 'Saw', 'Wrench'])

# contains function
print(hardware.str.contains('a'),end='\n\n')

# upper function
print(hardware.str.upper())

0     True
1     True
2    False
dtype: bool

0    HAMMER
1       SAW
2    WRENCH
dtype: object


#### Using Series for Descriptive Statistics

In [235]:
grades = pd.Series([87,100,94])

# length of series (count)
grades.count()

3

In [220]:
# mean
grades.mean()

93.66666666666667

In [222]:
# min
grades.min()

87

In [224]:
# max
grades.max()

100

In [226]:
# standard deviation
grades.std()

6.506407098647712

In [228]:
# variance
grades.var()

42.333333333333336

In [230]:
# all of these:
grades.describe()

count      3.000000
mean      93.666667
std        6.506407
min       87.000000
25%       90.500000
50%       94.000000
75%       97.000000
max      100.000000
dtype: float64

## Data Frames
DataFrames hold two-dimensional, tabular data with labeled rows and columns.

#### Creating a DataFrame from a Dictionary

In [280]:
grades_dict = {'Wally':[87,96,70], 'Eva':[100,87,90],
               'Sam':[94,77,90], 'Katie':[100,81,82],
               'Bob':[83,65,85]}

# creating DataFrame
grades = pd.DataFrame(grades_dict)

print(grades)

   Wally  Eva  Sam  Katie  Bob
0     87  100   94    100   83
1     96   87   77     81   65
2     70   90   90     82   85


In [285]:
# customizing indices with index attribute
grades.index = ['Test1','Test2','Test3']

print(grades)

       Wally  Eva  Sam  Katie  Bob
Test1     87  100   94    100   83
Test2     96   87   77     81   65
Test3     70   90   90     82   85


#### Accessing a DataFrame's Columns

In [289]:
print(grades['Eva'],end='\n\n')
print(grades['Sam'])

Test1    100
Test2     87
Test3     90
Name: Eva, dtype: int64

Test1    94
Test2    77
Test3    90
Name: Sam, dtype: int64


#### Selecting Rows with loc & iloc Attributes

In [294]:
# select a row based on index label
print(grades.loc['Test1'],end='\n\n')

# select a row based on an index number
print(grades.iloc[1])

Wally     87
Eva      100
Sam       94
Katie    100
Bob       83
Name: Test1, dtype: int64

Wally    96
Eva      87
Sam      77
Katie    81
Bob      65
Name: Test2, dtype: int64


#### Slicing & Boolean Indexing

In [298]:
# slice rows using loc (label-based slices include the end label)
print(grades.loc['Test1':'Test2'])

# subsets of rows & columns
print(grades.loc['Test1':'Test2', ['Eva','Katie']])

       Wally  Eva  Sam  Katie  Bob
Test1     87  100   94    100   83
Test2     96   87   77     81   65
       Eva  Katie
Test1  100    100
Test2   87     81


In [305]:
# boolean indexing
print(grades[grades >= 90],end='\n\n')
print(grades[(grades >= 80) & (grades < 90)])

       Wally    Eva   Sam  Katie  Bob
Test1    NaN  100.0  94.0  100.0  NaN
Test2   96.0    NaN   NaN    NaN  NaN
Test3    NaN   90.0  90.0    NaN  NaN

       Wally   Eva  Sam  Katie   Bob
Test1   87.0   NaN  NaN    NaN  83.0
Test2    NaN  87.0  NaN   81.0   NaN
Test3    NaN   NaN  NaN   82.0  85.0


In [309]:
# specific row & column using labels 
print(grades.at['Test2','Eva'],end='\n\n')

# specific row & column using integer indices
print(grades.iat[1,2])

87

77


#### Using DataFrames for Descriptive Statistics

In [312]:
grades.describe()

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
count,3.0,3.0,3.0,3.0,3.0
mean,84.333333,92.333333,87.0,87.666667,77.666667
std,13.203535,6.806859,8.888194,10.692677,11.015141
min,70.0,87.0,77.0,81.0,65.0
25%,78.5,88.5,83.5,81.5,74.0
50%,87.0,90.0,90.0,82.0,83.0
75%,91.5,95.0,92.0,91.0,84.0
max,96.0,100.0,94.0,100.0,85.0


#### Transpose of a DataFrame

In [315]:
grades.T

Unnamed: 0,Test1,Test2,Test3
Wally,87,96,70
Eva,100,87,90
Sam,94,77,90
Katie,100,81,82
Bob,83,65,85


In [319]:
# NOTE: grades.T.mean() == grades.mean(axis=1)
# axis = 0 ~ by columns
# axis = 1 ~ by rows

print(grades.T.mean(),end='\n\n')
print(grades.mean(axis=1))

Test1    92.8
Test2    81.2
Test3    83.4
dtype: float64

Test1    92.8
Test2    81.2
Test3    83.4
dtype: float64


#### Sorting Rows & Columns by Indices

In [324]:
# sorting rows by index labels (descending)
print(grades.sort_index(ascending=False),end='\n\n')

# sorting columns by their labels (ascending, alphabetical)
print(grades.sort_index(axis=1))

       Wally  Eva  Sam  Katie  Bob
Test3     70   90   90     82   85
Test2     96   87   77     81   65
Test1     87  100   94    100   83

       Bob  Eva  Katie  Sam  Wally
Test1   83  100    100   94     87
Test2   65   87     81   77     96
Test3   85   90     82   90     70


In [326]:
# sorting columns by the values in row 'Test1' (instead of by label)
grades.sort_values(by='Test1',axis=1,ascending=False)

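# 'by' is required for DataFrame.sort_values; to sort by labels, use sort_index instead
# 'by' names labels on the other axis: column labels when axis=0 (reorders rows), row labels when axis=1 (reorders columns)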

Unnamed: 0,Eva,Katie,Sam,Wally,Bob
Test1,100,100,94,87,83
Test2,87,81,77,96,65
Test3,90,82,90,70,85
