# Lesson 07 - Numpy Part 2

# Objective

- Dive deeper into the NumPy package

# Aggregations

- These are operations like sum, mean, max, etc...
- Provide a look at the array and get basic statistics


In [None]:
import numpy as np
# Seed the generator so the same 4x5 matrix is produced on every run.
np.random.seed(1)
a = np.random.randint(low=-100, high=100, size=(4, 5))  # upper bound is exclusive
print(a)

# Without an axis argument each aggregation collapses the whole array to one number.
smallest, largest = a.min(), a.max()
print(f"Minimum: {smallest}, maximum: {largest}")
total = a.sum()
print(f"Sum: {total}")
average, spread = a.mean(), a.std()
print(f"Mean: {average}, standard deviation: {spread}")

# Column/Row Aggregations

- All depends on the `axis`

<img src="./img/aggregation.svg"/>

In [None]:
# Fixed seed -> reproducible 3x4 matrix of digits 0..9.
np.random.seed(9)
b = np.random.randint(low=0, high=10, size=(3, 4))
print(b)
# axis=0 collapses the rows, leaving one total per column (4 values);
# axis=1 collapses the columns, leaving one total per row (3 values).
print("Column sums:", np.sum(b, axis=0))
print("Row sums:", np.sum(b, axis=1))

# Boolean Operations

- Like integer or string comparison
- Compare using a single value or other arrays

In [None]:
import numpy as np
a = np.arange(3)
print(a)
b = np.linspace(start=-1, stop=2, num=3)
print(b)
c = a > b
c

In [None]:
# `c` is the boolean array produced by the comparison in the previous cell.
print(np.all(c))   # True only when every comparison came out True
print(np.any(c))   # True when at least one comparison came out True

# Boolean Operations

- A `True` value maps to 1
- A `False` value maps to 0

#### Count the number of True/False Values

In [None]:
# Summing a boolean array counts its True entries (True counts as 1, False as 0);
# whatever remains of the total element count must be False.
true_count = c.sum()
false_count = c.size - true_count
print(f"Number of True values: {true_count}")
print(f"Number of False values: {false_count}")

## Real World Data

- Load in some weather data

In [None]:
import pandas as pd
# Adjacent string literals inside parentheses are joined automatically,
# so no backslash continuation is needed.
weather_url = ("https://www.cs.helsinki.fi/u/jttoivon/dap/"
               "data/fmi/kumpula-weather-2017.csv")
weather = pd.read_csv(weather_url)
# Keep only the air-temperature column, as a plain NumPy array.
temps = weather['Air temperature (degC)'].values
temps[:10]

In [None]:
# Each comparison yields a boolean mask over `temps`; summing a mask counts
# the entries that satisfy the condition. `&` combines two masks element-wise.
below_zero = temps < 0
between_zero_and_ten = (temps > 0) & (temps < 10)
print("Number of days with the temperature below zero:", np.sum(below_zero))
print("Number of days with the temperature is greater than zero and less than ten:", np.sum(between_zero_and_ten))

# Sorting Data

- `np.sort(arr)` returns a sorted copy and leaves the original array unchanged
- `arr.sort()` (the `ndarray.sort()` method) sorts the array in place

In [None]:
# 4x4 matrix of random integers drawn from 0..9 (the upper bound 10 is exclusive).
rando = np.random.randint(low=0, high=10, size=(4, 4))
rando


In [None]:
np.sort(rando, axis=0) # Column sort: orders the values along axis 0, i.e. within each column; returns a new array

In [None]:
np.sort(rando) # Row sort: with no axis given, np.sort orders along the last axis, i.e. within each row

# `argsort` a Variant of Sort

- returns the indices that would sort the data, instead of the sorted values themselves

In [None]:
# For each row of `rando`, the column indices that would put that row in
# ascending order (argsort works along the last axis by default).
sorted_idx = rando.argsort()
sorted_idx

# Iris dataset

- The iris dataset contains the following data
    + 50 samples of 3 different species of iris (150 samples total)
- Measurements: sepal length, sepal width, petal length, petal width
- The format for the data: (sepal length, sepal width, petal length, petal width)

In [None]:
from sklearn import datasets
# Load the iris dataset object provided by scikit-learn.
iris = datasets.load_iris()
# `data` holds the measurements; per the notes above, each row is one sample
# and the columns are (sepal length, sepal width, petal length, petal width).
X = iris.data
print(X[:4]) # First Four Rows

In [None]:
iris.feature_names  # names of the measurement columns, in the same order as the columns of X

# Pearson Correlation Coefficient

- measures the linear relationship between two datasets.  
- the assumption is that each dataset is normally distributed

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded on every SciPy version.
import scipy.stats

# Correlate two measurements across all samples: column 0 (sepal length)
# against column 2 (petal length). Indexing with X[0] and X[2] would instead
# pick two individual flowers (rows of four values each), which is not a
# comparison between two measurement series.
# The result holds the correlation coefficient and its p-value.
scipy.stats.pearsonr(X[:, 0], X[:, 2])

In [None]:
# Correlate column 0 (sepal length) with column 2 (petal length) over all
# samples. X[0] and X[2] would select two single flowers (rows) rather than
# two measurement series. The result is a 2x2 correlation matrix whose
# off-diagonal entries are the Pearson coefficient between the two columns.
np.corrcoef(X[:, 0], X[:, 2])

# Image Processing with Numpy

- An image is a collection of pixels
- A grayscale image can be represented as a two-dimensional array, whose first axis corresponds to the y coordinate of the image (the row, counted from the top) and whose second axis corresponds to the x coordinate (the column, counted from the left). At each (row, column) position the array contains a value, which is typically a float between 0.0 and 1.0, or an integer between 0 and 255. This specifies the level of grayness. For example, if the array contains value 255 at position (0,0), then in the image the pixel at top left is white.

- In color images there is a third axis for the red, green, and blue components of each pixel. For each of these color components there is a value between 0.0 and 1.0 (or between 0 and 255). The combinations of different values for the three components red, green, and blue can result in at least 16.7 million colors.

In [None]:
import matplotlib.pyplot as plt
# Read the PNG from disk into a NumPy array.
painting = plt.imread("./img/painting.png")
print(painting.shape)
# The first two axes form the pixel grid, so their product is the pixel count.
n_rows, n_cols = painting.shape[:2]
print(f"The image consists of {n_rows * n_cols} pixels")
plt.imshow(painting)

# Image Processing

- Flip the image like reversing a list

In [None]:
# Keep every row (`:`) but walk the second axis backwards (`::-1`), which
# reverses the column order and so mirrors the image left-to-right.
plt.imshow(painting[:,::-1])

# Image Processing

- remap pixels to white

In [None]:
painting2 = painting.copy()    # work on a copy so the original painting stays untouched
painting2[0:30, :, :] = 1.0    # first 30 rows, every column, every channel -> 1.0 (full intensity)
plt.imshow(painting2)

# Image Processing

- Since the image is just a collection of numeric values you can do all kinds of things

In [None]:
p3 = painting.copy()
height, width = p3.shape[:2]
# Shade the image: a ramp from 0 at the left edge to 1 at the right edge,
# shaped (1, width, 1) so it broadcasts across every row and every channel.
m = np.linspace(0, 1, width)[np.newaxis, :, np.newaxis]
result = p3 * m
plt.imshow(result)

In [None]:
# Collapse the colour axis with a weighted average to get one grey level per
# pixel. Three weights are given, so axis 2 of `p3` must have exactly three entries.
luma_weights = [0.299, 0.587, 0.114]
img_gray = np.average(p3, axis=2, weights=luma_weights)
plt.imshow(img_gray, cmap='gray')

# Image Processing Example - Finding Points

#### 1. Create some Random Image with Points

In [None]:
n = 5
l = 256
im = np.zeros((l, l))  # blank l x l canvas
np.random.seed(0)
# Draw n*n random positions: row 0 of `points` holds the first-axis indices,
# row 1 the second-axis indices. Positions may repeat, so fewer than n*n
# distinct pixels can end up set.
points = np.random.randint(0, l, (2, n**2))
first_axis, second_axis = points
im[first_axis, second_axis] = 1
plt.imshow(im)

In [None]:
from scipy import ndimage
# The blur width grows with the image side `l` and shrinks as `n` grows.
blur_sigma = l / (8.0 * n)
im2 = ndimage.gaussian_filter(im, sigma=blur_sigma)  # blur the image a bit
plt.imshow(im2)

#### 2. Find the Clusters

In [None]:
# Keep only the pixels whose intensity is above the image average.
threshold = im2.mean()
mask = im2 > threshold
# Connected regions of True pixels form the clusters: each gets its own
# integer label in `label_im`, and `nb_labels` is how many were found.
label_im, nb_labels = ndimage.label(mask)
print(f"Number of clusters is {nb_labels}")
plt.imshow(label_im)