# **Exploratory Data Analysis (EDA)**
*   Load a dataset
*   statistical summaries
*   data visualization






# Python Libraries for EDA

* numpy
* pandas
* matplotlib





# numpy
NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays

# Array
A numpy array is a grid of values, all of the same type, and is indexed by a tuple of nonnegative integers. The number of dimensions is the rank of the array; the shape of an array is a tuple of integers giving the size of the array along each dimension.

In Numpy dimensions are called axes. The number of axes is rank. For example, the coordinates of a point in 3D space [1, 2, 1] is an array of rank 1, because it has one axis.

In [2]:
import numpy as np

a = np.array([1, 2, 3])   # Create a 1-D (rank 1) array from a list
print(type(a))            # Prints "<class 'numpy.ndarray'>"
print(a)                  # Prints array a
print(a.shape)            # Prints "(3,)"  i.e. 1 dimenion with 3 elements
print(a[0], a[1], a[2])   # Prints "1 2 3"
a[0] = 5                  # Change an element of the array
print(a)                  # Prints "[5, 2, 3]"

bmat = np.array([[1,2,3],[4,5,6]])    # Create a 2D (rank 2) array
#bmat - build a matrix object from a string or nested array
print(bmat.shape)                     # Prints "(2, 3)" 2 dimensions 
print(bmat[0, 0], bmat[0, 1], bmat[1, 0])   # Prints "1 2 4"

<class 'numpy.ndarray'>
[1 2 3]
(3,)
1 2 3
[5 2 3]
(2, 3)
1 2 4


In [2]:
#create a 3D array of shape (3,2,2) i.e 3 2 x 2 matrices
cmat = np.array([[1,2,3],[4,5,6],[7,8,9]])

In [4]:
print(a.dtype)
#dtype - data type objects - decribes how the bytes in the fixed-size block of memory
#corresponding to an array item should be interpreted

int64


In [5]:
R=np.array([1.2,3.5])
print(R.dtype)

float64


In [6]:
#creating float array
Y=np.array([1.4, 4.5],dtype=np.float64)
print(type(Y))
print(Y.dtype)
Y1=np.array([1.4, 4.5],dtype=float)
print(type(Y1))
print(Y1.dtype)

<class 'numpy.ndarray'>
float64
<class 'numpy.ndarray'>
float64


Extracting specific elements from the array

In [18]:
Z=np.array([[1,2,3,11],[4,5,6,23],[7,8,9,56]])
print(Z)
print(Z[0][1])
print(Z[0,:]) # prints first row
#write code to print first column 
print(Z[:,0])
#write code to print submtarix first 2 rows and first 2 columns i.e [[1,2],[4,5]]
print(Z[:2,:2])


[[ 1  2  3 11]
 [ 4  5  6 23]
 [ 7  8  9 56]]
2
[ 1  2  3 11]
[1 4 7]
[[1 2]
 [4 5]]


In [25]:
#print all non-zero elments from matrix H
H=np.array([[1,0,3,0],[0,4,0,6],[0,7,8,9]])
print(H[np.nonzero(H)])

[1 3 4 6 7 8 9]


Numpy also provides many functions to create arrays:

# Array indexing
Numpy offers several ways to index into arrays.

In [28]:
# Create the following 2-D array with shape (3, 4)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]]
a = np.array([[ 1 , 2 , 3 , 4], [ 5 , 6 , 7 , 8],[ 9 ,10 ,11, 12]])

#take slice of 0 to 2 from array a 
b =   a[0:2] #0 to 2
print(b)
# A slice of an array is a view into the same data, so modifying it will modify the original array.
print(a[0, 1])   # Prints "2" 
b[0, 0] = 77     # b[0, 0] is the same piece of data as a[0, 1]
print(a[0, 1])   # Prints "77"
print(a)

[[1 2 3 4]
 [5 6 7 8]]
2
2
[[77  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


In [30]:
# Create a new array from which we will select elements
a = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])

print(a)  # prints "array([[ 1,  2,  3],
          #                [ 4,  5,  6],
          #                [ 7,  8,  9],
          #                [10, 11, 12]])"

# Create an array of indices
b = np.array([0, 2, 0, 1])

# Select one element from each row of a using the indices in b , hint : use arange
# this is known as 'fancy indexing'
print(a[np.arange(0,4,1),b])  # Prints "[ 1  6  7 11]"



[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
[ 1  6  7 11]


In [31]:
a = np.array([[1,2], [3, 4], [5, 6]])

bool_idx = (a>2)     #write your code here to Find the elements of a that are bigger than 2;
                     # this returns a numpy array of Booleans of the same
                     # shape as a, where each slot of bool_idx tells
                     # whether that element of a is > 2.

print(bool_idx)      # Prints "[[False False]
                     #          [ True  True]
                     #          [ True  True]]"

# We use boolean array indexing to construct a rank 1 array
# consisting of the elements of a corresponding to the True values
# of bool_idx
print(a[bool_idx])  # Prints "[3 4 5 6]"

# We can do all of the above in a single concise statement:
print(a[a > 2])     # Prints "[3 4 5 6]"

[[False False]
 [ True  True]
 [ True  True]]
[3 4 5 6]
[3 4 5 6]


# Array Math

In [32]:
X=np.array([1,2,3,4])
X=X+2
print(X)

[3 4 5 6]


In [37]:
x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

# Elementwise sum; both produce the array
# [[ 6.0  8.0]
#  [10.0 12.0]]

#write code here 
print(x+y)
#or
print(np.add(x,y))

# Elementwise difference; both produce the array
# [[-4.0 -4.0]
#  [-4.0 -4.0]]

#write code here 
print(x-y)
#or
print(np.subtract(x,y))

# Elementwise product; both produce the array
# [[ 5.0 12.0]
#  [21.0 32.0]]

#write code here 
print(x*y)
#or
print(np.multiply(x,y))

# Elementwise square root; produces the array
# [[ 1.          1.41421356]
#  [ 1.73205081  2.        ]]
#write your code here
print(np.sqrt(x))
print(np.sqrt(y))

[[ 6.  8.]
 [10. 12.]]
[[ 6.  8.]
 [10. 12.]]
[[-4. -4.]
 [-4. -4.]]
[[-4. -4.]
 [-4. -4.]]
[[ 5. 12.]
 [21. 32.]]
[[ 5. 12.]
 [21. 32.]]
[[1.         1.41421356]
 [1.73205081 2.        ]]
[[2.23606798 2.44948974]
 [2.64575131 2.82842712]]


 How to represent missing values and infinite?

 Missing values can be represented using np.nan object, while np.inf represents infinite. Let’s place some in arr2d.

In [38]:
list2 = [[1, 2, 3, 4],[3, 4, 5, 6], [5, 6, 7, 8]]
arr2 = np.array(list2, dtype='float')
# Insert a nan and an inf
arr2[1,1] = np.nan  # not a number
arr2[1,2] = np.inf  # infinite
arr2

array([[ 1.,  2.,  3.,  4.],
       [ 3., nan, inf,  6.],
       [ 5.,  6.,  7.,  8.]])

In [39]:
# Replace nan and inf with -1. Don't use arr2 == np.nan
missing_bool = np.isnan(arr2) | np.isinf(arr2)
arr2[missing_bool] = -1  
arr2

array([[ 1.,  2.,  3.,  4.],
       [ 3., -1., -1.,  6.],
       [ 5.,  6.,  7.,  8.]])

# Compute mean, min, max on the ndarray?

In [45]:
a = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
print("Mean value is: ", np.mean(a)) #write code
print("Max value is: ", np.amax(a)) #write code
print("Min value is: ", np.amin(a)) #write code
# Row wise and column wise min
print("Column wise minimum: ", np.amin(a, axis=0)) #write code #use np.amin
#axis =0 is column wise
print("Row wise minimum: ", np.amin(a,axis=1)) #write code
#axis = 1 is row wise

Mean value is:  6.5
Max value is:  12
Min value is:  1
Column wise minimum:  [1 2 3]
Row wise minimum:  [ 1  4  7 10]


# Copy array

In [46]:
# Assign portion of arr2 to arr2a. Doesn't really create a new array.
arr2a = arr2[:2,:2]  
arr2a[:1, :1] = 100  # 100 will reflect in arr2
arr2

array([[100.,   2.,   3.,   4.],
       [  3.,  -1.,  -1.,   6.],
       [  5.,   6.,   7.,   8.]])

In [48]:
# Copy portion of arr2 to arr2b
arr2b = arr2[:2, :2].copy()
arr2b[:1, :1] = 101  # 101 will not reflect in arr2
arr2

array([[100.,   2.,   3.,   4.],
       [  3.,  -1.,  -1.,   6.],
       [  5.,   6.,   7.,   8.]])

# Reshaping and Flattening Multidimensional arrays

Reshaping is changing the arrangement of items so that shape of the array changes while maintaining the same number of dimensions.

Flattening, however, will convert a multi-dimensional array to a flat 1d array. And not any other shape.

In [49]:
# Reshape a 3x4 array to 4x3 array
arr2.reshape(4, 3)

array([[100.,   2.,   3.],
       [  4.,   3.,  -1.],
       [ -1.,   6.,   5.],
       [  6.,   7.,   8.]])

In [50]:
# Flatten it to a 1d array
arr2.flatten()

array([100.,   2.,   3.,   4.,   3.,  -1.,  -1.,   6.,   5.,   6.,   7.,
         8.])

# Sequences, Random numbers
The np.arange function comes handy to create customised number sequences as ndarray.

In [51]:
# Lower limit is 0 be default
print(np.arange(5))  

# 0 to 9
print(np.arange(0, 10))  

# 0 to 9 with step of 2
print(np.arange(0, 10, 2))  

# 10 to 1, decreasing order
print(np.arange(10, 0, -1))

[0 1 2 3 4]
[0 1 2 3 4 5 6 7 8 9]
[0 2 4 6 8]
[10  9  8  7  6  5  4  3  2  1]


In [52]:
# One random number between [0,1)
print(np.random.random())

# Random numbers between [0,1) of shape 2,2
print(np.random.random(size=[2,2]))

0.2833188643813158
[[0.449605   0.95255109]
 [0.35538784 0.63808754]]


# Get the unique items and the counts

In [53]:
# Create random integers of size 10 between [0,10)
np.random.seed(100)
arr_rand = np.random.randint(0, 10, size=10)
print(arr_rand)
# Get the unique items and their counts
uniqs, counts = np.unique(arr_rand, return_counts=True)
print("Unique items : ", uniqs)
print("Counts       : ", counts)

[8 8 3 7 7 0 4 2 5 2]
Unique items :  [0 2 3 4 5 7 8]
Counts       :  [1 2 1 1 1 2 2]


# Exercises

1. Create two 3 x 4 arrays A and B; output an array in which every element is an element-wise sum of the arrays A and B

In [54]:
#code for Ex1
a = np.array([[1, 2, 3, 4],[3, 4, 5, 6], [5, 6, 7, 8]])
b = np.array([[5, 6, 7, 8], [1, 2, 3, 4],[3, 4, 5, 6]])
print(a+b)

[[ 6  8 10 12]
 [ 4  6  8 10]
 [ 8 10 12 14]]


2. Create 4 x 4 identity matrix (use eye funciton)


In [55]:
#code for Ex2
np.eye(4, dtype=int)

array([[1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1]])

3. create a 1D array whose values are from sequence 1 to 27. Convert this array into 3D array i.e. 3 x 3 x 3

In [59]:
#code for Ex3
a = np.array(np.arange(1, 28, 1))
a.reshape(3,3,3)

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]],

       [[19, 20, 21],
        [22, 23, 24],
        [25, 26, 27]]])

4. create a 3 x 3 float array Z and concert Convert all the elements of Z from float to integer datatype

In [64]:
#code for Ex4
Z = np.array([[1,2,3],[4,5,6], [7, 8, 9]], dtype = 'float')
print(Z.astype('int'))


[[1 2 3]
 [4 5 6]
 [7 8 9]]


5. a1 = np.array([[1,2,3],
               [4,5,6]])

a2 = np.array([[7,8,9],
               [10,11,12]])

               stack a1 and a2 horizontally i.e. output array should be as follows
               [[ 1  2  3  7  8  9]
               [ 4  5  6 10 11 12]]

In [7]:
#code for Ex5
a1 = np.array([[1,2,3],[4,5,6]])
a2 = np.array([[7,8,9], [10,11,12]])
np.hstack((a1,a2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

6. stack a1 and a2 vertically 
[[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 7  8]
 [ 9 10]
 [10 11]]

In [12]:
#code for Ex6
np.vstack((a1,a2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

7. Create a numpy array for the even number sequence from 1 to 100

In [13]:
a1 = np.arange(2,101,2)
a1

array([  2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24,  26,
        28,  30,  32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,
        54,  56,  58,  60,  62,  64,  66,  68,  70,  72,  74,  76,  78,
        80,  82,  84,  86,  88,  90,  92,  94,  96,  98, 100])

8. Find the indexes in which the elements in the 2 arrays A1 and A2 match
A1 = np.array([10,2,3,4,5])

A2 = np.array([10,3,2,4,5])

In [14]:
A1 = np.array([10,2,3,4,5])
A2 = np.array([10,3,2,4,5])
print(np.intersect1d(A1,A2))

[ 2  3  4  5 10]


9. create a 2-by-3 matrix with each and every value equal to 5

In [23]:
a3 = np.ones((2,3), dtype=int)
a3[a3>0] = 5
a3

array([[5, 5, 5],
       [5, 5, 5]])

Output a 5-by-5 array of random integers between 0 (inclusive) and 10 (exclusive); Also print its transpose

In [25]:
np.random.randint(10, size=(5,5))

array([[8, 3, 6, 4, 1],
       [7, 5, 8, 7, 9],
       [4, 6, 4, 3, 2],
       [0, 3, 0, 4, 2],
       [2, 1, 3, 9, 4]])

Output a 3-by-3 array of random numbers following normal distribution

In [27]:
np.random.randn(3,3)

array([[ 1.2990403 ,  0.7715135 , -1.18700279],
       [ 0.76644765, -0.829498  , -0.34290164],
       [-0.76942343,  1.10770723,  0.54796794]])