# Preprocessing with NumPy
Preprocessing with NumPy is an essential step in preparing data for analysis, machine learning, or statistical modeling. It involves cleaning, transforming, and organizing data to ensure it's in a format that's ready for further use. NumPy, with its array-based operations, is a powerful tool for performing data preprocessing tasks efficiently.



In [171]:
import numpy as np

## Checking for Missing Values

In [86]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

## If np.loadtxt() runs without raising an error on the first try, the dataset consists of only numeric values and has no missing data. 

In [87]:
np.isnan(lending_co_data_numeric).sum()

## isnan() determines whether data is missing for the individual elements in an array (True -> Missing, False -> Not missing)
## By adding .sum(), we get the total number of missing elements in the data. 

np.int64(0)

In [88]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

In [89]:
np.isnan(lending_co_data_numeric_NAN).sum()

np.int64(260)

In [90]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';',
                                            filling_values = 0)

## Filling_values substitutes every nan with the value we're passing (0 in this case)

In [91]:
np.isnan(lending_co_data_numeric_NAN).sum()

## All the previously missing values are now 0s.

np.int64(0)

In [92]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';') 

# We need to reimport the dataset since all the missing values are filled up. 

In [93]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1
# np.nanmax() calculates the maximum value of the array while ignoring any NaN values.
# This means that if lending_co_data_numeric_NAN contains missing values (NaN), 
# they are not considered when calculating the maximum value. Only the valid numbers are used.

# round(2) This step ensures that the maximum value is rounded to two decimal places, making it easier to handle in further calculations.

# We use nanmax(), since max() returns nan. 
# We want a value greater than the max, since we have to be certain it's unique to the dataset.

In [94]:
temporary_fill

np.float64(64002.0)

In [95]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';',
                                            filling_values = temporary_fill) 

# Filling up all the missing values with the temporary filler. 

In [96]:
np.isnan(lending_co_data_numeric_NAN)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [97]:
np.isnan(lending_co_data_numeric_NAN).sum()

np.int64(0)

## Substituting Missing Values

In [98]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [99]:
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

## Storing the means of every column. 

In [100]:
temporary_mean[0]

np.float64(2250.25)

In [None]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1 # max value

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                            delimiter = ';',
                                            filling_values = temporary_fill)

## Creating a unique filler and using it to take care of all the missing values.

In [102]:
temporary_fill

np.float64(64002.0)

In [None]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2) 
# Calculate the mean of the first column of lending_co_data_numeric_NAN (fillers included),
# rounding the result to two decimal places.



# Supposed mean (w/ fillers)

np.float64(4263.25)

In [104]:
temporary_mean[0]

# Actual mean (w/o fillers)

np.float64(2250.25)

np.where() is a conditional function in NumPy that returns elements chosen from two options:

If the condition is True, it selects the value x.

If the condition is False, it selects the value y.

In [None]:
lending_co_data_numeric_NAN[:,0] = np.where(lending_co_data_numeric_NAN[:,0] == temporary_fill,
                                            temporary_mean[0], 
                                            lending_co_data_numeric_NAN[:,0])

# Going through the first column and substituting any temporary fillers (previously missing) 
# with the mean for that column.

# data replacement operation on the first column of the lending_co_data_numeric_NAN array. 
# Specifically, it replaces values in the first column that match a certain value (temporary_fill) with a new value (temporary_mean[0]). 

# This code checks each element in the first column of lending_co_data_numeric_NAN. If an element matches temporary_fill, it will be replaced with the value temporary_mean[0]. Otherwise, it will remain unchanged.

In [106]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2)

# New mean equals old mean. 

np.float64(2250.25)

In [107]:
# Generalizing the filling from earlier: visit every column and swap each
# temporary filler (previously missing value) for the mean of that column.
for column_index in range(lending_co_data_numeric_NAN.shape[1]):
    column_values = lending_co_data_numeric_NAN[:, column_index]
    is_filler = column_values == temporary_fill
    lending_co_data_numeric_NAN[:, column_index] = np.where(is_filler,
                                                            temporary_mean[column_index],
                                                            column_values)

In [108]:
# We can use the np.where() approach for other applications as well
# (e.g. remove all negative values and set them to 0).
# Applied to the whole array at once; element-wise this matches going through the columns one by one.
lending_co_data_numeric_NAN[:] = np.where(lending_co_data_numeric_NAN < 0,
                                          0,
                                          lending_co_data_numeric_NAN)

## Reshaping
In reshaping, NumPy simply flattens the original array and fills the new shape (6 rows and 1043 columns) in row-major order.

Reshaping an array means changing the shape of the array without modifying its data: the elements keep their row-major order and are simply regrouped into the new rows and columns.
Because the elements are regrouped in the order they appear, a value generally ends up at a different row/column position than before, which is why reshaping (1043, 6) to (6, 1043) is not the same as transposing.
Reshaping can change the number of rows and columns (or more generally, the number of dimensions), but the new shape must hold exactly the same number of elements as the original.


In [109]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

In [110]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [111]:
lending_co_data_numeric.shape

(1043, 6)

In [112]:
np.reshape(lending_co_data_numeric, (6,1043))

# Reshaping (1043,6) to (6,1043) is not the same as transposing.

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [113]:
np.transpose(lending_co_data_numeric)

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

In [None]:
# np.reshape(lending_co_data_numeric, (1,1,2,3,1043))

# We can choose whatever shape we wish as long as the product of the dimensions matches the total number of elements in the array.

array([[[[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
          [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
          [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

         [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
          [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
          [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]]]])

In [115]:
lending_co_data_numeric

# Reshaping doesn't alter the original array. 

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [116]:
lending_co_data_numeric_2 = np.reshape(lending_co_data_numeric, (6,1043))
lending_co_data_numeric_2

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [117]:
lending_co_data_numeric.reshape(6,1043)

# Equivalent method. 

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [118]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

## Removing Values

In [173]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 

In [174]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [175]:
lending_co_data_numeric.shape

(1043, 6)

In [None]:
np.delete(lending_co_data_numeric, 0).shape

# If you do not specify the axis parameter, np.delete() will flatten the array before performing the deletion. This means that the entire array is considered as a 1D array, and the deletion will remove an element from this 1D version of the array.

# As a result, the array is flattened, and the first element (index 0) is removed, which will affect the total number of elements in the array. The total number of elements decreases by one.

# Removes the first value of the flattened array. 

(6257,)

In [122]:
lending_co_data_numeric.size

6258

In [123]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [124]:
np.delete(lending_co_data_numeric, [0,2,4] , axis = 1)

# By setting an axis, we can simultaneously delete entire rows or columns. 

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

In [125]:
np.delete(np.delete(lending_co_data_numeric, [0,2,4] , axis = 1), [0,2,-1] , axis = 0)

# We can simultaneously delete rows AND columns. 

array([[   40.,  3061., 15041.],
       [   40.,  3041., 15321.],
       [   50.,  3470., 13720.],
       ...,
       [   40.,  4240., 16600.],
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.]])

## Sorting Data

In [126]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
np.sort(lending_co_data_numeric).shape # default axis = -1: every row is sorted on its own, so the shape is unchanged

(1043, 6)

In [128]:
lending_co_data_numeric.shape

(1043, 6)

In [None]:
np.sort(lending_co_data_numeric, axis = None) # axis = None: the array is flattened before sorting, so the result is 1-D

array([-2870., -2870., -2550., ..., 54625., 54625., 64001.])

In [None]:
np.set_printoptions(suppress = True)

# Suppresses scientific notation when printing. 
# suppress=True: This option prevents small floating-point numbers (those that are very close to zero) from being printed in scientific notation. Instead, they will be displayed as regular decimal numbers.

In [131]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [132]:
-np.sort(-lending_co_data_numeric)

## Negating the array, sorting it, and negating the result again (two minus signs) sorts each row in descending order

array([[13621.,  4241.,  3121.,  2000.,   365.,    40.],
       [15041.,  4171.,  3061.,  2000.,   365.,    40.],
       [15340.,  3280.,  2160.,  1000.,   365.,    40.],
       ...,
       [16600.,  5001.,  4201.,  2000.,   365.,    40.],
       [15600.,  3320.,  2080.,  1000.,   365.,    40.],
       [16600.,  4601.,  4601.,  2000.,   365.,    40.]])

In [133]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [134]:
lending_co_data_numeric.sort(axis = 0)
lending_co_data_numeric

# The equivalent method sorts the array in place (here along axis 0, i.e. each column separately). 

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

## Argument Functions

### np.argsort()

In [135]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
np.argsort(lending_co_data_numeric)

# np.argsort() is a function in NumPy that returns the indices that would sort an array. In other words, it does not sort the array itself, but instead, it provides the indices that can be used to sort the array.

# The result has the same shape as the input. With the default axis (-1), each row holds the indices that would arrange that row's elements in ascending order.

# Returns the order which will sort the array. 

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]])

In [137]:
np.sort(lending_co_data_numeric, axis = 0)

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [138]:
np.argsort(lending_co_data_numeric, axis = 0)

array([[1041,  816, 1039,   32,   32,  482],
       [  22,  432, 1038,  166,  166,  493],
       [  23,  443, 1037,   85,   85,  166],
       ...,
       [ 978,  152, 1041,  568, 1019,  568],
       [ 193,   27, 1042,  718, 1033,  953],
       [ 202,  408,    0,  912,  912,   27]])

In [139]:
lending_co_data_numeric[482,5]

np.float64(-350.0)

In [140]:
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
lending_co_data_numeric

# Sorts the array based on the values in the 1st column. 

array([[ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 1000.,    50.,   365.,  1400.,  2550., 12700.],
       [ 1000.,    40.,   365.,  2240.,  4560., 14520.],
       ...,
       [ 9000.,   125.,   365., 13001., 17001., 54625.],
       [ 9000.,   125.,   365., 12501., 16001., 52751.],
       [ 9000.,   125.,   365., 12270., 16070., 45745.]])

In [141]:
lending_co_data_numeric.argsort(axis = 0)

# The method doesn't sort in place. 

array([[   0,  216, 1039,    4,    4,  160],
       [   1,  181, 1038,  145,  145,  163],
       [   2,  173, 1037,   67,   67,  145],
       ...,
       [1042, 1040, 1041, 1031, 1028, 1022],
       [1028, 1033, 1042,  890, 1036, 1036],
       [1027, 1025,    0, 1024, 1024, 1025]])

In [142]:
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 1000.,    50.,   365.,  1400.,  2550., 12700.],
       [ 1000.,    40.,   365.,  2240.,  4560., 14520.],
       ...,
       [ 9000.,   125.,   365., 13001., 17001., 54625.],
       [ 9000.,   125.,   365., 12501., 16001., 52751.],
       [ 9000.,   125.,   365., 12270., 16070., 45745.]])

### np.argwhere()

In [143]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [144]:
np.argwhere(lending_co_data_numeric == False)

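# Without a condition, np.argwhere() returns the indices of the values different from 0. Comparing with False (i.e. 0) returns the indices of the zeros instead. 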

array([[116,   4],
       [430,   3]])

In [145]:
lending_co_data_numeric[430]

array([1000.,   50.,  365.,    0.,  550., 5650.])

In [146]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [147]:
np.argwhere(lending_co_data_numeric %2 == 0)

# The condition can be more complex 

array([[   0,    0],
       [   0,    1],
       [   1,    0],
       ...,
       [1042,    0],
       [1042,    1],
       [1042,    5]])

In [148]:
np.isnan(lending_co_data_numeric).sum()

np.int64(0)

In [149]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';') 
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [150]:
np.argwhere(np.isnan(lending_co_data_numeric_NAN))

# Returns the coordinates of all the missing values within the array. 

array([[  11,    3],
       [  15,    3],
       [  27,    3],
       [  58,    3],
       [  60,    4],
       [  85,    4],
       [ 117,    5],
       [ 152,    1],
       [ 152,    2],
       [ 152,    4],
       [ 172,    1],
       [ 175,    1],
       [ 175,    2],
       [ 176,    3],
       [ 177,    4],
       [ 178,    5],
       [ 211,    3],
       [ 229,    0],
       [ 230,    1],
       [ 237,    1],
       [ 247,    3],
       [ 251,    5],
       [ 252,    4],
       [ 258,    1],
       [ 260,    3],
       [ 262,    4],
       [ 271,    5],
       [ 272,    4],
       [ 284,    2],
       [ 284,    3],
       [ 297,    1],
       [ 297,    2],
       [ 300,    3],
       [ 315,    3],
       [ 315,    5],
       [ 327,    4],
       [ 336,    4],
       [ 343,    0],
       [ 344,    2],
       [ 346,    2],
       [ 363,    3],
       [ 375,    3],
       [ 377,    2],
       [ 398,    5],
       [ 416,    4],
       [ 428,    0],
       [ 432,    1],
       [ 433,

In [151]:
lending_co_data_numeric_NAN[175]

array([ 2000.,    nan,    nan,  1851.,  3051., 13561.])

In [152]:
# np.argwhere() yields one (row, column) coordinate pair per missing value.
nan_positions = np.argwhere(np.isnan(lending_co_data_numeric_NAN))
for row_index, column_index in nan_positions:
    lending_co_data_numeric_NAN[row_index, column_index] = 0

## By going through the coordinates of all the missing values of the array, we can fill them up. 

In [153]:
lending_co_data_numeric_NAN[175]

array([ 2000.,     0.,     0.,  1851.,  3051., 13561.])

In [154]:
np.isnan(lending_co_data_numeric_NAN).sum()

np.int64(0)

## Shuffling Data

In [155]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')[:8]
lending_co_data_numeric

# We can directly index the output of the np.loadtxt() function to only take certain parts of the dataset. 

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [156]:
np.random.shuffle(lending_co_data_numeric)

# Shuffles the array (and automatically overwrites it).

In [157]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.]])

In [158]:
np.random.shuffle(lending_co_data_numeric)
lending_co_data_numeric

# We can shuffle the array as many times as we wish (although 1 usually suffices).

array([[ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.]])

In [159]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

# We can now use the entire dataset. 

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [160]:
from numpy.random import shuffle

# We can import functions we use multiple times for convenience. 

In [161]:
shuffle(lending_co_data_numeric)
lending_co_data_numeric

# We write shuffle() instead of numpy.random.shuffle() since we imported the function earlier. 

array([[ 2000.,    50.,   365.,  3750.,  5250., 20250.],
       [ 2000.,    40.,   365.,  3081.,  4281., 16600.],
       [ 1000.,    35.,   365.,  1315.,  1420.,  3240.],
       ...,
       [ 4000.,    50.,   365.,  5730.,  7180., 22250.],
       [ 2000.,    40.,   365.,  3400.,  4600., 16600.],
       [ 4000.,    50.,   365.,  5450.,  6850., 22250.]])

In [162]:
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg

# Random generators can be used for shuffling. 

In [163]:
array_RG = gen(pcg(seed = 365))
array_RG.shuffle(lending_co_data_numeric)
lending_co_data_numeric

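# The generator is seeded, so the shuffle it performs is reproducible for a given input order. The result still varies between runs here because the array was already shuffled, unseeded, in the previous cell.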

array([[ 4000.,    50.,   365.,  8960., 12950., 22250.],
       [ 2000.,    40.,   365.,  3181.,  4271., 13146.],
       [ 2000.,    40.,   365.,  3401.,  4281., 10861.],
       ...,
       [ 2000.,    40.,   365., 10201., 11201., 16600.],
       [ 2000.,    40.,   365.,  3401.,  4601., 16600.],
       [ 1000.,    50.,   365.,   750.,  2100., 12100.]])

## Casting

In [164]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [165]:
lending_co_data_numeric.astype(dtype = np.int32)

# Creates an integer version of the array. 

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

In [166]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = str)

# We need to overwrite the variable in order to work with strings. 

In [167]:
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [168]:
type(lending_co_data_numeric)

numpy.ndarray

In [169]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.float32)
lending_co_data_numeric.astype(dtype = np.int32)

## We can't directly cast strings to integers. We can go through floats (string -> float -> integer).

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

In [170]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
# Cast with the builtin `str`. The `np.str` alias was deprecated in NumPy 1.20 and has
# since been removed, so `dtype = np.str` fails with
# "AttributeError: module 'numpy' has no attribute 'str'". Using `str` does not change behavior.
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = str)
lending_co_data_numeric

# To showcase the other way to go from strings to integers, we need to get the strings version of the array once again. 

In [None]:
# Chain the two casts (-> float -> integer) and keep the result. Without the assignment
# the converted array is discarded and the cell displays the unchanged original.
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.float32).astype(dtype = np.int32)
lending_co_data_numeric

## We can chain methods in NumPy.

## Stripping Data

In [None]:
# Read columns 1, 2 and 4 as text, skipping the header row.
# The builtin `str` is used as dtype because the `np.str` alias no longer exists in current NumPy.
lending_co_total_price = np.genfromtxt("Lending-Company-Total-Price.csv",
                                       delimiter = ',',
                                       dtype = str,
                                       skip_header = 1, 
                                       usecols = [1,2,4])
lending_co_total_price

# We don't need the entire array. We only want a few columns to showcase how stripping data works.

In [None]:
# np.char.strip() is the documented element-wise strip for string arrays; it replaces the
# call to the unbound method of the legacy np.chararray class.
# NOTE: like str.strip(), the second argument is a SET of characters removed from both ends,
# not a literal prefix. That is fine here only as long as the part we keep does not itself
# start or end with one of those characters.
lending_co_total_price[:,0] = np.char.strip(lending_co_total_price[:,0], "id_")
lending_co_total_price[:,1] = np.char.strip(lending_co_total_price[:,1], "Product ")
lending_co_total_price[:,2] = np.char.strip(lending_co_total_price[:,2], "Location ")
lending_co_total_price

# Remove "id_" from the 1st column, as well as "Product " from the second and "Location " from the third one. 

In [None]:
# Replace each product letter with its number, one np.where() substitution per letter
# (applied in the order A -> F).
product_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6}
for product_letter, product_number in product_codes.items():
    lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == product_letter,
                                           product_number,
                                           lending_co_total_price[:,1])

lending_co_total_price

# We can combine stripping with substituting to transform all the letters into numbers. 

In [None]:
lending_co_total_price = lending_co_total_price.astype(dtype = np.int32)
lending_co_total_price

# Even though the values look like numbers, they're actually just text, so we need to cast them once again. 

## Stacking

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

In [None]:
# Recall

# First read: keep the NaNs so that the statistics below ignore the missing entries.
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1

# Second read: every missing entry is replaced by the unique filler on import.
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                            filling_values = temporary_fill,
                                            delimiter = ';')

# Swap each filler for the mean of the column it sits in.
for column_index in range(lending_co_data_numeric_NAN.shape[1]):
    column_values = lending_co_data_numeric_NAN[:, column_index]
    lending_co_data_numeric_NAN[:, column_index] = np.where(column_values == temporary_fill,
                                                            temporary_mean[column_index],
                                                            column_values)
lending_co_data_numeric_NAN


## We create a filler, reimport and fill all the nan-s, then substitute all the temporary fillers with more appropriate values

In [None]:
np.stack((lending_co_data_numeric[:,1],lending_co_data_numeric[:,0]))

# Stacking the first 2 columns. (We can stack them in any order we like)

In [None]:
np.transpose(lending_co_data_numeric[:,:2])

In [None]:
np.stack((lending_co_data_numeric[:,0],lending_co_data_numeric[:,1], lending_co_data_numeric[:,2]), axis = 1)

# We can stack more than 2 arrays. 

In [None]:
lending_co_data_numeric_NAN.shape

In [None]:
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))[0,:,0]

# We can stack 2-D arrays as well. 

In [None]:
np.stack((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = -1)

# We can stack along a given axis (with np.stack())

In [None]:
array_example_1 = np.array([[[1,2,3,4],[5,6,7,8],[9,10,11,12]],[[21,22,23,24],[25,26,27,28],[29,30,31,32]]])
array_example_2 = array_example_1 * 2

# We're quickly creating some 3-D arrays to showcase how dstack works for higher dimensions. 

In [None]:
np.dstack((array_example_1, array_example_2)).shape

In [None]:
np.stack((array_example_1, array_example_2), axis = 2).shape

# We can no longer replicate the output of dstack by simply specifying an axis. 

## Concatenate

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

In [None]:
np.concatenate((lending_co_data_numeric[0,:], lending_co_data_numeric[1,:]))

# The concatenated array has the same number of dimensions as the inputs. 

In [None]:
# Recall: the mean-imputation of missing values.

nan_csv = "Lending-company-Numeric-NAN.csv"

# First pass: load with the NaNs in place so the statistics below can skip them.
lending_co_data_numeric_NAN = np.genfromtxt(nan_csv, delimiter=';')

# Per-column means (rounded to 2 decimals) and a sentinel just above the largest real value.
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis=0).round(2)
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1

# Second pass: reload and let genfromtxt write the sentinel into every missing field.
lending_co_data_numeric_NAN = np.genfromtxt(nan_csv, delimiter=';', filling_values=temporary_fill)

# Swap each sentinel for the mean of its column. temporary_mean holds one value per
# column, so it broadcasts across the rows and no explicit column loop is needed.
lending_co_data_numeric_NAN[...] = np.where(lending_co_data_numeric_NAN == temporary_fill,
                                            temporary_mean,
                                            lending_co_data_numeric_NAN)

lending_co_data_numeric_NAN

In [None]:
# Joining along axis=1 places the two datasets side by side: rows stay, columns add up.
np.shape(np.concatenate([lending_co_data_numeric, lending_co_data_numeric_NAN], axis=1))

In [None]:
array_example_1 = np.array([
    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
    [[21, 22, 23, 24], [25, 26, 27, 28], [29, 30, 31, 32]],
])
array_example_2 = 2 * array_example_1

# Two 3-D arrays of shape (2, 3, 4), the second being the first one doubled.
# We use them to compare concatenating against stacking.

In [None]:
# Join the two 3-D arrays along their existing third axis (axis=2).
np.concatenate([array_example_1, array_example_2], axis=2)

In [None]:
# For inputs that are already 3-D, dstack is a concatenation along the third axis.
np.dstack([array_example_1, array_example_2])

In [None]:
# Append a copy of the first column on the right. Slicing with 0:1 (rather than indexing
# with 0) keeps that column 2-D, which concatenate needs to match the dataset's dimensions.
np.concatenate([lending_co_data_numeric, lending_co_data_numeric[:, 0:1]], axis=1)

## Unique 

In [None]:
# Reload the fully numeric dataset so this section starts from a clean copy.
lending_co_data_numeric = np.loadtxt(fname="Lending-company-Numeric.csv", delimiter=',')
lending_co_data_numeric

In [None]:
np.unique(lending_co_data_numeric[:, 1], return_index=True, return_counts=True)

# np.unique() -> the distinct values of the second column, sorted in increasing order
# return_index -> also returns the position of the first occurrence of each distinct value
# return_counts -> also returns how many times each distinct value appears
# The result is a tuple ordered (values, first indices, counts), whatever order the keywords are written in.

In [None]:
array_example = np.array([
    "a1", "a3", "A1", "A3", "A3", "AA1",
    "B1", "A2", "B1", "A2", "B2", "B2",
    "B3", "a2", "a3", "B3", "B3", "a3",
])
np.unique(array_example)

# For text values, unique sorts character by character using the character codes (ASCII here),
# not dictionary order: every uppercase letter sorts before every lowercase one, and "A3" comes before "AA1".