# Preprocessing with NumPy

In [29]:
import numpy as np
np.set_printoptions(suppress = True)

## Checking for Missing Values
- Missing values can be checked with the help of loadtxt function.
- They can also be checked with np.isnan() function. It returns true for NaN values if there and we can sum that array up.

In [30]:
# This load function works on the files with only numeric values, by default, so if there are missing values we will come 
# to know by the error that will be produced.
lending_co_numeric = np.loadtxt("./Data/Lending-company-Numeric.csv",delimiter = ',')
lending_co_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [31]:
# lending_co_numeric_NaN = np.loadtxt("./Data/Lending-company-Numeric-NAN.csv",delimiter = ';') # Will return an error
lending_co_numeric_NaN = np.genfromtxt("./Data/Lending-company-Numeric-NAN.csv",delimiter = ';')
lending_co_numeric_NaN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [32]:
# np.isnan() method returns an array of True or bool(1) for all the missing values. So if we add them and there are missing 
# values the sum will always be greater than 0, else the sum will be 0.
print(f"Sum of NaN values is {np.isnan(lending_co_numeric).sum()}")
print(f"Sum of NaN values is {np.isnan(lending_co_numeric_NaN).sum()}")
# In the NaN file there are 260 missing values.

Sum of NaN values is 0
Sum of NaN values is 260


# Filling of Missing Values.
- We fill the missing values with the help up of fill_values parameter of the importing function of genfromtxt.
- This will fill all the missing values with a single value, but this function works on all column at once, not on the individual columns, we need to workaround with usecols.
- We fill with a value that is not present in the dataset or that doesn't alter the distribution much. Filling a 0 is not much of a sense so we will fill it with a value greater than the max of the dataset.
- However filling the dataset with a value greater than the max for all missing values, does alter the mean of the individual columns and the filler value may act as an outlier for the column so we use another filler values for each individual columns and thus we use the np.where() function.

In [88]:
lending_co_numeric_without_fill = np.genfromtxt("./Data/Lending-company-Numeric-NAN.csv",
                                        delimiter = ';')
max_fill = np.nanmax(lending_co_numeric_fill)
print(max_fill)
lending_co_numeric_without_fill

64001.0


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [89]:
# Mean for column before filler values.
temp_mean = np.nanmean(lending_co_numeric_fill[:,0])
print(temp_mean)

2250.2477700693757


## Substituting Missing Values

In [394]:
# Mean after filling the max value of the dataset.
lending_co_numeric_fill = np.genfromtxt("./Data/Lending-company-Numeric-NAN.csv",
                                        delimiter = ';',
                                        filling_values=max_fill+1)
# Mean for column before filler values.
temp_mean = np.nanmean(lending_co_numeric_fill[:,0])
print(temp_mean) 
# Thus the filler value changes the  mean totally, so we need to put a value as mean of that particular column.

4263.28092042186


In [395]:
# Using the dataset with max_fill as nan values so that it can be used in the np.where() for applying condition.

# Creating temperory mean for every column 
temp_mean = np.nanmean(lending_co_numeric_without_fill,axis = 0).round(2)

# Substituting the NaN values in a particular column with the mean of the column values itself.
for i in range((lending_co_numeric_fill.shape[1])):
    lending_co_numeric_fill[:,i] = np.where(lending_co_numeric_fill[:,i]==(max_fill+1),
                                            temp_mean[i],
                                            lending_co_numeric_fill[:,i])
    
#Checking the mean after filling each collumns own mean.
np.mean(lending_co_numeric_fill[:,0]) # It comes inline with mean of the column, thus nature of data remains same.
# We can also fill the values with mode, median etc.

2250.2478427612655

In [396]:
lending_co_numeric_fill[(lending_co_numeric_fill[:,0])==np.nan]

array([], shape=(0, 6), dtype=float64)

## Reshaping
- This is a quite useful feature to be done in data manipulation, because certain functions or methods only work on specific size of arrays that requires reshaping an array.
- In reshaping remember that the product of the dimensions need to be conserved always.
- Remember the reshape function doesn't change the original array instead it returns a new object with the made changes.

In [100]:
lending_co_numeric = np.loadtxt('./Data/Lending-company-Numeric.csv',delimiter=',')
lending_co_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [101]:
# Knowing the shape of the original array.
lending_co_numeric.shape

(1043, 6)

In [103]:
# Changing the shape with reshape function.
np.reshape(lending_co_numeric,(6,1043))
# Doing this doesn't transpose the matrix, just it takes first 1043 values and puts it in first row, and continues this 
# operation for the next consecutive 1043 elements.

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [106]:
# Transpose is used to get the transpose of the matrix.
np.transpose(lending_co_numeric)
print(np.transpose(lending_co_numeric).shape)

(6, 1043)


In [108]:
np.reshape(lending_co_numeric,(3,2,1043)) # Creating a 3D array.

array([[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
        [12277.,  2000.,    40., ...,    50.,   365.,  5350.]],

       [[ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
        [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.]],

       [[  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
        [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]])

In [110]:
np.reshape(lending_co_numeric,(1,1,1,1,3,2,1043))
# We can create any dimensional array with reshape, and sometimes we can increase dimension with adding extra ones.
# We can know dimension by the no. of brackets at the start and end of bracket.

array([[[[[[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
            [12277.,  2000.,    40., ...,    50.,   365.,  5350.]],

           [[ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
            [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.]],

           [[  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
            [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]]]]]])

## Removing Values
- For removing the columns or the rows we will use the np.delete() function.
- Using np.delete() function removes the values after they are loaded.

In [112]:
# Loading the database.
lending_co_numeric = np.loadtxt("./Data/Lending-company-Numeric.csv",delimiter = ',')
lending_co_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [114]:
np.delete(lending_co_numeric, 0, axis = None)
# When the axis is none the array is treated as the flat array and entrys with the specified index are deleted.

array([   40.,   365.,  3121., ...,  4601.,  4601., 16600.])

In [117]:
np.delete(lending_co_numeric,0,axis = 1)
# With this we will be deleting the 1st column of the array.

array([[   40.,   365.,  3121.,  4241., 13621.],
       [   40.,   365.,  3061.,  4171., 15041.],
       [   40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  4201.,  5001., 16600.],
       [   40.,   365.,  2080.,  3320., 15600.],
       [   40.,   365.,  4601.,  4601., 16600.]])

In [122]:
np.delete(np.delete(lending_co_numeric,(0,2,5),axis = 1),(0,-1),axis = 0)
# This will delete the given 1,3,6 columns first and then delete the rows from the edited matrix.

array([[  40., 3061., 4171.],
       [  40., 2160., 3280.],
       [  40., 3041., 4241.],
       ...,
       [  40., 4240., 5440.],
       [  40., 4201., 5001.],
       [  40., 2080., 3320.]])

## Sorting Data
- With the np.sort() function we can sort a given array. 
- The result changes with the axis that are provided as the input.
- By default the sorting occurs in the ascending manner and to get descending order there is a workaround without a parameter.

In [139]:
array_a = np.array([[1,2,3,4],[0,1,0,1],[0,1,2,1]])
array_a

array([[1, 2, 3, 4],
       [0, 1, 0, 1],
       [0, 1, 2, 1]])

In [140]:
np.sort(array_a) #By default the axis is -1, which is equivalent to slicing -1. This will choose the last dimension for 
# sorting which is of column of the array. So the data is sorted in each individual row.
np.sort(array_a).shape

(3, 4)

In [141]:
np.sort(array_a, axis = 0)
array_a.shape # Sorting doesn't change the shape of the array.

(3, 4)

In [142]:
np.sort(array_a.flat)

array([0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4])

In [143]:
-np.sort(-array_a,axis = 1) # This sorts the data in descending or the decreasing order.

array([[4, 3, 2, 1],
       [1, 1, 0, 0],
       [2, 1, 1, 0]])

In [144]:
-np.sort(-array_a,axis = 0) # This sorts the array row wise in individual columns.

array([[1, 2, 3, 4],
       [0, 1, 2, 1],
       [0, 1, 0, 1]])

In [145]:
array_a #Above functions doesn't do the sorting inplace.

array([[1, 2, 3, 4],
       [0, 1, 0, 1],
       [0, 1, 2, 1]])

In [147]:
array_a.sort(axis = 1) #This function does the sorting in place adn changes the dataset.
array_a

array([[1, 2, 3, 4],
       [0, 0, 1, 1],
       [0, 1, 1, 2]])

In [137]:
# Similar bits can be worked out with the lending_co_numeric array.

## Argument Functions
- np.argsort() is similar to the sort function, but instead of showing the sorted values in output it shows the sorted index of the values in the output.

### np.argsort()

In [176]:
array_a = np.array([[5,7,6,9],[0,1,0,5],[0,3,6,1]])
array_a

array([[5, 7, 6, 9],
       [0, 1, 0, 5],
       [0, 3, 6, 1]])

In [177]:
np.argsort(array_a,axis = 1)
# For first row, element with 2th index in original array i.e. 6 will come to 1th index in sorted array, likewise element
# with 1th index in original array i.e. 7 will come to 2th index in the sorted array.
# Similar thing will happen for all the array elements per row.

array([[0, 2, 1, 3],
       [0, 2, 1, 3],
       [0, 3, 1, 2]], dtype=int64)

In [181]:
# This will sort the whole array by particular columns by using conditional slicing.
# Its like slicing one column and then making changes on the rest of columns as well.
# We can only use the argsort not sort function cause argsort gives indices, while the sort does not.
array_b = array_a[np.argsort(array_a[:,3],axis = 0)]

In [182]:
array_b

array([[0, 3, 6, 1],
       [0, 1, 0, 5],
       [5, 7, 6, 9]])

In [183]:
array_a

array([[5, 7, 6, 9],
       [0, 1, 0, 5],
       [0, 3, 6, 1]])

In [187]:
array_a.argsort() # This doesn't do the inplace sorting as the ndarray.sort() does..

array([[0, 2, 1, 3],
       [0, 2, 1, 3],
       [0, 3, 1, 2]], dtype=int64)

In [185]:
array_a

array([[5, 7, 6, 9],
       [0, 1, 0, 5],
       [0, 3, 6, 1]])

### np.argwhere()
- This is also a function which returns an array of indices of the elements of the array.
- The default condition in the block is checking whether the element is 0 or not. True is return when the element is non-zero.
- We can also specify different condition as we like.
- We can change the missing values in this way for an ndarray.

In [189]:
lending_co_numeric_NaN = np.genfromtxt('./Data/Lending-company-Numeric-NAN.csv',delimiter = ';')
lending_co_numeric_NaN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [191]:
np.isnan(lending_co_numeric_NaN).sum() # There are total 260 missing values in the array.

260

In [192]:
np.argwhere(lending_co_numeric_NaN) # Since we have a 2D array we get those positions where we have non-Zero values.
# There is no axis function here and this is the way how it works.

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]], dtype=int64)

In [193]:
np.where(lending_co_numeric_NaN) #This where function also returns indices where the values are non-zero just the row
# index are given in the first array while the column index are given in the second array.

(array([   0,    0,    0, ..., 1042, 1042, 1042], dtype=int64),
 array([0, 1, 2, ..., 3, 4, 5], dtype=int64))

In [194]:
array_a

array([[5, 7, 6, 9],
       [0, 1, 0, 5],
       [0, 3, 6, 1]])

In [195]:
np.where(array_a)

(array([0, 0, 0, 0, 1, 1, 2, 2, 2], dtype=int64),
 array([0, 1, 2, 3, 1, 3, 1, 2, 3], dtype=int64))

In [196]:
# We can change the condition to obtain the positions in 2D array where the values are 0
np.argwhere(array_a == False)

array([[1, 0],
       [1, 2],
       [2, 0]], dtype=int64)

In [199]:
np.argwhere(array_a == False) # In argwhere we cant' give the value to be given in the position, like we can give in where.

array([[1, 0],
       [1, 2],
       [2, 0]], dtype=int64)

In [205]:
# To alter the value at those indices...we use a for loop.
for i in np.argwhere(array_a == False):
    array_a[i[0],i[1]] = 23

In [206]:
array_a # So we have replaced the 0 values from our array.

array([[ 5,  7,  6,  9],
       [23,  1, 23,  5],
       [23,  3,  6,  1]])

In [207]:
# Combining np.isnan and np.argwhere to replace the NaN values in an array.
# np.isnan() creates an array with true values in the place where there are missing values.
# So applying np.isnan() on an array and then putting it in np.argwhere will give the positions of indices which are 
# true as per the np.isnan() function. These are those values which are missing, so by default we will get the positions 
# of those arrays which are actually missing.

In [208]:
lending_co_numeric_NaN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [211]:
for i in np.argwhere(np.isnan(lending_co_numeric_NaN)):
    lending_co_numeric_NaN[i[0],i[1]] = 0

In [212]:
# Checking now whether there are any missing values or not.
np.isnan(lending_co_numeric_NaN).sum() #Since sum is 0 so there are no further missing values in our dataset.

0

## Shuffling Data
- With the help up of shuffling data we simply move the rows up and down the dataset.
- This is ensured to have random rows in the dataset and we can choose a sample from this random data.
- However the data in a single row doesn't gets shuffled only the rows move up and down in the dataset.

In [213]:
lending_co_numeric = np.loadtxt("./Data/Lending-Company-Numeric-Data.csv",delimiter = ',')[:8] #Loading the starting 8 rows.
lending_co_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [220]:
np.random.shuffle(lending_co_numeric) # This shuffles the data in place and thus gives no output.

In [221]:
lending_co_numeric #Executing above cell many no. of times shuffles the cell many no. of times.

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.]])

In [224]:
from numpy.random import shuffle
shuffle(lending_co_numeric) #After importing calling becomes easy.
lending_co_numeric

array([[ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.]])

In [236]:
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg

In [247]:
array_RG = gen(pcg())
array_RG.shuffle(lending_co_numeric)
lending_co_numeric

array([[ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.]])

In [256]:
array_RG = gen(pcg(seed  = 365))  # For shuffle fixing a seed also doesn't fix the output, the output still 
# keeps on changing that's the by default way how shuffle works.
array_RG.shuffle(lending_co_numeric)
lending_co_numeric

array([[ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.]])

## Casting
- Casting refers to changing the data type of the elements by using a astype method.
- Astype method doesn't work in place.

In [261]:
lending_co_numeric.astype(np.int32) #converting float to int.

array([[ 2000,    40,   365,  3041,  4241, 15321],
       [ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    50,   365,  1851,  3251, 17701],
       [ 2000,    40,   365,  3971,  4131, 15351],
       [ 1000,    40,   365,  2160,  3280, 15340],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 2000,    50,   365,  3470,  4820, 13720],
       [ 2000,    40,   365,  3201,  4141, 14141]])

In [262]:
lending_co_numeric.astype(np.float64) #converting back to float

array([[ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.]])

In [266]:
lending_co_numeric = lending_co_numeric.astype(str)

In [267]:
lending_co_numeric.astype(np.int32) # We can't change str with '.' to int directly so we convert to float64 and then to int.

ValueError: invalid literal for int() with base 10: '2000.0'

In [270]:
lending_co_numeric = lending_co_numeric.astype(np.float64).astype(np.int32)

In [271]:
lending_co_numeric

array([[ 2000,    40,   365,  3041,  4241, 15321],
       [ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    50,   365,  1851,  3251, 17701],
       [ 2000,    40,   365,  3971,  4131, 15351],
       [ 1000,    40,   365,  2160,  3280, 15340],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 2000,    50,   365,  3470,  4820, 13720],
       [ 2000,    40,   365,  3201,  4141, 14141]])

## Stripping Data
- This refers to removing extra text from strings without removing the whole string.
- We use the np.chararray.strip() function to do this.

In [283]:
lending_co_TP = np.genfromtxt('./Data/Lending-Company-Total-Price.csv',
                              delimiter = ',',
                              dtype = str,
                              skip_header=1,usecols=(1,2,4))
lending_co_TP # We want to remove id, product and location from the data.

array([['id_1', 'Product B', 'Location 2'],
       ['id_2', 'Product B', 'Location 3'],
       ['id_3', 'Product C', 'Location 5'],
       ...,
       ['id_413', 'Product B', 'Location 135'],
       ['id_414', 'Product C', 'Location 200'],
       ['id_415', 'Product A', 'Location 8']], dtype='<U12')

In [286]:
lending_co_TP[:,0] = np.chararray.strip(lending_co_TP[:,0],"id_") #This method is also not inplace so we need to store it.
lending_co_TP[:,1] = np.chararray.strip(lending_co_TP[:,1],"Product ")
lending_co_TP[:,2] = np.chararray.strip(lending_co_TP[:,2],"Location ")

In [287]:
lending_co_TP

array([['1', 'B', '2'],
       ['2', 'B', '3'],
       ['3', 'C', '5'],
       ...,
       ['413', 'B', '135'],
       ['414', 'C', '200'],
       ['415', 'A', '8']], dtype='<U12')

In [295]:
#Now we can replace the strings with integer values or the float values.
# Can't convert like this we need to convert for the whole array at once.
lending_co_TP[:,0] = lending_co_TP[:,0].astype(dtype = np.int32)
lending_co_TP[:,2] = lending_co_TP[:,2].astype(dtype = np.int32)
lending_co_TP

array([['1', 'B', '2'],
       ['2', 'B', '3'],
       ['3', 'C', '5'],
       ...,
       ['413', 'B', '135'],
       ['414', 'C', '200'],
       ['415', 'A', '8']], dtype='<U12')

In [296]:
# Since above cell doesnt' make changes so we change the chars of product to numeric values.
lending_co_TP[:,1] = np.where(lending_co_TP[:,1]=='A',1,lending_co_TP[:,1])
lending_co_TP[:,1] = np.where(lending_co_TP[:,1]=='B',1,lending_co_TP[:,1])
lending_co_TP[:,1] = np.where(lending_co_TP[:,1]=='C',1,lending_co_TP[:,1])
lending_co_TP[:,1] = np.where(lending_co_TP[:,1]=='D',1,lending_co_TP[:,1])
lending_co_TP[:,1] = np.where(lending_co_TP[:,1]=='E',1,lending_co_TP[:,1])
lending_co_TP[:,1] = np.where(lending_co_TP[:,1]=='F',1,lending_co_TP[:,1])
lending_co_TP

array([['1', '1', '2'],
       ['2', '1', '3'],
       ['3', '1', '5'],
       ...,
       ['413', '1', '135'],
       ['414', '1', '200'],
       ['415', '1', '8']], dtype='<U12')

In [299]:
lending_co_TP = lending_co_TP.astype(dtype = np.int32)
lending_co_TP

array([[  1,   1,   2],
       [  2,   1,   3],
       [  3,   1,   5],
       ...,
       [413,   1, 135],
       [414,   1, 200],
       [415,   1,   8]])

## Stacking
- Stacking refers to putting one array above the other when they have similar dimensions, but it all depends what type of stacking is being done.
- There are different functions like np.stack(),np.hstack(),np.vstack(),np.dstack()

In [303]:
lending_co_numeric = np.loadtxt('./Data/Lending-company-Numeric.csv',delimiter = ',',dtype = np.int32)
lending_co_numeric

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]])

In [315]:
np.stack((lending_co_numeric[:,0],lending_co_numeric[:,1]),axis = 0)
#lending_co_numeric[:,1].shape

array([[2000, 2000, 1000, ..., 2000, 1000, 2000],
       [  40,   40,   40, ...,   40,   40,   40]])

In [314]:
np.stack((lending_co_numeric[:,1],lending_co_numeric[:,0]),axis = 1)

array([[  40, 2000],
       [  40, 2000],
       [  40, 1000],
       ...,
       [  40, 2000],
       [  40, 1000],
       [  40, 2000]])

In [319]:
np.stack((lending_co_numeric[:,1],lending_co_numeric[:,:2]),axis = 0) 
# This gives error as the second array have different shape than the first one.
# We can do multiple stacking by specifying each array individually.

ValueError: all input arrays must have the same shape

In [321]:
np.vstack((lending_co_numeric[:,1],lending_co_numeric[:,0]))
# Here we are stacking the arrays one below the other.

array([[  40,   40,   40, ...,   40,   40,   40],
       [2000, 2000, 1000, ..., 2000, 1000, 2000]])

In [331]:
np.hstack((lending_co_numeric[:,:1],lending_co_numeric[:,1:2]))
# we are simply putting the arrays side by side so it became like concatination.

array([[2000,   40],
       [2000,   40],
       [1000,   40],
       ...,
       [2000,   40],
       [1000,   40],
       [2000,   40]])

In [334]:
array_a = np.array([[1,2,3,4],[0,6,8,5],[1,5,2,3]])
array_a

array([[1, 2, 3, 4],
       [0, 6, 8, 5],
       [1, 5, 2, 3]])

In [372]:
np.stack((array_a[0:1,:],array_a[2:3,:]),axis = 2) 

array([[[1, 1],
        [2, 5],
        [3, 2],
        [4, 3]]])

In [373]:
np.hstack((array_a[0,:],array_a[1,:])) 

array([1, 2, 3, 4, 0, 6, 8, 5])

In [374]:
np.vstack((array_a[0:1,:],array_a[2:3,:])) 

array([[1, 2, 3, 4],
       [1, 5, 2, 3]])

In [335]:
array_b = np.array([[[1,2,3,4],[5,6,7,8]],[[0,1,2,0],[2,1,3,4]]])
array_b

array([[[1, 2, 3, 4],
        [5, 6, 7, 8]],

       [[0, 1, 2, 0],
        [2, 1, 3, 4]]])

In [338]:
np.max(array_b,axis = 2)

array([[4, 8],
       [2, 4]])

## Concatenate

In [None]:
# Left do it by watching the videos for the same.

In [411]:
lending_co_numeric_nan = np.genfromtxt('./Data/Lending-Company-Numeric-Data-NAN.csv',delimiter = ';')
lending_co_numeric_nan

# Filling with max_fill for nan values.
max_fill = np.nanmax(lending_co_numeric_nan)+1
lending_co_numeric_fill = np.genfromtxt('./Data/Lending-Company-Numeric-Data-NAN.csv',
                                        delimiter = ';',
                                        filling_values=max_fill)
lending_co_numeric_fill

# Filling the max fill with columns individual mean.
temp_mean = np.nanmean(lending_co_numeric_nan,axis = 0).round(2)

for i in range(lending_co_numeric_fill.shape[1]):
    lending_co_numeric_fill[:,i] = np.where(lending_co_numeric_fill[:,i]==max_fill,temp_mean[0],lending_co_numeric_fill[:,i])

In [412]:
lending_co_numeric_fill

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

## Unique 

In [414]:
lending_co_numeric = np.loadtxt('./Data/Lending-company-Numeric.csv',delimiter = ',')
lending_co_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [415]:
np.unique(lending_co_numeric) # This finds the unique values in the whole array and also sorts them in ascending order.

array([-2870., -2550., -2450., ..., 52751., 54625., 64001.])

In [418]:
np.unique(lending_co_numeric[:,1],return_counts = True, return_index = True) 
# This gives the unique values in the 2nd collumn.
# Returning count actually returns how many times the value has been appeared.
# Returning index actually returns the index, when the first time the value appeared.
# By default returning index array comes second, this is seen by 0 value which can be only for index.


(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27], dtype=int64),
 array([  4, 567, 451,  19,   2], dtype=int64))