# Preprocessing with NumPy

In [1]:
import numpy as np

## Checking for Missing Values

In [3]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
    
## If np.loadtxt() compiles first time, the dataset consists of only numeric values and has no missing data. 

In [5]:
np.isnan(lending_co_data_numeric)    #2d array

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [6]:
np.isnan(lending_co_data_numeric).sum() #if 0 no missing values

0

In [11]:
#lending_co_data_numeric_nan = np.loadtxt("Lending-company-Numeric-NAN.csv", delimiter = ';') #error due to missing data 

In [12]:
lending_co_data_numeric_nan = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';') #error due to missing data 

In [13]:
np.isnan(lending_co_data_numeric_nan)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [ True, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [14]:
np.isnan(lending_co_data_numeric_nan).sum()

260

In [15]:
lending_co_data_numeric_nan = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';',
                                            filling_values = 0)

In [16]:
np.isnan(lending_co_data_numeric_nan).sum()

0

In [17]:
lending_co_data_numeric_nan = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';')

In [19]:
temporary_fill = np.nanmax(lending_co_data_numeric_nan).round(2) + 1

In [20]:
temporary_fill

64002.0

In [21]:
lending_co_data_numeric_nan = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';',
                                            filling_values = temporary_fill)

In [22]:
np.isnan(lending_co_data_numeric_nan).sum()

0

## Substituting Missing Values

In [36]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [37]:
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

## Storing the means of every column. 

In [38]:
temporary_mean[0]

2250.25

In [39]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ";",
                                           filling_values= temporary_fill)

In [40]:
temporary_fill

64002.0

In [42]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2)

4263.25

In [43]:
temporary_mean[0]

2250.25

In [45]:
lending_co_data_numeric_NAN[:,0] = np.where(lending_co_data_numeric_NAN[:,0] == temporary_fill,
                                           temporary_mean[0],
                                           lending_co_data_numeric_NAN[:,0])

In [46]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2)

2250.25

In [47]:
for i in range(lending_co_data_numeric_NAN.shape[1]):        
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill, 
                                                temporary_mean[i], 
                                                lending_co_data_numeric_NAN[:,i])
    
# We're generalizing the filling from earlier and going through all the columns. 

In [48]:
for i in range(lending_co_data_numeric_NAN.shape[1]):        
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:, i] < 0,
                                                0, 
                                                lending_co_data_numeric_NAN[:,i])
    
# We can use this approach for other applications as well (e.g. remove all negative values and set them to 0)

## Reshaping

In [49]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

In [50]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [52]:
lending_co_data_numeric.shape  
#2d array with 1043 rows and 6 columns 

(1043, 6)

In [53]:
np.reshape(lending_co_data_numeric, (6,1043))

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [54]:
lending_co_data_numeric.transpose()

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

In [57]:
np.transpose(lending_co_data_numeric)

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

In [60]:
np.reshape(lending_co_data_numeric, (3,2086) )

array([[ 2000.,    40.,   365., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  4601.,  4601., 16600.]])

In [61]:
np.reshape(lending_co_data_numeric, (2,3,1043))

array([[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
        [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
        [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

       [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
        [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
        [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]])

In [63]:
np.reshape(lending_co_data_numeric, (1,1,2,3,1043))

array([[[[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
          [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
          [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

         [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
          [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
          [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]]]])

In [64]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [66]:
lending_co_data_numeric_2 = np.reshape(lending_co_data_numeric, (6,1043))
lending_co_data_numeric_2

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [67]:
lending_co_data_numeric.reshape(6,1043)

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

## Removing Values

In [68]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [69]:
np.delete(lending_co_data_numeric, 0)

array([   40.,   365.,  3121., ...,  4601.,  4601., 16600.])

In [71]:
np.delete(lending_co_data_numeric, 0).shape
#we have created a 1d array with 6257 elements 

(6257,)

In [72]:
lending_co_data_numeric.size

6258

In [74]:
lending_co_data_numeric
#changes didn't get saved , store them in a new variable

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [76]:
np.delete(lending_co_data_numeric, 0, axis = 0)
#we have removed the first row

array([[ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [77]:
np.delete(lending_co_data_numeric, 0, axis = 1)
#we have removed the first column

array([[   40.,   365.,  3121.,  4241., 13621.],
       [   40.,   365.,  3061.,  4171., 15041.],
       [   40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  4201.,  5001., 16600.],
       [   40.,   365.,  2080.,  3320., 15600.],
       [   40.,   365.,  4601.,  4601., 16600.]])

In [78]:
np.delete(lending_co_data_numeric, 1, axis = 1)
#we have removed the second column

array([[ 2000.,   365.,  3121.,  4241., 13621.],
       [ 2000.,   365.,  3061.,  4171., 15041.],
       [ 1000.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,   365.,  4201.,  5001., 16600.],
       [ 1000.,   365.,  2080.,  3320., 15600.],
       [ 2000.,   365.,  4601.,  4601., 16600.]])

In [80]:
np.delete(lending_co_data_numeric,(0,2,4), axis = 1)
#we have removed the first, third and fifth column

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

In [81]:
np.delete(lending_co_data_numeric,[0,2,4], axis = 1) # can also be used in a list 
#we have removed the first, third and fifth column   

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

In [86]:
np.delete(np.delete(lending_co_data_numeric, [0,2,4] , axis = 1 ), [0,2,-1], axis = 0) 
#we can delete columns and rows with a single command

array([[   40.,  3061., 15041.],
       [   40.,  3041., 15321.],
       [   50.,  3470., 13720.],
       ...,
       [   40.,  4240., 16600.],
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.]])

## Sorting Data

In [88]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ",")
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [89]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [91]:
np.sort(lending_co_data_numeric).shape

(1043, 6)

In [92]:
lending_co_data_numeric.shape

(1043, 6)

In [93]:
np.sort(lending_co_data_numeric, axis = 0)

array([[ 1.0000e+03,  3.5000e+01,  3.6500e+02, -2.8700e+03, -2.8700e+03,
        -3.5000e+02],
       [ 1.0000e+03,  3.5000e+01,  3.6500e+02, -2.5500e+03, -2.1000e+03,
         1.5000e+02],
       [ 1.0000e+03,  3.5000e+01,  3.6500e+02, -2.4500e+03, -2.0000e+03,
         1.1000e+03],
       ...,
       [ 9.0000e+03,  1.2500e+02,  3.6500e+02,  1.6751e+04,  1.8751e+04,
         5.4625e+04],
       [ 9.0000e+03,  1.6500e+02,  3.6500e+02,  1.7650e+04,  2.0001e+04,
         5.4625e+04],
       [ 9.0000e+03,  1.6500e+02,  3.6500e+02,  1.9001e+04,  2.2001e+04,
         6.4001e+04]])

In [95]:
np.set_printoptions(suppress = True)
#applies to entire file and not to a single cell

In [96]:
np.sort(lending_co_data_numeric, axis = 0)
#LOWEST values in the columns going down to highest in the last row of each column.

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [97]:
np.sort(lending_co_data_numeric, axis = None)
#axis =none forces the function to work with the flattened version of the 2d input

array([-2870., -2870., -2550., ..., 54625., 54625., 64001.])

In [98]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [99]:
np.sort(-lending_co_data_numeric)

array([[-13621.,  -4241.,  -3121.,  -2000.,   -365.,    -40.],
       [-15041.,  -4171.,  -3061.,  -2000.,   -365.,    -40.],
       [-15340.,  -3280.,  -2160.,  -1000.,   -365.,    -40.],
       ...,
       [-16600.,  -5001.,  -4201.,  -2000.,   -365.,    -40.],
       [-15600.,  -3320.,  -2080.,  -1000.,   -365.,    -40.],
       [-16600.,  -4601.,  -4601.,  -2000.,   -365.,    -40.]])

In [101]:
-np.sort(-lending_co_data_numeric)  #Descending order

array([[13621.,  4241.,  3121.,  2000.,   365.,    40.],
       [15041.,  4171.,  3061.,  2000.,   365.,    40.],
       [15340.,  3280.,  2160.,  1000.,   365.,    40.],
       ...,
       [16600.,  5001.,  4201.,  2000.,   365.,    40.],
       [15600.,  3320.,  2080.,  1000.,   365.,    40.],
       [16600.,  4601.,  4601.,  2000.,   365.,    40.]])

In [102]:
-np.sort(lending_co_data_numeric)

array([[   -40.,   -365.,  -2000.,  -3121.,  -4241., -13621.],
       [   -40.,   -365.,  -2000.,  -3061.,  -4171., -15041.],
       [   -40.,   -365.,  -1000.,  -2160.,  -3280., -15340.],
       ...,
       [   -40.,   -365.,  -2000.,  -4201.,  -5001., -16600.],
       [   -40.,   -365.,  -1000.,  -2080.,  -3320., -15600.],
       [   -40.,   -365.,  -2000.,  -4601.,  -4601., -16600.]])

In [103]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [104]:
np.sort(lending_co_data_numeric[:,3])
#1d array containing the values of the 4th column of the dataset in ascending order.

array([-2870., -2550., -2450., ..., 16751., 17650., 19001.])

In [106]:
#lending_co_data_numeric[:,3] = np.sort(lending_co_data_numeric[:,3])
lending_co_data_numeric[:,3].sort()
lending_co_data_numeric

array([[ 2000.,    40.,   365., -2870.,  4241., 13621.],
       [ 2000.,    40.,   365., -2550.,  4171., 15041.],
       [ 1000.,    40.,   365., -2450.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., 16751.,  5001., 16600.],
       [ 1000.,    40.,   365., 17650.,  3320., 15600.],
       [ 2000.,    40.,   365., 19001.,  4601., 16600.]])

In [107]:
lending_co_data_numeric.sort( axis = 0 )
lending_co_data_numeric

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

## Argument Functions

### np.argsort()

In [108]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [109]:
np.argsort(lending_co_data_numeric)

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]], dtype=int64)

In [110]:
np.argsort(lending_co_data_numeric,axis=0)

array([[ 537,  443,    0,   32,   32,  482],
       [ 639,  327,  687,  166,  166,  493],
       [ 849,  432,  688,   85,   85,  166],
       ...,
       [  27,  326,  355,  568, 1019,  568],
       [ 277,   27,  357,  718, 1033,  534],
       [ 420,  408, 1042,  912,  912,   27]], dtype=int64)

In [111]:
lending_co_data_numeric[482,5]

-350.0

In [112]:
np.sort(lending_co_data_numeric,axis=0)

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [113]:
np.argsort(lending_co_data_numeric[:,0])

array([537, 639, 849, ...,  27, 277, 420], dtype=int64)

In [114]:
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
lending_co_data_numeric

# Sorts the array based on the values in the 1st column. 

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

In [115]:
lending_co_data_numeric.argsort(axis = 0)

array([[   0,   22,    0,  199,  199,  172],
       [ 155,   62,  687,   53,   53,  160],
       [ 156,   38,  688,  169,  169,   53],
       ...,
       [1022, 1042,  355, 1024, 1037, 1023],
       [1031, 1039,  357,  941, 1029, 1024],
       [1042, 1040, 1042, 1027, 1027, 1040]], dtype=int64)

In [116]:
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

### np.argwhere()

In [117]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [118]:
np.argwhere(lending_co_data_numeric)

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]], dtype=int64)

In [119]:
np.argwhere(lending_co_data_numeric == False)

array([[116,   4],
       [430,   3]], dtype=int64)

In [120]:
lending_co_data_numeric[116]

array([ 1000.,    50.,   365., -1450.,     0., 13850.])

In [121]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
np.argwhere(lending_co_data_numeric > 1000)

In [123]:
np.isnan(lending_co_data_numeric).sum()

0

In [124]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [127]:
np.isnan(lending_co_data_numeric_NAN)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [ True, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [131]:
np.argwhere(np.isnan(lending_co_data_numeric_NAN))

AttributeError: 'numpy.ndarray' object has no attribute 'supress'

In [129]:
lending_co_data_numeric_NAN[175]

array([ 2000.,    nan,    nan,  1851.,  3051., 13561.])

In [133]:
for array_index in np.argwhere(np.isnan(lending_co_data_numeric_NAN)):
    lending_co_data_numeric_NAN[array_index[0], array_index[1]] = 0

In [134]:
lending_co_data_numeric_NAN[175]

array([ 2000.,     0.,     0.,  1851.,  3051., 13561.])

In [135]:
np.isnan(lending_co_data_numeric_NAN).sum()

0

## Shuffling Data

In [138]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')[:8]
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [139]:
np.random.shuffle(lending_co_data_numeric)

In [140]:
lending_co_data_numeric

array([[ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.]])

In [141]:
np.random.shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.]])

In [142]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [144]:
from numpy.random import shuffle
#we dont have to write np.random now, only shuffle

In [152]:
shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 4000.,    50.,   365.,  6000.,  8000., 22250.],
       [ 2000.,    50.,   365.,  3521.,  4871., 20250.],
       [ 2000.,    40.,   365.,  3301.,  4741., 16600.],
       ...,
       [ 2000.,    50.,   365.,  2651.,  3151.,  6801.],
       [ 2000.,    40.,   365.,  3401.,  4601., 10681.],
       [ 2000.,    40.,   365.,  3401.,  4301., 16600.]])

In [153]:
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg

# Random generators can be used for shuffling. 

In [156]:
array_RG = gen(pcg(seed = 365))
array_RG.shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 4000.,    50.,   365.,  5500.,  7000., 22250.],
       [ 1000.,    40.,   365.,  2200.,  3470., 15600.],
       ...,
       [ 2000.,    40.,   365.,  4600.,  6900., 14400.],
       [ 2000.,    40.,   365.,  3401.,  4601., 16600.],
       [ 2000.,    40.,   365.,  3280.,  4320., 16420.]])

## Casting

In [2]:
import numpy as np
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [3]:
lending_co_data_numeric.astype(dtype = np.int32)   #decimals where removed and replaced with an int

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]])

In [7]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str)
lending_co_data_numeric 
#all the arrays look like floats but they are integers and have quotation marks around them  

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str)


array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [8]:
type(lending_co_data_numeric)

numpy.ndarray

In [9]:
#they cannot be converted to decimals DUE TO THE DOT POINT.
#lending_co_data_numeric.astype(dtype = np.int32)

ValueError: invalid literal for int() with base 10: '2000.0'

In [10]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.float32) #float first 
lending_co_data_numeric.astype(dtype = np.int32) #integer

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]])

In [13]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str)
lending_co_data_numeric

# To showcase the other way to go from strings to integers, 
#we need to get the strings version of the array once again. 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str)


array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [11]:
lending_co_data_numeric.astype(dtype = np.float32).astype(dtype = np.int32) 
#doesnt overwrite the contents of our object.

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]])

In [14]:
lending_co_data_numeric.astype(dtype = np.float32)

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]], dtype=float32)

In [15]:
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

## Stripping Data

In [16]:
lending_co_total_price = np.genfromtxt("Lending-Company-Total-Price.csv",
                                       delimiter = ',',
                                       dtype = np.str,
                                       skip_header = 1, 
                                       usecols = [1,2,4])
lending_co_total_price

# We don't neeed the entire array. We only want a few columns to showcase how stripping data works.

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype = np.str,


array([['id_1', 'Product B', 'Location 2'],
       ['id_2', 'Product B', 'Location 3'],
       ['id_3', 'Product C', 'Location 5'],
       ...,
       ['id_413', 'Product B', 'Location 135'],
       ['id_414', 'Product C', 'Location 200'],
       ['id_415', 'Product A', 'Location 8']], dtype='<U12')

In [21]:
lending_co_total_price[:,0] = np.chararray.strip(lending_co_total_price[:,0], "id_")
lending_co_total_price[:,1] = np.chararray.strip(lending_co_total_price[:,1], "Product ")
lending_co_total_price[:,2] = np.chararray.strip(lending_co_total_price[:,2], "Location ")
lending_co_total_price

array([['1', 'B', '2'],
       ['2', 'B', '3'],
       ['3', 'C', '5'],
       ...,
       ['413', 'B', '135'],
       ['414', 'C', '200'],
       ['415', 'A', '8']], dtype='<U12')

In [23]:
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'A', 1, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'B', 2, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'C', 3, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'D', 4, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'E', 5, lending_co_total_price[:,1]) 
lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == 'F', 6, lending_co_total_price[:,1]) 

lending_co_total_price


array([['1', '2', '2'],
       ['2', '2', '3'],
       ['3', '3', '5'],
       ...,
       ['413', '2', '135'],
       ['414', '3', '200'],
       ['415', '1', '8']], dtype='<U12')

In [24]:
lending_co_total_price = lending_co_total_price.astype(dtype = np.int32)
lending_co_total_price

array([[  1,   2,   2],
       [  2,   2,   3],
       [  3,   3,   5],
       ...,
       [413,   2, 135],
       [414,   3, 200],
       [415,   1,   8]])

## Stacking

In [25]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [36]:
# Recall

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';', 
                                            filling_values = temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill,
                                                temporary_mean[i],
                                                lending_co_data_numeric_NAN[:,i])
lending_co_data_numeric_NAN


## We create a filler, reimport and fill all the nan-s, then subsitute all the temporary fillers with more appropriate values

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [28]:
np.stack((lending_co_data_numeric[:,0], lending_co_data_numeric[:,1])) #CHECK ORDER

array([[2000., 2000., 1000., ..., 2000., 1000., 2000.],
       [  40.,   40.,   40., ...,   40.,   40.,   40.]])

In [31]:
np.stack((lending_co_data_numeric[:,1], lending_co_data_numeric[:,0])) #CHECK ORDER

array([[  40.,   40.,   40., ...,   40.,   40.,   40.],
       [2000., 2000., 1000., ..., 2000., 1000., 2000.]])

In [32]:
np.transpose(lending_co_data_numeric[:,:2])

array([[2000., 2000., 1000., ..., 2000., 1000., 2000.],
       [  40.,   40.,   40., ...,   40.,   40.,   40.]])

In [33]:
np.stack((lending_co_data_numeric[:,1], lending_co_data_numeric[:,0]), axis=1 ) #CHECK ORDER

array([[  40., 2000.],
       [  40., 2000.],
       [  40., 1000.],
       ...,
       [  40., 2000.],
       [  40., 1000.],
       [  40., 2000.]])

In [34]:
np.stack((lending_co_data_numeric[:,1], lending_co_data_numeric[:,0], lending_co_data_numeric[:,2]), axis=1 ) #CHECK ORDER

array([[  40., 2000.,  365.],
       [  40., 2000.,  365.],
       [  40., 1000.,  365.],
       ...,
       [  40., 2000.,  365.],
       [  40., 1000.,  365.],
       [  40., 2000.,  365.]])

In [35]:
#ERROR
#np.stack((lending_co_data_numeric[:,1], lending_co_data_numeric[:,0], lending_co_data_numeric[:,:2]), axis=1 ) 

ValueError: all input arrays must have the same shape

In [37]:
lending_co_data_numeric_NAN.shape

(1043, 6)

In [39]:
np.vstack((lending_co_data_numeric, lending_co_data_numeric_NAN)) #we can't check if done correctly so we can .shape

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [40]:
np.vstack((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape #to check if stacked correctly

(2086, 6)

In [41]:
np.hstack((lending_co_data_numeric, lending_co_data_numeric_NAN)) #we can't check if done correctly so we can .shape

array([[ 2000.,    40.,   365., ...,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365., ...,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365., ...,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., ...,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365., ...,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365., ...,  4601.,  4601., 16600.]])

In [42]:
np.hstack((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape #to check if stacked correctly

(1043, 12)

In [43]:
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))

array([[[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3121.  ,  3121.  ],
        [ 4241.  ,  4241.  ],
        [13621.  , 13621.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3061.  ,  3061.  ],
        [ 4171.  ,  4171.  ],
        [15041.  , 15041.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2160.  ,  2160.  ],
        [ 3280.  ,  3280.  ],
        [15340.  , 15340.  ]],

       ...,

       [[ 2000.  ,  2250.25],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 4201.  ,  4201.  ],
        [ 5001.  ,  5001.  ],
        [16600.  , 16600.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2080.  ,  2080.  ],
        [ 3320.  ,  3320.  ],
        [15600.  , 15600.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  

In [44]:
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape

(1043, 6, 2)

In [47]:
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))[0,:,0]

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.])

In [48]:
np.stack((lending_co_data_numeric, lending_co_data_numeric_NAN), axis= -1) #same as dstack
#only works with 2 x 2D array

array([[[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3121.  ,  3121.  ],
        [ 4241.  ,  4241.  ],
        [13621.  , 13621.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3061.  ,  3061.  ],
        [ 4171.  ,  4171.  ],
        [15041.  , 15041.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2160.  ,  2160.  ],
        [ 3280.  ,  3280.  ],
        [15340.  , 15340.  ]],

       ...,

       [[ 2000.  ,  2250.25],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 4201.  ,  4201.  ],
        [ 5001.  ,  5001.  ],
        [16600.  , 16600.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2080.  ,  2080.  ],
        [ 3320.  ,  3320.  ],
        [15600.  , 15600.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  

In [49]:
array_example_1 = np.array([[[1,2,3,4],[5,6,7,8],[9,10,11,12]],[[21,22,23,24],[25,26,27,28],[29,30,31,32]]])
array_example_2 = array_example_1 * 2

# We're quickly creating some 3-D arrays to showcase how dstack works for higher dimensions. 

In [50]:
np.dstack((array_example_1, array_example_2)).shape

(2, 3, 8)

In [51]:
np.stack((array_example_1, array_example_2), axis = 2).shape

# We can no longer replicate the output of dstack by simply specifying an axis. 

(2, 3, 2, 4)

## Concatenate

In [52]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [53]:
np.concatenate((lending_co_data_numeric[0,:], lending_co_data_numeric[1,:]))

# The concatenated array has the same number of dimensions as the inputs. 

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.,  2000.,    40.,
         365.,  3061.,  4171., 15041.])

In [54]:
# Recall

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';', 
                                            filling_values = temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill,
                                                temporary_mean[i],
                                                lending_co_data_numeric_NAN[:,i])
lending_co_data_numeric_NAN


## We create a filler, reimport and fill all the nan-s, then subsitute all the temporary fillers with more appropriate values

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [56]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN))
#we have concatenated the second array at the bottom of the first array 

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [57]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape
#check size / shape

(2086, 6)

In [58]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = 1)
#we have concatenated the arrays side byt side. New array has 12 columns

array([[ 2000.,    40.,   365., ...,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365., ...,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365., ...,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., ...,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365., ...,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365., ...,  4601.,  4601., 16600.]])

In [59]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = 1).shape

(1043, 12)

In [61]:
#Error
#np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = 2)
#error because the inputs have 2 dimensions

In [62]:
array_example_1 = np.array([[[1,2,3,4],[5,6,7,8],[9,10,11,12]],[[21,22,23,24],[25,26,27,28],[29,30,31,32]]])
array_example_2 = array_example_1 * 2

# We create 3-D arrays to showcase concatenate vs stacking

In [64]:
np.concatenate((array_example_1, array_example_2), axis = 0)

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[21, 22, 23, 24],
        [25, 26, 27, 28],
        [29, 30, 31, 32]],

       [[ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24]],

       [[42, 44, 46, 48],
        [50, 52, 54, 56],
        [58, 60, 62, 64]]])

In [67]:
np.vstack((array_example_1, array_example_2))
#np.concatenate((array_example_1, array_example_2), axis = 0) same result both AXIS =0

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[21, 22, 23, 24],
        [25, 26, 27, 28],
        [29, 30, 31, 32]],

       [[ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24]],

       [[42, 44, 46, 48],
        [50, 52, 54, 56],
        [58, 60, 62, 64]]])

In [70]:
#np.hstack((array_example_1, array_example_2))
np.concatenate((array_example_1, array_example_2), axis = 1)

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12],
        [ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24]],

       [[21, 22, 23, 24],
        [25, 26, 27, 28],
        [29, 30, 31, 32],
        [42, 44, 46, 48],
        [50, 52, 54, 56],
        [58, 60, 62, 64]]])

In [71]:
#np.dstack((array_example_1, array_example_2))
np.concatenate((array_example_1, array_example_2), axis = 2)


array([[[ 1,  2,  3,  4,  2,  4,  6,  8],
        [ 5,  6,  7,  8, 10, 12, 14, 16],
        [ 9, 10, 11, 12, 18, 20, 22, 24]],

       [[21, 22, 23, 24, 42, 44, 46, 48],
        [25, 26, 27, 28, 50, 52, 54, 56],
        [29, 30, 31, 32, 58, 60, 62, 64]]])

In [72]:
np.concatenate((lending_co_data_numeric[0,:], lending_co_data_numeric[:,0] ))

array([2000.,   40.,  365., ..., 2000., 1000., 2000.])

## Unique 

In [73]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [76]:
np.unique(lending_co_data_numeric)

array([-2870., -2550., -2450., ..., 52751., 54625., 64001.])

In [75]:
np.unique(lending_co_data_numeric[:,1], return_counts = True, return_index = True)

# Unique -> returns the unique values within the array in increasing order
# return_counts -> returns how many times each unique value appears in the array
# return_index -> returns the index of the first encounter with each unique value

(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27], dtype=int64),
 array([  4, 567, 451,  19,   2], dtype=int64))

In [79]:
np.unique(lending_co_data_numeric[:,1])

array([ 35.,  40.,  50., 125., 165.])

In [82]:
array_example = np.array(["a1", "a3","A1","A3","A3","AA1","B1","A2","B1","A2","B2","B2", "B3","a2","a3","B3","B3","a3" ])
np.unique(array_example)

array(['A1', 'A2', 'A3', 'AA1', 'B1', 'B2', 'B3', 'a1', 'a2', 'a3'],
      dtype='<U3')

In [83]:
np.unique(lending_co_data_numeric[:,1], return_counts = True)

(array([ 35.,  40.,  50., 125., 165.]),
 array([  4, 567, 451,  19,   2], dtype=int64))

In [84]:
np.unique(lending_co_data_numeric[:,1], return_counts = True, return_index = True)

# Unique -> returns the unique values within the array in increasing order
# return_counts -> returns how many times each unique value appears in the array
# return_index -> returns the index of the first encounter with each unique value

(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27], dtype=int64),
 array([  4, 567, 451,  19,   2], dtype=int64))