## Importing Data with NumPy

In [1]:
import numpy as np

### np.loadtxt() vs np.genfromtxt()
- These are the 2 functions which are used for working with data and pull them into the code to work with the different data files.
- Load implies the data is ready to be directly imported and used.
- Generate indicates that the function creates datase from the text file. 
- Generating requires constructing the array as we go through the text file.
- While loading we use delimieters which are pre-defined symbols which are used to define distinct fields in text files. This is a very common approach to store large datasets in text files this way.

In [12]:
# This one is a faster method but it breaks when it encounters strings or NAN values, because the default dtype is of int.
lending_co_data_numeric_1 = np.loadtxt(".\Data\Lending-Company-Numeric-Data.csv",delimiter = ',')
type(lending_co_data_numeric_1)
print(lending_co_data_numeric_1.shape)
lending_co_data_numeric_1

(1043, 6)


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [14]:
# This one is a slower method but it handles the NAN values well.
lending_co_data_numeric_2 = np.genfromtxt(".\Data\Lending-Company-Numeric-Data.csv",delimiter = ',')
type(lending_co_data_numeric_2)
print(lending_co_data_numeric_2.shape)
lending_co_data_numeric_2

(1043, 6)


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [16]:
# Thus both the arrays are equal
np.array_equal(lending_co_data_numeric_1,lending_co_data_numeric_2)

True

In [29]:
# For handling NaN values with np.loadtxt we need to set dtype to str, else it gives an error on finding an empty 
# string in arrays of numeric values.
lending_co_data_numeric_1 = np.loadtxt(".\Data\Lending-Company-Numeric-Data-NAN.csv",delimiter = ';',dtype = str)
print((lending_co_data_numeric_1[:,:1]))


print(type(lending_co_data_numeric_1))
print(lending_co_data_numeric_1.shape)
lending_co_data_numeric_1
# Observe '' in the 3rd last row, that represents a Not a Number value or NaN and is stored as empty string.

[['2000']
 ['2000']
 ['1000']
 ...
 ['']
 ['1000']
 ['2000']]
<class 'numpy.ndarray'>
(1043, 6)


array([['2000', '40', '365', '3121', '4241', '13621'],
       ['2000', '40', '365', '3061', '4171', '15041'],
       ['1000', '40', '365', '2160', '3280', '15340'],
       ...,
       ['', '40', '365', '4201', '5001', '16600'],
       ['1000', '40', '365', '2080', '3320', '15600'],
       ['2000', '40', '365', '4601', '4601', '16600']], dtype='<U5')

In [23]:
# Handling NaN values with np.genfromtxt, is done automatically and we see a NaN value instead of an error.
lending_co_data_numeric_2 = np.genfromtxt(".\Data\Lending-Company-Numeric-Data-NAN.csv",delimiter = ';')
type(lending_co_data_numeric_2)
print(lending_co_data_numeric_2.shape)
lending_co_data_numeric_2
# Observe '' in the 3rd last row, that represents a Not a Number value or NaN and is stored as empty string.

(1043, 6)


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [24]:
# So for summary use the loadfromtext value with dtype as str to only view the data and use genfromtext to load 
# the values for mathematical computations.

### Partial Cleaning While Importing
- In partial cleaning we try to omit some parts of the data as per our need and 
- **Remember this is done while importing not after importing.**
- This is done when we are totally aware of our data and want to use only certain parts of the data.
- For this we use skip_header,skip_footer, usecols arguments in the method of loading the data.
- **Most important part is we can unpack each columns to separate variables individually.**

In [72]:
d_1,d_2,d_3 = np.genfromtxt('./Data/Lending-Company-Numeric-Data-NAN.csv',
                            delimiter = ';',
                            unpack = True,
                           usecols = (1,2,5))
print(d_1)
print(d_2)
print(d_3)

[40. 40. 40. ... 40. 40. 40.]
[365. 365. 365. ... 365. 365. 365.]
[13621. 15041. 15340. ... 16600. 15600. 16600.]


In [36]:
lending_co_data_numeric_NaN = np.genfromtxt('./Data/Lending-Company-Numeric-Data-NAN.csv',delimiter = ';')
lending_co_data_numeric_NaN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [37]:
lending_co_data_numeric_NaN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [41]:
# Skipping 2 rows from the top while importing the data.
lending_co_data_numeric_NaN = np.genfromtxt('./Data/Lending-Company-Numeric-Data-NAN.csv',
                                            delimiter = ';',
                                            skip_header = 2)
lending_co_data_numeric_NaN

array([[ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [42]:
# Skipping 2 rows from the bottom while importing the data.
lending_co_data_numeric_NaN = np.genfromtxt('./Data/Lending-Company-Numeric-Data-NAN.csv',
                                            delimiter = ';',
                                            skip_footer = 2)
lending_co_data_numeric_NaN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  3401.,    nan, 16600.],
       [ 2000.,    40.,   365.,    nan,  5440., 16600.],
       [   nan,    40.,   365.,  4201.,  5001., 16600.]])

In [48]:
# Using the only columns which are specified in form of tuple as an argument.
lending_co_data_numeric_NaN = np.genfromtxt('./Data/Lending-Company-Numeric-Data-NAN.csv'
                                            ,delimiter = ';',
                                            usecols = 4) # Here we are using the 5th column cause of 0-indexing in python.
lending_co_data_numeric_NaN

array([4241., 4171., 3280., ..., 5001., 3320., 4601.])

In [52]:
# Using the only columns which are specified in form of tuple as an argument.
# Also remember the order stays preserved while importing
lending_co_data_numeric_NaN = np.genfromtxt('./Data/Lending-Company-Numeric-Data-NAN.csv'
                                            ,delimiter = ';',
                                            usecols = (4,0,1)) # Here we are using the 5th column cause of 0-indexing in python.
lending_co_data_numeric_NaN

array([[4241., 2000.,   40.],
       [4171., 2000.,   40.],
       [3280., 1000.,   40.],
       ...,
       [5001.,   nan,   40.],
       [3320., 1000.,   40.],
       [4601., 2000.,   40.]])

### String vs Object vs Numbers
- We can import the data on the files in different data forms with the dtype argument.
- This totally depends on the need of the user.

In [54]:
lending_co_LT = np.genfromtxt('./Data/lending-co-LT.csv',delimiter = ',')
lending_co_LT

array([[      nan,       nan,       nan, ...,       nan,       nan,
              nan],
       [1.000e+00,       nan,       nan, ...,       nan,       nan,
        1.660e+04],
       [2.000e+00,       nan,       nan, ...,       nan,       nan,
        1.660e+04],
       ...,
       [1.041e+03,       nan,       nan, ...,       nan,       nan,
        1.660e+04],
       [1.042e+03,       nan,       nan, ...,       nan,       nan,
        1.560e+04],
       [1.043e+03,       nan,       nan, ...,       nan,       nan,
        1.660e+04]])

In [60]:
# To avoid the scientific values we can import the data in form of integers.
# But remember that with this the mathematical computations become possible and thus we can end up adding the 
# NaN values which are -1 and make no sense on addition.
lending_co_LT = np.genfromtxt('./Data/lending-co-LT.csv',delimiter = ',',dtype = int)
print(lending_co_LT)
print()
print(lending_co_LT[0,0]+lending_co_LT[0,1]) # Not sensible operation.

[[   -1    -1    -1 ...    -1    -1    -1]
 [    1    -1    -1 ...    -1    -1 16600]
 [    2    -1    -1 ...    -1    -1 16600]
 ...
 [ 1041    -1    -1 ...    -1    -1 16600]
 [ 1042    -1    -1 ...    -1    -1 15600]
 [ 1043    -1    -1 ...    -1    -1 16600]]

-2


In [61]:
# We can also get the output in form of strings.
lending_co_LT = np.genfromtxt('./Data/lending-co-LT.csv',delimiter = ',',dtype = str)
print(lending_co_LT)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [80]:
# We can specify a different dtype for every column or field.
lending_co_LT = np.genfromtxt('./Data/lending-co-LT.csv',
                              delimiter = ',',
                              usecols = (5,0,1),
                              dtype = [np.int32,str,np.float64])
print(lending_co_LT)

[(-1, '', nan) (-1, '', nan) (-1, '', nan) ... (-1, '', nan) (-1, '', nan)
 (-1, '', nan)]


In [82]:
# We can also get the output in form of objects.
lending_co_LT = np.genfromtxt('./Data/lending-co-LT.csv',delimiter = ',',dtype = object)
print(lending_co_LT)
#b represents that its of the object type.

[[b'LoanID' b'StringID' b'Product' ... b'Location' b'Region'
  b'TotalPrice']
 [b'1' b'id_1' b'Product B' ... b'Location 2' b'Region 2' b'16600.0']
 [b'2' b'id_2' b'Product B' ... b'Location 3' b'' b'16600.0']
 ...
 [b'1041' b'id_1041' b'Product B' ... b'Location 23' b'Region 4'
  b'16600.0']
 [b'1042' b'id_1042' b'Product C' ... b'Location 52' b'Region 6'
  b'15600.0']
 [b'1043' b'id_1043' b'Product B' ... b'Location 142' b'Region 6'
  b'16600.0']]
