## Importing Data with NumPy

In [24]:
import numpy as np

### np.loadtxt() vs np.genfromtxt()

In [25]:
# The file is comma-separated. Opening it in a text editor such as Notepad++
# is a quick way to check which delimiter a file uses.
lending_co_data_numeric_1 = np.loadtxt(
    "Lending-Company-Numeric-Data.csv",
    delimiter=',',
)
lending_co_data_numeric_1

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [26]:
# Same file and delimiter as before, this time read with np.genfromtxt()
# so the two import functions can be compared.
lending_co_data_numeric_2 = np.genfromtxt(
    "Lending-Company-Numeric-Data.csv",
    delimiter=',',
)
lending_co_data_numeric_2

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [27]:
np.array_equal(lending_co_data_numeric_1, lending_co_data_numeric_2)

# np.array_equal() compares two arrays and tells us whether they have the same shape and the same elements

True

In [None]:
# np.loadtxt() fails to import incomplete datasets by default: an empty field
# cannot be converted to a float, so the call raises a ValueError.
# The error is caught and printed so that the demonstration does not stop a
# "Restart & Run All" of the notebook.
try:
    np.loadtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter=';')
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: could not convert string '' to float64 at row 11, column 4.

In [31]:
# If we import all the values as text, the empty fields no longer cause a
# type inconsistency, so np.loadtxt() can read the incomplete file.
lending_co_data_numeric_NAN = np.loadtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    dtype=str,
)
lending_co_data_numeric_NAN

array([['2000', '40', '365', '3121', '4241', '13621'],
       ['2000', '40', '365', '3061', '4171', '15041'],
       ['1000', '40', '365', '2160', '3280', '15340'],
       ...,
       ['', '40', '365', '4201', '5001', '16600'],
       ['1000', '40', '365', '2080', '3320', '15600'],
       ['2000', '40', '365', '4601', '4601', '16600']], dtype='<U5')

In [32]:
lending_co_data_numeric_NAN[0,0] + lending_co_data_numeric_NAN[0,1]

# The array was imported with dtype = str, so its elements are text rather than numbers:
# adding '2000' and '40' results in a concatenated '200040' rather than 2040.

'200040'

### Partial Cleaning While Importing

In [None]:
# np.genfromtxt() can import the incomplete file directly; entries that are
# missing from the file show up as nan in the resulting array.
lending_co_data_numeric_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
)
lending_co_data_numeric_NAN

In [None]:
# skip_header omits the given number of lines from the top of the text file.
lending_co_data_numeric_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    skip_header=2,
)
lending_co_data_numeric_NAN

In [None]:
# skip_footer omits the given number of lines from the bottom of the text file.
lending_co_data_numeric_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    skip_footer=2,
)
lending_co_data_numeric_NAN

In [33]:
# usecols tells the function to import only the listed columns, identified by
# their indices; the columns come back in the order in which they are listed.
lending_co_data_numeric_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    usecols=(5, 0, 1),
)
lending_co_data_numeric_NAN

array([[13621.,  2000.,    40.],
       [15041.,  2000.,    40.],
       [15340.,  1000.,    40.],
       ...,
       [16600.,    nan,    40.],
       [15600.,  1000.,    40.],
       [16600.,  2000.,    40.]])

In [None]:
# All of these arguments (and many more) can be combined in a single call
# so that we only import what we want.
lending_co_data_numeric_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    usecols=(5, 0, 1),
    skip_header=2,
    skip_footer=2,
)
lending_co_data_numeric_NAN

In [34]:
# Unpacking allows us to split the output array into smaller 1-D arrays:
# with unpack = True each selected column is assigned to its own variable,
# in the order given by usecols.
lending_co_data_5, lending_co_data_0, lending_co_data_1 = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN.csv",
    delimiter=';',
    usecols=(5, 0, 1),
    skip_header=2,
    skip_footer=2,
    unpack=True,
)
# One array per line, exactly as three separate print() calls would show them.
print(lending_co_data_5, lending_co_data_0, lending_co_data_1, sep="\n")

[15340. 15321. 13720. ... 16600. 16600. 16600.]
[1000. 2000. 2000. ... 2000. 2000.   nan]
[40. 40. 50. ... 40. 40. 40.]


### String vs Object vs Numbers

In [35]:
# The same dataset is imported differently based on the datatype we define.
# Swap the dtype argument below for one of these alternatives to compare:
#   dtype=np.int32
#   dtype=np.float16
#   dtype=object
#   dtype=(np.int32, str, str, str, str, str, np.int32)
# NOTE: the np.object and np.str aliases were removed in NumPy 1.24; the
# builtins object and str are the equivalent spellings.
lending_co_lt = np.genfromtxt(
    "lending-co-LT.csv",
    delimiter=',',
    dtype=str,
)
print(lending_co_lt)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [36]:
# The dataset was imported with dtype = str, so every element (including the
# header labels in row 0) is text and + joins the two strings together.
lending_co_lt[0,0] + lending_co_lt[0,1]

'LoanIDStringID'