# Preprocessing with NumPy
Preprocessing with NumPy is an essential step in preparing data for analysis, machine learning, or statistical modeling. It involves cleaning, transforming, and organizing data to ensure it's in a format that's ready for further use. NumPy, with its array-based operations, is a powerful tool for performing data preprocessing tasks efficiently.



In [2]:
import numpy as np

## Loading Data
You can load data from various sources using NumPy.

In [4]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN-short.csv",
                                            delimiter=',', skip_header=1)
print(lending_co_data_numeric_NAN)

[nan nan nan]


## Checking for Missing Values

In [3]:
import numpy as np

# Example data
lending_co_data_numeric = np.array([[1, 2, np.nan],
                                    [4, np.nan, 6],
                                    [7, 8, 9]])

# Counting NaN values
num_nans = np.isnan(lending_co_data_numeric).sum()

print(f"Number of NaN values: {num_nans}")

Number of NaN values: 2


In [4]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN-short.csv",
                                            delimiter = ';',
                                            filling_values = 0)

## Filling_values substitutes every nan with the value we're passing (0 in this case)

In [5]:
np.isnan(lending_co_data_numeric_NAN).sum()

## All the previously missing values are now 0s.

np.int64(0)

In [8]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN-short.csv",
                                            delimiter = ';') 

# We need to reimport the dataset since all the missing values are filled up. 

In [6]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1
# np.nanmax() calculates the maximum value of the array while ignoring any NaN values.
# This means that if lending_co_data_numeric_NAN contains missing values (NaN), 
# they are not considered when calculating the maximum value. Only the valid numbers are used.

# round(2) This step ensures that the maximum value is rounded to two decimal places,
#  making it easier to handle in further calculations.

# We use nanmax(), since max() returns nan. 
# We want a value greater than the max, since we have be certain it's unique to the dataset.

In [7]:
temporary_fill

np.float64(12.0)

In [8]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN-short.csv",
                                            delimiter = ';',
                                            filling_values = temporary_fill) 

# Filling up all the missing values with the temporary filler. 

In [9]:
np.isnan(lending_co_data_numeric_NAN)

array([[False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False]])

In [10]:
np.isnan(lending_co_data_numeric_NAN).sum()

np.int64(0)

In [11]:
lending_co_data_numeric_NAN

array([[ 1.,  2.,  3.],
       [ 4., 12.,  6.],
       [ 7.,  8., 12.],
       [12., 10., 11.]])

## Substituting Missing Values

In [14]:
lending_co_data_numeric_NAN = np.genfromtxt(
    "Lending-Company-Numeric-Data-NAN-short.csv", delimiter=';')
lending_co_data_numeric_NAN

array([[ 1.,  2.,  3.],
       [ 4., nan,  6.],
       [ 7.,  8., nan],
       [nan, 10., 11.]])

In [15]:
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

## Storing the means of every column. 

In [16]:
temporary_mean[0]

4.0

In [17]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1 # max value

lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN-short.csv",
                                            delimiter = ';',
                                            filling_values = temporary_fill)

## Creating a unique filler and using it to take care of all the missing values.

In [18]:
temporary_fill

12.0

In [None]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2) 
# calculate the mean of the first column of a lending_co_data_numeric_NAN),
#  while rounding the result to two decimal places



# Supposed mean (w/ fillers)

In [None]:
temporary_mean[0]

# Actual mean (w/0 fillers)

np.where() is a conditional function in NumPy that returns elements chosen from two options:

If the condition is True, it selects the value x.

If the condition is False, it selects the value y.

In [None]:
lending_co_data_numeric_NAN[:,0] = np.where(lending_co_data_numeric_NAN[:,0] == temporary_fill,
                                            temporary_mean[0], 
                                            lending_co_data_numeric_NAN[:,0])

# lending_co_data_numeric_NAN[:, 0]:

# Refers to the first column of the lending_co_data_numeric_NAN array.
# The : selects all rows, and 0 specifies the first column.

# np.where(condition, x, y):
# A NumPy function that works like a vectorized "if-else".
# Condition: Checks for a specific condition (e.g., missing or placeholder values).
# x: The value to assign if the condition is True.
# y: The value to retain if the condition is False.

# temporary_mean[0]:

# The mean value calculated for the first column.
# Used to replace the placeholder value (temporary_fill) where the condition is True.

# lending_co_data_numeric_NAN[:,0]:

# If the condition is False, the original value in the column is retained.
# Going through the first column and substituting any temporary fillers (previously missing) 
# with the mean for that column.

# data replacement operation on the first column of the lending_co_data_numeric_NAN array. 
# Specifically, it replaces values in the first column that match a certain value (temporary_fill) with a new value (temporary_mean[0]). 

# This code checks each element in the first column of lending_co_data_numeric_NAN. If an element matches temporary_fill, it will be replaced with the value temporary_mean[0]. Otherwise, it will remain unchanged.

## Reshaping
In reshaping, NumPy simply flattens the original array and fills the new shape (6 rows and 1043 columns) in row-major order.

Reshaping an array means changing the shape of the array without modifying its data, but it involves reordering the elements of the array to fit the new shape.
In reshaping, the elements are rearranged in the order they appear, which can result in a different layout of data in memory.
Reshaping can change the number of rows and columns (or more generally, the shape of the array) but doesn't guarantee the same structure of data as before.


In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

In [None]:
lending_co_data_numeric

In [None]:
lending_co_data_numeric.shape

In [None]:
np.reshape(lending_co_data_numeric, (6,1043))

# Reshaping (1043,6) to (6,1043) is not the same as transposing.

In [None]:
np.transpose(lending_co_data_numeric)

# **Reshaping vs Transposing in NumPy**

Reshaping and transposing a NumPy array are not the same operations. While both modify the structure of the array, they do so in different ways.

---

## **Key Differences**

1. **Reshaping**:
   - Changes the shape of the array by rearranging its elements in row-major order.
   - Does not preserve the relative positions of elements as they were in the original array.

2. **Transposing**:
   - Flips the rows and columns of the array (or swaps axes for higher dimensions).
   - Preserves the relative positions of elements.

---

## **Example**



In [None]:
import numpy as np
arr = np.array([[1, 2, 3,4,5,500],
                [4, 5, 6,7,8,800]])
print(arr.shape)
reshaped = arr.reshape(3, 4)
print("reshaped")

print(reshaped)
print("transposed")

transposed = reshaped.T
print(transposed)

In [None]:
lending_co_data_numeric

# Reshaping doesn't alter the original array. 

In [None]:
lending_co_data_numeric_2 = np.reshape(lending_co_data_numeric, (6,1043))
lending_co_data_numeric_2

In [None]:
lending_co_data_numeric.reshape(6,1043)

# Equivalent method. 

In [None]:
lending_co_data_numeric

## Removing Values

In [58]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 

In [59]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [60]:
lending_co_data_numeric.shape

(1043, 6)

In [65]:
lending_co_data_numeric1=np.delete(lending_co_data_numeric, 0)
print(lending_co_data_numeric1)
print(lending_co_data_numeric1.shape)



# If you do not specify the axis parameter,
#  np.delete() will flatten the array before performing the deletion.
#  This means that the entire array is considered as a 1D array, 
# and the deletion will remove an element from this 1D version of the array.



# As a result, the array is flattened, and the first element (index 0) is removed, which will affect the total number of elements in the array. The total number of elements decreases by one.



# Removes the first value of the flattened array. 

[   40.   365.  3121. ...  4601.  4601. 16600.]
(6257,)


In [None]:
lending_co_data_numeric.size

In [None]:
lending_co_data_numeric

In [70]:
arr = np.array([[1, 2, 3, 4, 5, 500],
                [4, 5, 6, 7, 8, 800],
                [400, 59, 26, 7, 8, 800],
                [34, 533, 6, 7, 8, 800]])
arr1=np.delete(arr, [0, 2], axis=0)

print(arr1)

# By setting an axis, we can simultaneously delete entire rows or columns. 
# [0, 2, 4]:
# A list of column indices to delete. Here, columns at indices 0, 2, and 4 are targeted.

[[  4   5   6   7   8 800]
 [ 34 533   6   7   8 800]]


In [66]:
np.delete(np.delete(lending_co_data_numeric, [0,2,4] , axis = 1), [0,2,-1] , axis = 0)

# We can simultaneously delete rows AND columns. 

array([[   40.,  3061., 15041.],
       [   40.,  3041., 15321.],
       [   50.,  3470., 13720.],
       ...,
       [   40.,  4240., 16600.],
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.]])

## Sorting Data

In [2]:
x = np.array([2, 1, 4, 3, 5])
np.sort(x)

array([1, 2, 3, 4, 5])

### Sorting along rows or columns
A useful feature of NumPy's sorting algorithms is the ability to sort along specific rows or columns of a multidimensional array using the ``axis`` argument. For example:

In [3]:
# np.random.RandomState() constructs a random number generator. It does not have any effect
rand = np.random.RandomState(42)
# on the freestanding functions in np.random, but must be used explicitly:

X = rand.randint(0, 10, (4, 6))  # gives the start,end,vector dimension
print(X)

[[6 3 7 4 6 9]
 [2 6 7 4 3 7]
 [7 2 5 4 1 7]
 [5 1 4 0 9 5]]


In [4]:
# sort each column of X
np.sort(X, axis=0)

array([[2, 1, 4, 0, 1, 5],
       [5, 2, 5, 4, 3, 7],
       [6, 3, 7, 4, 6, 7],
       [7, 6, 7, 4, 9, 9]])

In [5]:
# sort each row of X
np.sort(X, axis=1)

array([[3, 4, 6, 6, 7, 9],
       [2, 3, 4, 6, 7, 7],
       [1, 2, 4, 5, 7, 7],
       [0, 1, 4, 5, 5, 9]])

In [6]:
np.sort(X, axis=None)
# Flatten the array and sort

array([0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7,
       9, 9])

## Partial Sorts: Partitioning

Sometimes we're not interested in sorting the entire array, but simply want to find the *k* smallest values in the array. NumPy provides this in the ``np.partition`` function. ``np.partition`` takes an array and a number *K*; the result is a new array with the smallest *K* values to the left of the partition, and the remaining values to the right, in arbitrary order:

numpy.partition() function is used to create a partitioned copy of input array with its elements rearranged in such a way that the value of the element in k-th position is in the position it would be in a sorted array. All elements smaller than the k-th element are moved before this element and all equal or greater are moved behind it. The ordering of the elements in the two partitions is undefined.

In [76]:
x = np.array([7, 2, 3, 1, 6, 5, 4])
np.partition(x, 5)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [None]:
np.partition(X, 2, axis=1)  # rows will be sorted

[[500   5   4   3   2   1]
 [800   8   7   6   5   4]
 [800 400  59  26   8   7]
 [800 533  34   8   7   6]]


In [86]:
lending_co_data_numeric

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [95]:
arr = np.array([[1, 2, 3, 4, 5, 500],
                [4, 5, 6, 7, 8, 800],
                [400, 59, 26, 7, 8, 800],
                [34, 533, 6, 7, 8, 800]])
# arr2 = arr.sort(axis = 0)# column sort
print(arr)
print("*"*5)
print(np.sort(arr,axis=1))
# The equivalent method stores the values in place. 

[[  1   2   3   4   5 500]
 [  4   5   6   7   8 800]
 [400  59  26   7   8 800]
 [ 34 533   6   7   8 800]]
*****
[[  1   2   3   4   5 500]
 [  4   5   6   7   8 800]
 [  7   8  26  59 400 800]
 [  6   7   8  34 533 800]]


## Argument Functions

### np.argsort()
# **`np.argsort()`**:

The **`np.argsort()`** function in NumPy is used to get the indices that would sort an array. Instead of returning the sorted values, it provides the positions of the elements in the sorted order.

---

## **Syntax**
```python
numpy.argsort(a, axis=-1, kind=None, order=None)


In [96]:
lending_co_data_numeric =  np.array([[1, 2, 3, 4, 5, 500],
                                          [4, 5, 6, 7, 8, 800],
                                          [400, 59, 26, 7, 8, 800],
                                          [34, 533, 6, 7, 8, 800]])

lending_co_data_numeric

array([[  1,   2,   3,   4,   5, 500],
       [  4,   5,   6,   7,   8, 800],
       [400,  59,  26,   7,   8, 800],
       [ 34, 533,   6,   7,   8, 800]])

In [97]:
np.argsort(lending_co_data_numeric) # rows

# np.argsort() is a function in NumPy that returns the indices that would sort an array.
#  In other words, it does not sort the array itself, but instead, 
# it provides the indices that can be used to sort the array.

# The resulting array from np.argsort() is a 1D array of indices,
#  and these indices represent the order in which the original elements would 
# need to be arranged in ascending order.

# Returns the order which will sort the array. 

array([[0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5],
       [3, 4, 2, 1, 0, 5],
       [2, 3, 4, 0, 1, 5]])

In [98]:
np.sort(lending_co_data_numeric, axis = 0)

array([[  1,   2,   3,   4,   5, 500],
       [  4,   5,   6,   7,   8, 800],
       [ 34,  59,   6,   7,   8, 800],
       [400, 533,  26,   7,   8, 800]])

In [99]:
np.argsort(lending_co_data_numeric, axis = 0)
#  For each column, np.argsort(axis=0) returns the row indices
#  that would arrange the column elements in ascending order.

array([[0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1],
       [3, 2, 3, 2, 2, 2],
       [2, 3, 2, 3, 3, 3]])

In [None]:
lending_co_data_numeric[482,5]

In [None]:
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
lending_co_data_numeric

# Sorts the array based on the values in the 1st column. 

In [None]:
lending_co_data_numeric.argsort(axis = 0)

# The method doesn't sort in place. 
# For each column, np.argsort(axis=0) returns the row indices
#  that would arrange the column elements in ascending order.

In [None]:
lending_co_data_numeric

### np.argwhere()
# **`np.argwhere()`**: Explanation

The **`np.argwhere()`** function in NumPy is used to find the indices of elements in an array that satisfy a given condition. It returns the indices of the matching elements in a structured format.

---



In [102]:
lending_co_data_numeric = np.array([[1, 2, 3, 4, 5, 500],
                                          [4, 5, 6, 7, 8, 800],
                                          [400, np.nan, 0, 7, 8, 800],
                                          [34, 533, 6, 7, 8, 800]])

lending_co_data_numeric

array([[  1.,   2.,   3.,   4.,   5., 500.],
       [  4.,   5.,   6.,   7.,   8., 800.],
       [400.,  nan,   0.,   7.,   8., 800.],
       [ 34., 533.,   6.,   7.,   8., 800.]])

In [103]:
np.argwhere(lending_co_data_numeric == False)

# Default condition is to return values are false 0. 

array([[2, 2]])

In [None]:
lending_co_data_numeric[430]

In [None]:
lending_co_data_numeric

In [104]:
np.argwhere(lending_co_data_numeric %2 == 0)

# The condition can be more complex 

array([[0, 1],
       [0, 3],
       [0, 5],
       [1, 0],
       [1, 2],
       [1, 4],
       [1, 5],
       [2, 0],
       [2, 2],
       [2, 4],
       [2, 5],
       [3, 0],
       [3, 2],
       [3, 4],
       [3, 5]])

In [None]:
np.isnan(lending_co_data_numeric).sum()

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';') 
lending_co_data_numeric_NAN

In [106]:
lending_co_data_numeric = np.array([[1, 2, 3, 4, 5, 500],
                                    [4, 5, 6, 7, 8, 800],
                                    [400, np.nan, 0, 7, 8, 800],
                                    [34, 533, 6, 7, 8, 800]])
np.argwhere(np.isnan(lending_co_data_numeric))



# Returns the coordinates of all the missing values within the array. 

array([[2, 1]])

In [107]:
lending_co_data_numeric_NAN[175]

array([ 2000., 64002., 64002.,  1851.,  3051., 13561.])

In [None]:
# for array_index in np.argwhere(np.isnan(lending_co_data_numeric_NAN)):
#     lending_co_data_numeric_NAN[array_index[0], array_index[1]] = 0

## By going through the coordinates of all the mising values of the array, we can fill them up. 

In [None]:
lending_co_data_numeric_NAN[175]

In [None]:
np.isnan(lending_co_data_numeric_NAN).sum()

## Shuffling Data

Shuffling data is a common step in data preprocessing to ensure that the order of the data does not influence the outcome of your model or analysis. NumPy provides tools for shuffling arrays easily.

---

## **Shuffling with `np.random.shuffle()`**

### **Syntax**
```python
np.random.shuffle(arr)


In [7]:
import numpy as np

# Create a 1D array
arr = np.array([1, 2, 3, 4, 5])

# Shuffle the array
np.random.shuffle(arr)

print(arr)

[1 3 4 2 5]


In [8]:
import numpy as np

# Create a 2D array
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# Shuffle the array
np.random.shuffle(arr)

print(arr) #Output (rows shuffled, columns intact):




[[4 5 6]
 [7 8 9]
 [1 2 3]]


You can transpose the array, shuffle the rows of the transposed array (which correspond to the columns of the original array), and then transpose it back.


In [9]:
import numpy as np

# Create a 2D array
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# Shuffle the columns
np.random.shuffle(arr.T)

print(arr)

[[3 1 2]
 [6 4 5]
 [9 7 8]]


## Casting

In [111]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [112]:
lending_co_data_numeric.astype(dtype = np.int32)

# Creates an integer version of the array. 

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

In [113]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = str)

# We need to overwrite the variable in order to work with strings. 

In [114]:
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [None]:
type(lending_co_data_numeric)

In [115]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.float32)
lending_co_data_numeric.astype(dtype = np.int32)

## We can't directly cast strings to integers. We can go through floats (string -> float -> integer).

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

In [117]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = str)
lending_co_data_numeric

# To showcase the other way to go from strings to integers,
#  we need to get the strings version of the array once again. 

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [None]:
lending_co_data_numeric.astype(dtype = np.float32).astype(dtype = np.int32)
lending_co_data_numeric

## We can chain methods in NumPy.

## Stripping Data

In [None]:
import numpy
lending_co_total_price = np.genfromtxt("Lending-Company-Total-Price.csv",
                                       delimiter = ',',
                                       dtype = str,
                                       skip_header = 1, 
                                       usecols = [1,2,4])
lending_co_total_price

# We don't neeed the entire array. We only want a few columns to showcase how stripping data works.

In [118]:
lending_co_total_price[:,0] = np.chararray.strip(lending_co_total_price[:,0], "id_")
lending_co_total_price[:,1] = np.chararray.strip(lending_co_total_price[:,1], "Product ")
lending_co_total_price[:,2] = np.chararray.strip(lending_co_total_price[:,2], "Location ")
lending_co_total_price

# Remove "id_" from the 1st column, as well as "Product " from the second and "Location " from the third one. 

  lending_co_total_price[:,0] = np.chararray.strip(lending_co_total_price[:,0], "id_")
  lending_co_total_price[:,1] = np.chararray.strip(lending_co_total_price[:,1], "Product ")
  lending_co_total_price[:,2] = np.chararray.strip(lending_co_total_price[:,2], "Location ")


array([['1', 'B', '2'],
       ['2', 'B', '3'],
       ['3', 'C', '5'],
       ...,
       ['413', 'B', '135'],
       ['414', 'C', '200'],
       ['415', 'A', '8']], dtype='<U12')

## Unique 

In [128]:
import numpy as np

lending_co_data_numeric = np.array([
    [1, 2, 3],
    [4, 2, 6],
    [7, 8, 9],
    [10, 8, 12],
    [13, 2, 15]
])

In [129]:
unique_values, indices, counts = np.unique(
    lending_co_data_numeric[:, 1], return_counts=True, return_index=True)
print("Unique Values:", unique_values)
print("Indices:", indices)
print("Counts:", counts)


# Unique -> returns the unique values within the array in increasing order

# return_counts -> returns how many times each unique value appears in the array

# return_index -> returns the index of the first encounter with each unique value

Unique Values: [2 8]
Indices: [0 2]
Counts: [3 2]


In [130]:
array_example = np.array(["a1", "a3","A1","A3","A3","AA1","B1","A2","B1","A2","B2","B2", "B3","a2","a3","B3","B3","a3" ])
np.unique(array_example)

# If the values of the array are text, the unique function sorts them in "alphabetical" order by their ASCII codes. 

array(['A1', 'A2', 'A3', 'AA1', 'B1', 'B2', 'B3', 'a1', 'a2', 'a3'],
      dtype='<U3')