## Dimension Reduction

In [1]:
import numpy as np
from sklearn.decomposition import PCA
# Accept simple array as well as numpy array
# X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
X = [[-1, 2], 
     [2, 6], 
     [2, 10], 
     [1, 18]]
X

[[-1, 2], [2, 6], [2, 10], [1, 18]]

In [2]:
np.array(X).shape

(4, 2)

In [3]:
pca = PCA(n_components=2)
pca.fit(X)

In [4]:
print(pca.explained_variance_)       # The amount of variance explained by each of the selected components.

[47.0220481   1.64461857]


In [5]:
print(pca.explained_variance_ratio_) # Percentage of variance explained by each of the selected components.

[0.96620647 0.03379353]


In [6]:
#pca_X = pca.fit_transform(X) # Equal to fit and then transform..
pca_X = pca.transform(X) #  # The method fit already called above, now using the already computed model.
pca_X

array([[-7.14952879,  1.37267553],
       [-2.89973262, -1.26156678],
       [ 1.08457323, -0.90757969],
       [ 8.96468817,  0.79647093]])

In [7]:
## Calculating Standard Deviation and Variance using NumPy
## pca_X[:,0].std()
print(np.var(pca_X[:,0], ddof=1))
print(np.var(pca_X[:,1], ddof=1))

47.0220481003222
1.644618566344487


In [8]:
X = pca.inverse_transform(pca_X)
X

array([[-1.,  2.],
       [ 2.,  6.],
       [ 2., 10.],
       [ 1., 18.]])

### svd_solver{‘auto’, ‘full’, ‘arpack’, ‘randomized’}, default=’auto’
### Singular Value Decomposition (SVD) 

    If auto :
    The solver is selected by a default policy based on X.shape and n_components: if the input data is larger 
    than 500x500 and the number of components to extract is lower than 80% of the smallest dimension of the 
    data, then the more efficient ‘randomized’ method is enabled. Otherwise the exact full SVD is computed and 
    optionally truncated afterwards.
    
    If full :
    run exact full SVD calling the standard LAPACK solver via scipy.linalg.svd and select the components by 
    postprocessing
    
    If arpack :
    run SVD truncated to n_components calling ARPACK solver via scipy.sparse.linalg.svds. It requires strictly 0 
    < n_components < min(X.shape)
    
    If randomized :
        run randomized SVD by the method of Halko et al.


In [9]:
pca = PCA(n_components=2)
pca.fit(X)
pca_X = pca.transform(X)
pca_X

array([[-7.14952879,  1.37267553],
       [-2.89973262, -1.26156678],
       [ 1.08457323, -0.90757969],
       [ 8.96468817,  0.79647093]])

In [10]:
#If svd_solver='full' then default value for n_components = min(n_samples, n_features)

#X = [[-1, 2]]
pca = PCA(n_components=1, svd_solver='full')
pca.fit(X)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

pca_X = pca.transform(X)
pca_X

[47.0220481]
[0.96620647]


array([[-7.14952879],
       [-2.89973262],
       [ 1.08457323],
       [ 8.96468817]])

In [11]:
#If svd_solver='auto' 
pca = PCA(n_components=2, svd_solver='auto')
pca.fit(X)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

pca_X = pca.transform(X)
pca_X

[47.0220481   1.64461857]
[0.96620647 0.03379353]


array([[-7.14952879,  1.37267553],
       [-2.89973262, -1.26156678],
       [ 1.08457323, -0.90757969],
       [ 8.96468817,  0.79647093]])

In [12]:
pca = PCA(n_components=1, svd_solver='arpack') #  Must be strictly less than min(n_samples, n_features)=2 
pca.fit(X)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

pca_X = pca.transform(X)
pca_X

[47.0220481]
[0.96620647]


array([[-7.14952879],
       [-2.89973262],
       [ 1.08457323],
       [ 8.96468817]])

## Transformations

### One Hot Encoding

In [13]:
# In case we have Gender values i.e. 'Male' and 'Female' Then 
# One hot Encoding for Female = 1 0 and Male = 0 1

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')# If handle_unknown='ignore' then sample which is missing 
                                            # during training, needed to tranform, results all bits zeros.
X = [['Male', 1], 
     ['Female', 3], 
     ['Female', 2]]
enc.fit(X)

In [14]:
# Shows Two Categories 
# Category-1 (Gender Column) => ['Male', 'Female'] and
# Category-2 (Number Column) => [1, 2, 3]
enc.categories_


[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

In [15]:
# Here we are transforming the following two records to one hot encoding
# As One Hot Encoding For Gender Column: Female => 1 0, and For Male => 0 1
# As One Hot Encoding For Number Column: 1 => 1 0 0, For 2 => 0 1 0 and For 3 => 0 0 1

enc.transform([['Female', 2], ['Male', 4]]).toarray()

# Thus ['Female', 1] = 1 0 1 0 0 As Female = 1 0, and 1 = 1 0 0
# While ['Male', 4]  = 0 1 0 0 0 As Male   = 0 1, and 4 = 0 0 0 (As 4 is missing while fitting)


array([[1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.]])

In [16]:
enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0], [1, 0, 0, 0, 0]])

array([['Male', 1],
       [None, 2],
       ['Female', None]], dtype=object)

In [17]:
# Detault Featuers with a default prefixes are as;
#enc.get_feature_names()      # Deprecated
enc.get_feature_names_out()

array(['x0_Female', 'x0_Male', 'x1_1', 'x1_2', 'x1_3'], dtype=object)

In [18]:
# Adding custom prefixes to the features, gender prefix is added for Male and Female and group for numbers
enc.get_feature_names(['gender', 'group'])

AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names'

### One can always drop the first column for each Feature/ Category:

In [19]:
# To drop first column use drop="first", In this case all bits will be considered zeros for first value.
# When the drop parameter is specified, handle_unknown='ignore' throws error, as in this case conflict 
# occurs b/w "Missing values" and "First value", as all bits will be considered zeros for both.

#drop_en = OneHotEncoder(drop='first', handle_unknown='ignore') # Will give you warning during transform
drop_en = OneHotEncoder(drop='first')

X = [['Male', 1], ['Female', 3], ['Female', 2]]
drop_en.fit(X)
drop_en.categories_


[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

In [20]:
#drop_en.transform([['Female', 4]]).toarray() # Not Allowed, as 4 is unknown category in this case

# Here ['Male', 4] will create error, as 4 is not in the dataset, further, the drop parameter is used, 
# which does not allow unknown attributes, as all zeros bits are already assigned to first option


# One Hot encode value for Female = 0 and Male = 1
# One Hot encode value for decimal 1 = 0 0, For 2 = 1 0, and For 3 = 0 1
drop_en.transform([['Female', 1], ['Male', 2], ['Male', 3]]).toarray()


array([[0., 0., 0.],
       [1., 1., 0.],
       [1., 0., 1.]])

In [21]:
#drop_en.get_feature_names()   # Deprecated
drop_en.get_feature_names_out()

array(['x0_Male', 'x1_2', 'x1_3'], dtype=object)

In [22]:
drop_en.inverse_transform([[1, 0, 1], [0, 1, 0], [0, 0, 0]])

array([['Male', 3],
       ['Female', 2],
       ['Female', 1]], dtype=object)

###  Drop a column for feature only having 2 columns or dictinct values (Binary)

* This works only with latest version of scikit-learn (0.23 and above)
* To install latest version, run command below
* pip intall scikit-learn == 0.23


In [23]:
# Works same as drop='first', but it is automatic, if a column has binary values, then drop option will 
# be applied, otherwise if not binary values then drop will not be applied

drop_binary_enc = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
X = [['Male', 1], ['Female', 3], ['Female', 2]]

In [24]:
drop_binary_enc.fit(X)
drop_binary_enc.transform([['Female', 1], ['Male', 2], ['None', 3]]).toarray()



array([[0., 1., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [25]:
drop_binary_enc.inverse_transform([[1, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]) 

array([['Male', 2],
       ['Female', 1],
       ['Female', 3]], dtype=object)

In [28]:
drop_binary_enc.inverse_transform([[0, 0, 0, 0]]) 
#Inverse for [0, 0, 0, 0] will not work, as to which last three zeros will be replaced, as infinit possibilities 


array([['Female', None]], dtype=object)

In [29]:
import sklearn
sklearn.__version__

'1.4.1.post1'

### Normalization
    1. Min-Max Normalization                   [0 , 1]
    2. Mean Normalization                      [-0.5 , 0.5] # Implemenation Missing here in sklearn
    3. Z-score Normalization (Standarization)  [-3 , 3]
    4. Max-Absolute Normalization              [-1, 1]

#### 1. Min-Max Normalization [0,1]

In [30]:
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], 
        [2, 6], 
        [0, 10], 
        [1, 18]]
data

[[-1, 2], [2, 6], [0, 10], [1, 18]]

In [31]:
scaler = MinMaxScaler()
print(scaler.fit(data))


MinMaxScaler()


In [32]:
print("Maximum value in both dimensions: ", scaler.data_max_)
print("Minimum value in both dimensions: ", scaler.data_min_)


Maximum value in both dimensions:  [ 2. 18.]
Minimum value in both dimensions:  [-1.  2.]


In [33]:
print(scaler.transform(data))

[[0.         0.        ]
 [1.         0.25      ]
 [0.33333333 0.5       ]
 [0.66666667 1.        ]]


In [34]:
print(scaler.transform([[1, 18]]))

[[0.66666667 1.        ]]


In [None]:
print(scaler.transform([[2, 2]]))

#### 2. Mean Normalization

####   3. Z-score Normalization Majority values will be in range [-3 , 3]

In [139]:
from sklearn.preprocessing import StandardScaler
data = [[-1, 2], 
        [2, 6], 
        [0, 10], 
        [1, 18]]

scaler = StandardScaler(with_mean=True, with_std=True)

In [140]:
print(scaler.fit(data))


StandardScaler()


In [141]:
print(scaler.mean_)

[0.5 9. ]


In [142]:
print(scaler.transform(data))

[[-1.34164079 -1.18321596]
 [ 1.34164079 -0.50709255]
 [-0.4472136   0.16903085]
 [ 0.4472136   1.52127766]]


#### 3. Max-Absolute Normalization [-1, 1]
##### Ensures range [-1, 1] i.e. [1, 2, 3, 4, 5, -10] => The max is obtained after applying absolute i.e. |X|, here it is |-10| = 10

In [146]:
from sklearn.preprocessing import MaxAbsScaler

data = [[-1, 2], 
        [2, 6], 
        [0, 10], 
        [1, 18]]

max_abs_scaler = MaxAbsScaler()
maxabs = max_abs_scaler.fit_transform(data)
print(maxabs)

[[-0.5         0.11111111]
 [ 1.          0.33333333]
 [ 0.          0.55555556]
 [ 0.5         1.        ]]


In [147]:
data_test = [[ -1, 18]]
X_test_maxabs = max_abs_scaler.transform(data_test)
X_test_maxabs


array([[-0.5,  1. ]])

In [148]:
import numpy as np

In [149]:
a = np.array([
    [1,2],    
    [2,3,3] ])

  a = np.array([


In [150]:
a.shape

(2,)