# COURSE 4 / DIMENTIONALITY REDUCTION

## Importing Necessary Libraries

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action = 'ignore')

## Loading the Dataset

In [2]:
# Reading the dataset
data = pd.read_csv('pca_dataset.csv')

## Exploring the dataset

In [3]:
# Finding the number of rows and columns
data.shape

(550068, 12)

In [4]:
# Viewing the first five rows
data.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [5]:
# Checking for missing values
data.isnull().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

### Two variables have missing values. Let's check the percentage of missing value using the missing value ratio. This can help us in deciding whether to drop that variable or not.

In [6]:
# Calculating the missing value ratio
data.isnull().sum()/len(data)*100

User_ID                        0.000000
Product_ID                     0.000000
Gender                         0.000000
Age                            0.000000
Occupation                     0.000000
City_Category                  0.000000
Stay_In_Current_City_Years     0.000000
Marital_Status                 0.000000
Product_Category_1             0.000000
Product_Category_2            31.566643
Product_Category_3            69.672659
Purchase                       0.000000
dtype: float64

### Product_Category_2 have 31% of missing values whereas Product_Category_3 have 69%  which is quite high. We can drop that variable.

In [7]:
data['Product_ID'].value_counts()

P00265242    1880
P00025442    1615
P00110742    1612
P00112142    1562
P00057642    1470
             ... 
P00314842       1
P00126742       1
P00135942       1
P00275042       1
P00107442       1
Name: Product_ID, Length: 3631, dtype: int64

In [8]:
data['User_ID'].value_counts()

1001680    1026
1004277     979
1001941     898
1001181     862
1000889     823
           ... 
1005391       7
1002690       7
1002111       7
1005608       7
1000708       6
Name: User_ID, Length: 5891, dtype: int64

### The 'Product_ID' and 'User_ID' does not contribute any useful information. Thus we can drop those variables.

In [9]:
data = data.drop(columns=['User_ID','Product_ID','Product_Category_3'],axis=1)

In [10]:
data['Product_Category_2'].value_counts()

8.0     64088
14.0    55108
2.0     49217
16.0    43255
15.0    37855
5.0     26235
4.0     25677
6.0     16466
11.0    14134
17.0    13320
13.0    10531
9.0      5693
12.0     5528
10.0     3043
3.0      2884
18.0     2770
7.0       626
Name: Product_Category_2, dtype: int64

### We can impute the Product_Category_2 with the median which is the desirable method. 

In [11]:
data['Product_Category_2'].median()

9.0

In [12]:
data['Product_Category_2'].fillna(data['Product_Category_2'].median(), inplace=True)

In [13]:
data.isnull().sum()

Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Purchase                      0
dtype: int64

### We have successfully imputed the missing values.

In [14]:
data.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Purchase
0,F,0-17,10,A,2,0,3,9.0,8370
1,F,0-17,10,A,2,0,1,6.0,15200
2,F,0-17,10,A,2,0,12,9.0,1422
3,F,0-17,10,A,2,0,12,14.0,1057
4,M,55+,16,C,4+,0,8,9.0,7969


### Let's check for the categorical variables in the dataset.

In [15]:
data.dtypes

Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Purchase                        int64
dtype: object

### 'Gender','Age', 'City_Category','Stay_In_Current_City_Years' are all Categorical variables. Thus encoding is necessary. Here we are using Label Encoding since One-Hot Encoding would increase the dimension of the dataset drastically.

In [16]:
# Importing the library and creating an instance
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [17]:
data['Gender'].value_counts()

M    414259
F    135809
Name: Gender, dtype: int64

In [18]:
data['Gender']= data['Gender'].map({'M':0,'F':1})

In [19]:
data['Age'].value_counts()

26-35    219587
36-45    110013
18-25     99660
46-50     45701
51-55     38501
55+       21504
0-17      15102
Name: Age, dtype: int64

In [20]:
data['Age']=data['Age'].map({'0-17':0,'18-25':1,'26-35':2,'36-45':3,'46-50':4,'51-55':5,'55+':6})

In [21]:
data['City_Category'].value_counts()

B    231173
C    171175
A    147720
Name: City_Category, dtype: int64

In [22]:
data['City_Category']= data['City_Category'].map({'A':0,'B':1,'C':2})

In [23]:
data['Stay_In_Current_City_Years'].value_counts()

1     193821
2     101838
3      95285
4+     84726
0      74398
Name: Stay_In_Current_City_Years, dtype: int64

In [24]:
data['Stay_In_Current_City_Years']=data['Stay_In_Current_City_Years'].map({'0':0,'1':1,'2':2,'3':3,'4+':4})

In [25]:
data.dtypes

Gender                          int64
Age                             int64
Occupation                      int64
City_Category                   int64
Stay_In_Current_City_Years      int64
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Purchase                        int64
dtype: object

### All the categorical variables have been successfully converted into numerical.

In [26]:
# Checking the five rows after encoding
data.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Purchase
0,1,0,10,0,2,0,3,9.0,8370
1,1,0,10,0,2,0,1,6.0,15200
2,1,0,10,0,2,0,12,9.0,1422
3,1,0,10,0,2,0,12,14.0,1057
4,0,6,16,2,4,0,8,9.0,7969


### To find the correlation between the independant variables, we need to drop the target variable.

In [27]:
# Dropping the target variable
df = data.drop('Purchase',axis=1)

### Finding the Correlation matrix

In [28]:
# Calculalting the correlation between independent variables
df.corr()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
Gender,1.0,0.004262,-0.117291,0.004515,-0.01466,0.011603,0.045594,0.014051
Age,0.004262,1.0,0.091463,0.123079,-0.004712,0.311738,0.061197,0.043503
Occupation,-0.117291,0.091463,1.0,0.034479,0.030005,0.02428,-0.007618,0.000557
City_Category,0.004515,0.123079,0.034479,1.0,0.019946,0.03979,-0.014364,-0.006888
Stay_In_Current_City_Years,-0.01466,-0.004712,0.030005,0.019946,1.0,-0.012819,-0.004213,-0.001087
Marital_Status,0.011603,0.311738,0.02428,0.03979,-0.012819,1.0,0.019888,0.011526
Product_Category_1,0.045594,0.061197,-0.007618,-0.014364,-0.004213,0.019888,1.0,0.331691
Product_Category_2,0.014051,0.043503,0.000557,-0.006888,-0.001087,0.011526,0.331691,1.0


### Finding the absolute values of the elements of the matrix since a positive or negative doesn't impact the relation between them. 

In [29]:
# Creating correlation matrix
corr_matrix = df.corr().abs()

### Since the correlation matrix is symmetric, we will see the upper triangle only for easy understanding.

In [30]:
# Selecting upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

In [31]:
upper

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
Gender,,0.004262,0.117291,0.004515,0.01466,0.011603,0.045594,0.014051
Age,,,0.091463,0.123079,0.004712,0.311738,0.061197,0.043503
Occupation,,,,0.034479,0.030005,0.02428,0.007618,0.000557
City_Category,,,,,0.019946,0.03979,0.014364,0.006888
Stay_In_Current_City_Years,,,,,,0.012819,0.004213,0.001087
Marital_Status,,,,,,,0.019888,0.011526
Product_Category_1,,,,,,,,0.331691
Product_Category_2,,,,,,,,


### 'Age' and 'Marital Status' seems to have a correlation of 0.311 
### 'Product_Category_1' and 'Product_Category_2' seems to have a correlation of 0.331
### All the remaining variables have only negligible correlation.

In [32]:
data.shape

(550068, 9)

## PCA IMPLEMENTATION

### 1. Seperating the dependant and the independant variables into 2 dataframes

In [33]:
#seperating independent and dependent variables
x = data.drop(['Purchase'], axis=1)
y = data['Purchase']
x.shape, y.shape

((550068, 8), (550068,))

In [34]:
x.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
0,1,0,10,0,2,0,3,9.0
1,1,0,10,0,2,0,1,6.0
2,1,0,10,0,2,0,12,9.0
3,1,0,10,0,2,0,12,14.0
4,0,6,16,2,4,0,8,9.0


In [35]:
y.head()

0     8370
1    15200
2     1422
3     1057
4     7969
Name: Purchase, dtype: int64

### 2. The first step in PCA is to standardize the dataset. Here I'am using the MinMAx scaler which converts all entries between 0 and 1.

In [36]:
# Importing MinMax Scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)

In [37]:
# Converting back into a dataframe
x = pd.DataFrame(x_scaled)

In [38]:
x.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,0.0,0.5,0.0,0.5,0.0,0.105263,0.4375
1,1.0,0.0,0.5,0.0,0.5,0.0,0.0,0.25
2,1.0,0.0,0.5,0.0,0.5,0.0,0.578947,0.4375
3,1.0,0.0,0.5,0.0,0.5,0.0,0.578947,0.75
4,0.0,1.0,0.8,1.0,1.0,0.0,0.368421,0.4375


### 3.Finding the Covariance matrix

In [39]:
covariance_matrix = np.cov(data.T)

In [40]:
covariance_matrix

array([[ 1.85938161e-01,  2.48769994e-03, -3.29893421e-01,
         1.47996414e-03, -8.15141229e-03,  2.46049537e-03,
         7.73878331e-02,  2.56042438e-02, -1.30707141e+02],
       [ 2.48769994e-03,  1.83231859e+00,  8.07553720e-01,
         1.26654108e-01, -8.22457632e-03,  2.07516645e-01,
         3.26069345e-01,  2.48858988e-01,  1.07692598e+02],
       [-3.29893421e-01,  8.07553720e-01,  4.25450998e+01,
         1.70966155e-01,  2.52356407e-01,  7.78816068e-02,
        -1.95577573e-01,  1.53416510e-02,  6.82554656e+02],
       [ 1.47996414e-03,  1.26654108e-01,  1.70966155e-01,
         5.77920280e-01,  1.95518317e-02,  1.48756227e-02,
        -4.29823154e-02, -2.21286931e-02,  2.36424706e+02],
       [-8.15141229e-03, -8.22457632e-03,  2.52356407e-01,
         1.95518317e-02,  1.66266210e+00, -8.12851028e-03,
        -2.13836697e-02, -5.92165240e-03,  3.51199606e+01],
       [ 2.46049537e-03,  2.07516645e-01,  7.78816068e-02,
         1.48756227e-02, -8.12851028e-03,  2.418378

### 4. Finding the eigen_values and eigen_vectors

In [41]:
eigen_values,eigen_vectors = np.linalg.eig(covariance_matrix)

In [71]:
eigen_values

[25231188.24117388,
 42.54791402783445,
 20.540189356522717,
 10.555722156684334,
 0.18241100622170725,
 0.2152456667275663,
 0.5628305672179182,
 1.8432978345674969,
 1.66063893553718]

In [72]:
eigen_vectors

array([[ 5.18038104e-06, -7.69978584e-03,  1.57412816e-03,
        -2.87309134e-03,  9.98312054e-01,  5.64718161e-02,
         8.35624658e-03,  5.60924689e-03, -3.55919044e-03],
       [-4.26822957e-06,  1.98112002e-02,  2.20990646e-02,
        -1.71279587e-02,  2.80815291e-03, -1.28013162e-01,
        -9.53422375e-02,  9.84442806e-01,  6.49056579e-02],
       [-2.70520686e-05,  9.99735104e-01, -4.46934181e-03,
        -1.73872083e-03,  7.75494821e-03,  1.00638242e-03,
        -1.87661694e-03, -1.96243996e-02, -7.54531477e-03],
       [-9.37033636e-06,  3.98344964e-03,  1.05786514e-03,
        -1.45962061e-03, -8.89761780e-03,  2.55913855e-03,
         9.95131755e-01,  9.50194932e-02,  2.40676373e-02],
       [-1.39192711e-06,  6.14606311e-03, -4.79416711e-04,
         9.98844498e-04,  3.85367739e-03,  4.92338209e-03,
        -1.77340946e-02, -6.69542931e-02,  9.97559288e-01],
       [ 4.53660679e-08,  1.94113181e-03,  2.23300029e-03,
        -2.14141939e-03, -5.65779456e-02,  9.901471

In [73]:
eigen_sorted = pd.DataFrame({'Eigen_values':eigen_values},data.columns)

In [74]:
eigen_sorted

Unnamed: 0,Eigen_values
Gender,25231190.0
Age,42.54791
Occupation,20.54019
City_Category,10.55572
Stay_In_Current_City_Years,0.182411
Marital_Status,0.2152457
Product_Category_1,0.5628306
Product_Category_2,1.843298
Purchase,1.660639


In [75]:
eigen_values= [abs(number)for number in eigen_values ]

In [76]:
eigen_values

[25231188.24117388,
 42.54791402783445,
 20.540189356522717,
 10.555722156684334,
 0.18241100622170725,
 0.2152456667275663,
 0.5628305672179182,
 1.8432978345674969,
 1.66063893553718]

### From the eigen_values, it is inferred that 'Age' and 'Gender' are the two most important components.

### 5.Implementing the Linear Regression model

In [44]:
# Importing Train test split
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x,y, random_state=40)

### 

In [45]:
# Importing Linear Regression and metric mean square error
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_absolute_error as mae

In [46]:
# Creating instance of Linear Regresssion
lr = LR()

# Fitting the model
lr.fit(train_x, train_y)

LinearRegression()

In [47]:
# Predicting over the Train Set and calculating error
train_predict = lr.predict(train_x)
k = mae(train_predict, train_y)
print('Training Mean Absolute Error', k )

Training Mean Absolute Error 3593.3205604592727


In [48]:
# Predicting over the Test Set and calculating error
test_predict = lr.predict(test_x)
k = mae(test_predict, test_y)
print('Test Mean Absolute Error    ', k )

Test Mean Absolute Error     3591.148404914198


### The training and testing error are almost close to each other. 

### Since, we have only 8 features in our dataset now, lets try implementing PCA with 6 features and test the error.

In [49]:
from sklearn.decomposition import PCA

In [77]:
pca= PCA(6).fit(train_x)

In [78]:
# transform both training and testing data
X_train_transformed = pca.transform(train_x)
X_test_transformed = pca.transform(test_x)

In [79]:
x.shape

(550068, 8)

In [80]:
X_train_transformed.shape

(412551, 6)

In [81]:
lr.fit(X_train_transformed,train_y)

LinearRegression()

In [82]:
# predicting for the transformed test set
pred_2 = lr.predict(X_test_transformed)

In [83]:
# checking the training performance after transformation- Accuracy Sco
n= mae(lr.predict(X_train_transformed), train_y)

In [84]:
n

3817.81284410544

In [88]:
# checking the validation performance after transformation- Accuracy Score
n= mae(pred_2, test_y)

In [89]:
n

3813.7785206091853

### The error seems to have increased with 6 features than when the features were 8 .