# Scaling (for numerical data)

In [1]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

In [2]:
# Read the outlier-free dataset from disk and preview its first rows.
csv_path = 'no_outliers/data_nooutliers1.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,...,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,0,0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,...,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,1,1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,...,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,2,2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,...,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,3,3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,...,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,4,4,200004,0,32.0,Self Enquiry,1,8.0,Small Business,Male,...,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [3]:
# Inspect the column labels of the loaded frame.
data.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'CustomerID', 'ProdTaken', 'Age',
       'TypeofContact', 'CityTier', 'DurationOfPitch', 'Occupation', 'Gender',
       'NumberOfPersonVisiting', 'NumberOfFollowups', 'ProductPitched',
       'PreferredPropertyStar', 'MaritalStatus', 'NumberOfTrips', 'Passport',
       'PitchSatisfactionScore', 'OwnCar', 'NumberOfChildrenVisiting',
       'Designation', 'MonthlyIncome'],
      dtype='object')

In [4]:
# Drop the two 'Unnamed' columns (presumably index columns saved by earlier
# CSV exports -- confirm against the step that wrote the file).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns
# are gone. `axis=1` was redundant next to `columns=` and is omitted.
data = data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [5]:
# Display the frame after the drop; pandas truncates long frames to the
# leading and trailing rows.
data

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,32.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4874,204883,1,49.0,Self Enquiry,3,9.0,Small Business,Male,3,5.0,Deluxe,4.0,Unmarried,2.0,1,1,1,1.0,Manager,26576.0
4875,204884,1,28.0,Company Invited,1,31.0,Salaried,Male,4,5.0,Basic,3.0,Single,3.0,1,3,1,2.0,Executive,21212.0
4876,204885,1,52.0,Self Enquiry,3,17.0,Salaried,Female,4,4.0,Standard,4.0,Married,7.0,0,1,1,3.0,Senior Manager,31820.0
4877,204886,1,19.0,Self Enquiry,3,16.0,Small Business,Male,3,4.0,Basic,3.0,Single,3.0,0,5,0,2.0,Executive,20289.0


## Scaling Methods:
1. Absolute Maximum Scaling
2. Min-Max Scaling
3. Normalization
4. Standardization
5. Robust Scaling

In [11]:
# Columns that hold discrete codes / flags rather than continuous quantities.
# They are cast to object dtype so the numeric-column selection that follows
# excludes them from scaling.
categorical_like_columns = [
    'CityTier',
    'NumberOfPersonVisiting',
    'PreferredPropertyStar',
    'Passport',
    'PitchSatisfactionScore',
    'OwnCar',
    'ProdTaken',
]

# Cast each column from ITSELF. The previous version assigned
# data['CityTier'].astype('O') to every column, which silently overwrote the
# other six columns (including the ProdTaken target) with CityTier values.
for column in categorical_like_columns:
    data[column] = data[column].astype('O')

In [12]:
# Keep only the columns whose dtype is not object, i.e. the ones that were not
# marked as categorical.
# NOTE(review): CustomerID passes this filter as well although it looks like an
# identifier rather than a feature -- confirm whether it should be scaled.
non_object_columns = [column for column in data.columns if data[column].dtype != 'O']
num_data = data[non_object_columns]

In [13]:
# Preview the first rows of the non-object (numeric) subset.
num_data.head()

Unnamed: 0,CustomerID,Age,DurationOfPitch,NumberOfFollowups,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
0,200000,41.0,6.0,3.0,1.0,0.0,20993.0
1,200001,49.0,14.0,4.0,2.0,2.0,20130.0
2,200002,37.0,8.0,4.0,7.0,0.0,17090.0
3,200003,33.0,9.0,3.0,2.0,1.0,17909.0
4,200004,32.0,8.0,3.0,1.0,0.0,18468.0


In [14]:
# Summarise dtypes and non-null counts of the numeric subset.
num_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4879 entries, 0 to 4878
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4879 non-null   int64  
 1   Age                       4879 non-null   float64
 2   DurationOfPitch           4879 non-null   float64
 3   NumberOfFollowups         4879 non-null   float64
 4   NumberOfTrips             4879 non-null   float64
 5   NumberOfChildrenVisiting  4879 non-null   float64
 6   MonthlyIncome             4879 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 266.9 KB


## Absolute maximum scaling

In [15]:
# Absolute maximum scaling: divide every value by the column's largest
# absolute value, so the scaled values lie in [-1, 1].
# The previous formula, (x - min) / (max - min), is min-max scaling, which is
# a separate method in this notebook's list of scaling methods.
y1 = num_data.Age
y2 = num_data.MonthlyIncome

# Vectorised Series.abs().max() replaces the Python builtins min()/max(),
# which iterate over the Series element by element.
y1_new = y1 / y1.abs().max()
y2_new = y2 / y2.abs().max()