In [1]:
# Importing Libraries
import pandas as pd

In [2]:
# Ignoring Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Importing Dataset
# Reads data.csv into a DataFrame; the path is relative to the working directory.
df = pd.read_csv('data.csv')

In [4]:
# Dimensions of Dataset as a (rows, columns) tuple
df.shape

(50882, 14)

In [5]:
# Number of missing values in each column of the raw dataset
df.isna().sum()

ID                             0
City_Code                      0
Region_Code                    0
Accomodation_Type              0
Reco_Insurance_Type            0
Upper_Age                      0
Lower_Age                      0
Is_Spouse                      0
Health_Indicator           11691
Holding_Policy_Duration    20251
Holding_Policy_Type        20251
Reco_Policy_Cat                0
Reco_Policy_Premium            0
Response                       0
dtype: int64

In [6]:
# Dropping the row identifier (ID, which has no nulls) together with the two
# holding-policy columns, each missing in 20251 of the 50882 rows.
# `columns=` already selects the column axis, so no `axis` argument is passed.
df = df.drop(columns=['ID', 'Holding_Policy_Duration', 'Holding_Policy_Type'])

In [7]:
# Number of missing values per column once the three columns are dropped
df.isna().sum()

City_Code                  0
Region_Code                0
Accomodation_Type          0
Reco_Insurance_Type        0
Upper_Age                  0
Lower_Age                  0
Is_Spouse                  0
Health_Indicator       11691
Reco_Policy_Cat            0
Reco_Policy_Premium        0
Response                   0
dtype: int64

In [8]:
# Discard every row that still contains at least one missing value
df = df.dropna(axis=0, how='any')

In [9]:
# Confirm that no column has missing values after the null-containing rows were removed
df.isna().sum()

City_Code              0
Region_Code            0
Accomodation_Type      0
Reco_Insurance_Type    0
Upper_Age              0
Lower_Age              0
Is_Spouse              0
Health_Indicator       0
Reco_Policy_Cat        0
Reco_Policy_Premium    0
Response               0
dtype: int64

In [10]:
# Dimensions of Dataset, as (rows, columns), after removing all Null containing Rows
df.shape

(39191, 11)

In [11]:
# First five rows of the cleaned data; the index still carries the original
# row labels, so gaps appear where rows were dropped
df.head()

Unnamed: 0,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health_Indicator,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,C3,3213,Rented,Individual,36,36,No,X1,22,11628.0,0
1,C5,1117,Owned,Joint,75,22,No,X2,22,30510.0,0
3,C24,4378,Owned,Joint,52,48,No,X1,19,17780.0,0
4,C8,2190,Rented,Individual,44,44,No,X2,16,10404.0,0
5,C9,1785,Rented,Individual,52,52,No,X2,22,15264.0,1


In [12]:
# Renumber the rows from 0 and discard the old index labels
df = df.reset_index(drop=True)

In [13]:
# First five rows after Resetting Index (row labels are now consecutive)
df.head()

Unnamed: 0,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health_Indicator,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,C3,3213,Rented,Individual,36,36,No,X1,22,11628.0,0
1,C5,1117,Owned,Joint,75,22,No,X2,22,30510.0,0
2,C24,4378,Owned,Joint,52,48,No,X1,19,17780.0,0
3,C8,2190,Rented,Individual,44,44,No,X2,16,10404.0,0
4,C9,1785,Rented,Individual,52,52,No,X2,22,15264.0,1


In [14]:
# Dataset Information: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39191 entries, 0 to 39190
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   City_Code            39191 non-null  object 
 1   Region_Code          39191 non-null  int64  
 2   Accomodation_Type    39191 non-null  object 
 3   Reco_Insurance_Type  39191 non-null  object 
 4   Upper_Age            39191 non-null  int64  
 5   Lower_Age            39191 non-null  int64  
 6   Is_Spouse            39191 non-null  object 
 7   Health_Indicator     39191 non-null  object 
 8   Reco_Policy_Cat      39191 non-null  int64  
 9   Reco_Policy_Premium  39191 non-null  float64
 10  Response             39191 non-null  int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 3.3+ MB


## Continuous Feature Selection

### Features Compared Against the Target (`Response`)
- City_Code, Region_Code
- Upper_Age, Lower_Age
- Health_Indicator, Reco_Policy_Cat, Reco_Policy_Premium

In [15]:
# Generating Dataset with the candidate feature columns plus the target (Response)
# NOTE: City_Code and Health_Indicator hold categorical codes (strings) here,
# not continuous values.
con_col_set = ['City_Code', 'Region_Code', 'Upper_Age', 'Lower_Age', 'Health_Indicator', 'Reco_Policy_Cat', 'Reco_Policy_Premium', 'Response']
# Take an explicit copy so that any later in-place edit of df_con works on an
# independent frame rather than on a selection of df, which is what triggers
# pandas' SettingWithCopyWarning.
df_con = df[con_col_set].copy()
df_con.head()

Unnamed: 0,City_Code,Region_Code,Upper_Age,Lower_Age,Health_Indicator,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,C3,3213,36,36,X1,22,11628.0,0
1,C5,1117,75,22,X2,22,30510.0,0
2,C24,4378,52,48,X1,19,17780.0,0
3,C8,2190,44,44,X2,16,10404.0,0
4,C9,1785,52,52,X2,22,15264.0,1


In [16]:
# Listing the distinct categories of each categorical column,
# in the order value_counts() reports them
cat_col_set = ['City_Code', 'Health_Indicator']
cols = cat_col_set
for col in cols:
    category_set = df[col].value_counts().index.tolist()
    print(f'{col} : {category_set}\n')

City_Code : ['C1', 'C2', 'C3', 'C4', 'C9', 'C6', 'C7', 'C8', 'C10', 'C5', 'C15', 'C17', 'C16', 'C11', 'C13', 'C20', 'C19', 'C12', 'C18', 'C14', 'C21', 'C23', 'C24', 'C22', 'C26', 'C29', 'C25', 'C28', 'C33', 'C27', 'C32', 'C34', 'C30', 'C35', 'C36', 'C31']

Health_Indicator : ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9']



In [17]:
# Setting Encoding Criteria
# Each categorical code is a letter prefix followed by a number; the encoding
# keeps the number (e.g. 'C7' -> 7, 'X3' -> 3).
enc = {
    column: {f'{prefix}{number}': number for number in range(1, highest + 1)}
    for column, prefix, highest in (
        ('City_Code', 'C', 36),
        ('Health_Indicator', 'X', 9),
    )
}

In [18]:
# Encoding Criteria Preview: the nested {column: {category: integer}} mapping
enc

{'City_Code': {'C1': 1,
  'C2': 2,
  'C3': 3,
  'C4': 4,
  'C5': 5,
  'C6': 6,
  'C7': 7,
  'C8': 8,
  'C9': 9,
  'C10': 10,
  'C11': 11,
  'C12': 12,
  'C13': 13,
  'C14': 14,
  'C15': 15,
  'C16': 16,
  'C17': 17,
  'C18': 18,
  'C19': 19,
  'C20': 20,
  'C21': 21,
  'C22': 22,
  'C23': 23,
  'C24': 24,
  'C25': 25,
  'C26': 26,
  'C27': 27,
  'C28': 28,
  'C29': 29,
  'C30': 30,
  'C31': 31,
  'C32': 32,
  'C33': 33,
  'C34': 34,
  'C35': 35,
  'C36': 36},
 'Health_Indicator': {'X1': 1,
  'X2': 2,
  'X3': 3,
  'X4': 4,
  'X5': 5,
  'X6': 6,
  'X7': 7,
  'X8': 8,
  'X9': 9}}

In [19]:
# Encoding
# Translate each categorical column to its integer code with an explicit
# per-column lookup. The source strings are read from `df`, so re-running this
# cell gives the same result, and the outcome no longer depends on `replace`
# downcasting an object column to integers.
encoded_cols = {col: df[col].map(mapping) for col, mapping in enc.items()}

# `.map` yields NaN for any category absent from `enc`; stop here rather than
# let an unmapped value flow into the correlation matrix.
unmapped = [col for col, values in encoded_cols.items() if values.isna().any()]
assert not unmapped, f'Categories missing from enc for columns: {unmapped}'

# Build a new frame instead of mutating a column selection of `df` in place.
df_con = df_con.assign(**encoded_cols)

In [20]:
# Updated Dataset Information: dtypes and non-null counts after encoding
df_con.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39191 entries, 0 to 39190
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   City_Code            39191 non-null  int64  
 1   Region_Code          39191 non-null  int64  
 2   Upper_Age            39191 non-null  int64  
 3   Lower_Age            39191 non-null  int64  
 4   Health_Indicator     39191 non-null  int64  
 5   Reco_Policy_Cat      39191 non-null  int64  
 6   Reco_Policy_Premium  39191 non-null  float64
 7   Response             39191 non-null  int64  
dtypes: float64(1), int64(7)
memory usage: 2.4 MB


In [21]:
# First five rows of the updated feature set after encoding
df_con.head()

Unnamed: 0,City_Code,Region_Code,Upper_Age,Lower_Age,Health_Indicator,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,3,3213,36,36,1,22,11628.0,0
1,5,1117,75,22,2,22,30510.0,0
2,24,4378,52,48,1,19,17780.0,0
3,8,2190,44,44,2,16,10404.0,0
4,9,1785,52,52,2,22,15264.0,1


In [22]:
# Pairwise Pearson correlation between all columns, the target Response included
df_con.corr(method='pearson')

Unnamed: 0,City_Code,Region_Code,Upper_Age,Lower_Age,Health_Indicator,Reco_Policy_Cat,Reco_Policy_Premium,Response
City_Code,1.0,0.092299,-0.109715,-0.107533,0.001877,-0.082511,-0.117606,-0.009147
Region_Code,0.092299,1.0,-0.005656,-0.005493,0.021381,-0.065544,-0.010576,0.003144
Upper_Age,-0.109715,-0.005656,1.0,0.921494,0.027064,0.02654,0.793216,0.002719
Lower_Age,-0.107533,-0.005493,0.921494,1.0,0.020844,0.02255,0.616407,-0.000758
Health_Indicator,0.001877,0.021381,0.027064,0.020844,1.0,-0.009032,0.042084,-0.001184
Reco_Policy_Cat,-0.082511,-0.065544,0.02654,0.02255,-0.009032,1.0,0.060825,0.115884
Reco_Policy_Premium,-0.117606,-0.010576,0.793216,0.616407,0.042084,0.060825,1.0,0.006844
Response,-0.009147,0.003144,0.002719,-0.000758,-0.001184,0.115884,0.006844,1.0
