## Before running this notebook make sure to follow these steps
- Install `git` and make sure its executable is on your system `PATH` (add it through your OS's `Environment Variables` settings if necessary).
- Clone [this](https://github.com/jundongl/scikit-feature) GitHub repository by typing _`git clone https://github.com/jundongl/scikit-feature`_ in your command shell.
- Go inside the cloned folder by typing _`cd ./scikit-feature`_.
- Then install `Scikit-Feature` from the cloned repository by simply typing _`python setup.py install`_.
- Make sure `NumPy`, `SciPy`, `Scikit-Learn` and `pandas` are already installed in your Python environment.

In [1]:
# Importing Libraries
import pandas as pd
from skfeature.function.similarity_based import fisher_score

In [2]:
# Ignoring Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Reading the raw dataset from the CSV file next to this notebook
df = pd.read_csv(filepath_or_buffer='data.csv')

In [4]:
# Dimensions of the raw dataset as a (rows, columns) tuple
df.shape

(50882, 14)

In [5]:
# Counting the missing values in each column of the raw dataset
df.isnull().sum(axis=0)

ID                             0
City_Code                      0
Region_Code                    0
Accomodation_Type              0
Reco_Insurance_Type            0
Upper_Age                      0
Lower_Age                      0
Is_Spouse                      0
Health_Indicator           11691
Holding_Policy_Duration    20251
Holding_Policy_Type        20251
Reco_Policy_Cat                0
Reco_Policy_Premium            0
Response                       0
dtype: int64

In [6]:
# Removing the two holding-policy columns, each missing in roughly 40% of the rows
df = df.drop(columns=['Holding_Policy_Duration', 'Holding_Policy_Type'])

In [7]:
# Counting the missing values per column once the sparse columns are gone
df.isnull().sum(axis=0)

ID                         0
City_Code                  0
Region_Code                0
Accomodation_Type          0
Reco_Insurance_Type        0
Upper_Age                  0
Lower_Age                  0
Is_Spouse                  0
Health_Indicator       11691
Reco_Policy_Cat            0
Reco_Policy_Premium        0
Response                   0
dtype: int64

In [8]:
# Dropping every row that still contains at least one Null value.
# The result is rebound instead of using inplace=True, so the cell does not
# mutate an existing frame behind the reader's back.
df = df.dropna(axis=0, how='any')

In [9]:
# Confirming that no column has missing values left after the row drop
df.isnull().sum(axis=0)

ID                     0
City_Code              0
Region_Code            0
Accomodation_Type      0
Reco_Insurance_Type    0
Upper_Age              0
Lower_Age              0
Is_Spouse              0
Health_Indicator       0
Reco_Policy_Cat        0
Reco_Policy_Premium    0
Response               0
dtype: int64

In [10]:
# Dimensions (rows, columns) of the dataset once every row with a Null has been removed
df.shape

(39191, 12)

In [11]:
# Previewing the first rows; the index labels still carry gaps where rows were dropped
df.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health_Indicator,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,1,C3,3213,Rented,Individual,36,36,No,X1,22,11628.0,0
1,2,C5,1117,Owned,Joint,75,22,No,X2,22,30510.0,0
3,4,C24,4378,Owned,Joint,52,48,No,X1,19,17780.0,0
4,5,C8,2190,Rented,Individual,44,44,No,X2,16,10404.0,0
5,6,C9,1785,Rented,Individual,52,52,No,X2,22,15264.0,1


In [12]:
# Resetting Index to a gap-free 0..n-1 range; drop=True discards the old labels
# instead of keeping them as a new column. Rebinding (rather than inplace=True)
# avoids mutating the existing frame in place.
df = df.reset_index(drop=True)

In [13]:
# Previewing the first rows again to confirm the index is now consecutive
df.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health_Indicator,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,1,C3,3213,Rented,Individual,36,36,No,X1,22,11628.0,0
1,2,C5,1117,Owned,Joint,75,22,No,X2,22,30510.0,0
2,4,C24,4378,Owned,Joint,52,48,No,X1,19,17780.0,0
3,5,C8,2190,Rented,Individual,44,44,No,X2,16,10404.0,0
4,6,C9,1785,Rented,Individual,52,52,No,X2,22,15264.0,1


In [14]:
# Dataset Information: per-column dtype and non-null count, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39191 entries, 0 to 39190
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   39191 non-null  int64  
 1   City_Code            39191 non-null  object 
 2   Region_Code          39191 non-null  int64  
 3   Accomodation_Type    39191 non-null  object 
 4   Reco_Insurance_Type  39191 non-null  object 
 5   Upper_Age            39191 non-null  int64  
 6   Lower_Age            39191 non-null  int64  
 7   Is_Spouse            39191 non-null  object 
 8   Health_Indicator     39191 non-null  object 
 9   Reco_Policy_Cat      39191 non-null  int64  
 10  Reco_Policy_Premium  39191 non-null  float64
 11  Response             39191 non-null  int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 3.6+ MB


## Categorical Feature Selection

### Features to Target
- City_Code, Accomodation_Type, 
- Reco_Insurance_Type, Is_Spouse, 
- Health_Indicator, Reco_Policy_Cat

In [15]:
# Generating Dataset with all Categorical Features (plus the Response target)
cat_col_set = ['City_Code', 'Accomodation_Type', 'Reco_Insurance_Type', 'Is_Spouse', 'Health_Indicator', 'Reco_Policy_Cat', 'Response']
# An explicit copy makes df_cat independent of df, so modifying df_cat later
# neither touches df nor triggers a chained-assignment warning.
df_cat = df[cat_col_set].copy()
df_cat.head()

Unnamed: 0,City_Code,Accomodation_Type,Reco_Insurance_Type,Is_Spouse,Health_Indicator,Reco_Policy_Cat,Response
0,C3,Rented,Individual,No,X1,22,0
1,C5,Owned,Joint,No,X2,22,0
2,C24,Owned,Joint,No,X1,19,0
3,C8,Rented,Individual,No,X2,16,0
4,C9,Rented,Individual,No,X2,22,1


In [16]:
# Listing the distinct categories of each column, most frequent first
# (value_counts orders its index by descending frequency)
cols = df_cat.columns
for col in cols:
    # Reading from df_cat, the frame whose columns are being iterated
    category_set = list(df_cat[col].value_counts().index)
    print(f'{col} : {category_set}\n')

City_Code : ['C1', 'C2', 'C3', 'C4', 'C9', 'C6', 'C7', 'C8', 'C10', 'C5', 'C15', 'C17', 'C16', 'C11', 'C13', 'C20', 'C19', 'C12', 'C18', 'C14', 'C21', 'C23', 'C24', 'C22', 'C26', 'C29', 'C25', 'C28', 'C33', 'C27', 'C32', 'C34', 'C30', 'C35', 'C36', 'C31']

Accomodation_Type : ['Owned', 'Rented']

Reco_Insurance_Type : ['Individual', 'Joint']

Is_Spouse : ['No', 'Yes']

Health_Indicator : ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9']

Reco_Policy_Cat : [22, 18, 21, 16, 17, 19, 20, 12, 1, 2, 14, 13, 15, 3, 4, 5, 6, 7, 9, 11, 8, 10]

Response : [0, 1]



### Applying Label Encoding for Fisher's Score Test

In [17]:
# Building the per-column lookup tables that turn category labels into integer codes
enc = {}
# 'C1'..'C36' -> 1..36
enc['City_Code'] = dict((f'C{n}', n) for n in range(1, 37))
# Two-valued columns -> 0 / 1
enc['Accomodation_Type'] = {'Owned': 0, 'Rented': 1}
enc['Reco_Insurance_Type'] = {'Individual': 0, 'Joint': 1}
enc['Is_Spouse'] = {'No': 0, 'Yes': 1}
# 'X1'..'X9' -> 1..9
enc['Health_Indicator'] = dict((f'X{n}', n) for n in range(1, 10))

In [18]:
# Encoding Criteria Preview: displaying the full mapping to check the generated codes
enc

{'City_Code': {'C1': 1,
  'C2': 2,
  'C3': 3,
  'C4': 4,
  'C5': 5,
  'C6': 6,
  'C7': 7,
  'C8': 8,
  'C9': 9,
  'C10': 10,
  'C11': 11,
  'C12': 12,
  'C13': 13,
  'C14': 14,
  'C15': 15,
  'C16': 16,
  'C17': 17,
  'C18': 18,
  'C19': 19,
  'C20': 20,
  'C21': 21,
  'C22': 22,
  'C23': 23,
  'C24': 24,
  'C25': 25,
  'C26': 26,
  'C27': 27,
  'C28': 28,
  'C29': 29,
  'C30': 30,
  'C31': 31,
  'C32': 32,
  'C33': 33,
  'C34': 34,
  'C35': 35,
  'C36': 36},
 'Accomodation_Type': {'Owned': 0, 'Rented': 1},
 'Reco_Insurance_Type': {'Individual': 0, 'Joint': 1},
 'Is_Spouse': {'No': 0, 'Yes': 1},
 'Health_Indicator': {'X1': 1,
  'X2': 2,
  'X3': 3,
  'X4': 4,
  'X5': 5,
  'X6': 6,
  'X7': 7,
  'X8': 8,
  'X9': 9}}

In [19]:
# Encoding
# The result is rebound instead of using inplace=True, so the cell never
# modifies a frame that was selected out of df and stays safe to re-run
# (already-encoded integers match none of the string keys in enc).
# infer_objects() then converts columns left as object dtype after the
# replacement to a proper numeric dtype where possible.
df_cat = df_cat.replace(enc).infer_objects()

In [20]:
# Updated Dataset Information: checking the column dtypes after the encoding step
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39191 entries, 0 to 39190
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   City_Code            39191 non-null  int64
 1   Accomodation_Type    39191 non-null  int64
 2   Reco_Insurance_Type  39191 non-null  int64
 3   Is_Spouse            39191 non-null  int64
 4   Health_Indicator     39191 non-null  int64
 5   Reco_Policy_Cat      39191 non-null  int64
 6   Response             39191 non-null  int64
dtypes: int64(7)
memory usage: 2.1 MB


In [21]:
# Updated Categorical Feature Set: first rows after the labels were replaced by codes
df_cat.head()

Unnamed: 0,City_Code,Accomodation_Type,Reco_Insurance_Type,Is_Spouse,Health_Indicator,Reco_Policy_Cat,Response
0,3,1,0,0,1,22,0
1,5,0,1,0,2,22,0
2,24,0,1,0,1,19,0
3,8,1,0,0,2,16,0
4,9,1,0,0,2,22,1


In [22]:
# Feature Set and Class Set
# Selecting by column label instead of by hard-coded position keeps the split
# correct even if columns are added to or reordered in df_cat.
X = df_cat.drop(columns=['Response'])
# Double brackets keep y a one-column DataFrame (not a Series)
y = df_cat[['Response']]

#### The step below is computationally very heavy because of the SciPy sparse matrix it generates; one of two things is likely to happen
- The cell throws a `MemoryError: Unable to allocate <size> GiB for an array with shape (<dimensions>) and data type int64`
- Your system may freeze, because your OS has allowed the required amount of memory to be allocated and SciPy has started its heavy computation.

In either case, make sure that you are not using your main system: run this step on a cloud instance or on a secondary machine. It may take a long time to complete, and until it finishes you may not even be able to move your mouse.

In [None]:
# Performing Fisher's Score Test
# reshape(-1) flattens the one-column target into a 1-D array without
# hard-coding the number of rows, so the cell keeps working if the data changes.
f_p_values = fisher_score.fisher_score(X.values, y.values.reshape(-1))

In [None]:
# Columns with their associated value from Fisher's Score Test
# NOTE(review): scikit-feature's fisher_score is understood to return a single
# array holding one value per feature, not a (statistic, p-value) pair. The
# whole result is therefore used; `f_p_values[1]` would be only the second
# feature's value and could not take all of X.columns as its index.
# Confirm against the installed scikit-feature version.
# The name `p_values` is kept because the cells that follow refer to it.
p_values = pd.Series(f_p_values, index=X.columns)
p_values

In [None]:
# Sorting Columns based on their associated Fisher Score and converting to a dict
# NOTE(review): this assumes the values are Fisher scores, where a larger value
# marks a more discriminative feature, so the best features come first with a
# descending sort (ascending order would only be right for p-values).
p_values = dict(p_values.sort_values(ascending=False))

In [None]:
# Rendering each value as a fixed-point string with 12 decimals instead of scientific notation
p_values = {name: '{:.12f}'.format(value) for name, value in p_values.items()}
p_values