### 1. Feature Selection Using Chi-Square Test

##### Show head of dataframe

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('datasets/breast-cancer.csv')

In [3]:
df.head()

Unnamed: 0,Grouping,Age (years),Education,Working status,Marital status,Menarche (years),Menopause,First pregnancy,Parity,Breastfeeding,Highfat,BMI,Ethnicity
0,BC,>= 50,Senior high school,Housewife,Marriage,12 to 13,< 50 years,20-29 years,>= Multiparous,>=12 months,Normal,Obesity,Minangnese
1,Non-BC,>= 50,Vocational degree,Master's student,Single/ widow,>13,< 50 years,>30 years,>= Multiparous,<12 months,High,Normal,Minangnese
2,BC,>= 50,Senior high school,Housewife,Marriage,7 to 11,< 50 years,20-29 years,>= Multiparous,>=12 months,Normal,Normal,Javanese
3,Non-BC,>= 50,Senior high school,Master's student,Marriage,12 to 13,< 50 years,20-29 years,Primiparous,>=12 months,High,Normal,Javanese
4,BC,>= 50,Senior high school,Private servant,Marriage,>13,< 50 years,20-29 years,>= Multiparous,>=12 months,High,Normal,Minangnese


##### Check the type of columns in dataframe

In [4]:
df.info

<bound method DataFrame.info of     Grouping Age (years)             Education    Working status  \
0         BC       >= 50    Senior high school         Housewife   
1     Non-BC       >= 50     Vocational degree  Master's student   
2         BC       >= 50    Senior high school         Housewife   
3     Non-BC       >= 50    Senior high school  Master's student   
4         BC       >= 50    Senior high school   Private servant   
..       ...         ...                   ...               ...   
395   Non-BC       >= 50     Vocational degree           Retired   
396       BC        < 50    Senior high school     Civil servant   
397   Non-BC        < 50    Senior high school         Housewife   
398       BC        < 50  Undergraduate degree           Retired   
399   Non-BC       >= 50    Senior high school         Housewife   

    Marital status Menarche (years)    Menopause First pregnancy  \
0         Marriage        12  to 13   < 50 years     20-29 years   
1    Single/ wi

##### Check wheter there's nan value within dataframe

In [5]:
df.isna().sum()

Grouping            0
Age (years)         0
Education           0
Working status      0
Marital status      0
Menarche (years)    0
Menopause           0
First pregnancy     2
Parity              0
Breastfeeding       0
Highfat             0
BMI                 0
Ethnicity           0
dtype: int64

##### Change none value to NaN value

In [6]:
df['First pregnancy'].fillna(value='NaN')


0      20-29 years
1        >30 years
2      20-29 years
3      20-29 years
4      20-29 years
          ...     
395    20-29 years
396    20-29 years
397    20-29 years
398    20-29 years
399            NaN
Name: First pregnancy, Length: 400, dtype: object

##### Drop the rows with nan values

In [7]:
df = df.dropna()

##### Last check nan value

In [None]:
df.isna().sum()

##### Check amount of unique values in column

In [8]:
df.nunique()

Grouping            2
Age (years)         2
Education           7
Working status      7
Marital status      2
Menarche (years)    3
Menopause           2
First pregnancy     3
Parity              3
Breastfeeding       2
Highfat             2
BMI                 3
Ethnicity           2
dtype: int64

##### Encoding Values in columns

In [9]:
from sklearn.preprocessing import OrdinalEncoder

In [10]:
enc = OrdinalEncoder()

In [11]:
df.columns

Index(['Grouping', 'Age (years)', 'Education', 'Working status',
       'Marital status', 'Menarche (years)', 'Menopause', 'First pregnancy',
       'Parity', 'Breastfeeding', 'Highfat', 'BMI', 'Ethnicity'],
      dtype='object')

In [12]:
enc.fit(df[['Grouping', 'Age (years)', 'Education', 'Working status',
       'Marital status', 'Menarche (years)', 'Menopause', 'First pregnancy',
       'Parity', 'Breastfeeding', 'Highfat', 'BMI', 'Ethnicity']])
       

In [13]:
df[['Grouping', 'Age (years)', 'Education', 'Working status',
       'Marital status', 'Menarche (years)', 'Menopause', 'First pregnancy',
       'Parity', 'Breastfeeding', 'Highfat', 'BMI', 'Ethnicity']] = enc.transform(df[['Grouping', 'Age (years)', 'Education', 'Working status',
       'Marital status', 'Menarche (years)', 'Menopause', 'First pregnancy',
       'Parity', 'Breastfeeding', 'Highfat', 'BMI', 'Ethnicity']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Grouping', 'Age (years)', 'Education', 'Working status',


In [14]:
df.head()

Unnamed: 0,Grouping,Age (years),Education,Working status,Marital status,Menarche (years),Menopause,First pregnancy,Parity,Breastfeeding,Highfat,BMI,Ethnicity
0,0.0,1.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
1,1.0,1.0,6.0,4.0,1.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,4.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
4,0.0,1.0,4.0,5.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [15]:
X = df.iloc[:, 1:13]
y = df.iloc[:, 0]

In [16]:
X.head()

Unnamed: 0,Age (years),Education,Working status,Marital status,Menarche (years),Menopause,First pregnancy,Parity,Breastfeeding,Highfat,BMI,Ethnicity
0,1.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
1,1.0,6.0,4.0,1.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0
2,1.0,4.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.0,4.0,4.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
4,1.0,4.0,5.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


##### SELECTION

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [19]:
X_train.head()

Unnamed: 0,Age (years),Education,Working status,Marital status,Menarche (years),Menopause,First pregnancy,Parity,Breastfeeding,Highfat,BMI,Ethnicity
47,0.0,4.0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
279,1.0,5.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
217,1.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
398,0.0,5.0,6.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
121,1.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0,2.0,1.0


In [20]:

from sklearn.feature_selection import chi2
f_score = chi2(X_train, y_train)

In [21]:
p_values = pd.Series(f_score[1])
p_values.index = X_train.columns

In [22]:
p_values

Age (years)         3.335503e-01
Education           4.353272e-01
Working status      1.871861e-10
Marital status      2.568393e-01
Menarche (years)    2.697446e-01
Menopause           2.297740e-02
First pregnancy     8.897301e-02
Parity              1.158515e-01
Breastfeeding       6.391934e-02
Highfat             5.015896e-16
BMI                 7.571012e-04
Ethnicity           8.032552e-01
dtype: float64

In [23]:
p_values.sort_values(ascending=False)

Ethnicity           8.032552e-01
Education           4.353272e-01
Age (years)         3.335503e-01
Menarche (years)    2.697446e-01
Marital status      2.568393e-01
Parity              1.158515e-01
First pregnancy     8.897301e-02
Breastfeeding       6.391934e-02
Menopause           2.297740e-02
BMI                 7.571012e-04
Working status      1.871861e-10
Highfat             5.015896e-16
dtype: float64