<a href="https://www.kaggle.com/code/aabdollahii/prostate-cancer-risk?scriptVersionId=254159836" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Reading Data 

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Location of the dataset CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/prostate-cancer-risk-and-lifestyle-synthetic-dataset/synthetic_prostate_cancer_risk.csv"

# Load the whole file into a single DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

# Get some basic Information about data

In [2]:
# Summarize the frame: number of rows, per-column non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       1000 non-null   int64  
 1   age                      1000 non-null   int64  
 2   bmi                      1000 non-null   float64
 3   smoker                   1000 non-null   object 
 4   alcohol_consumption      605 non-null    object 
 5   diet_type                1000 non-null   object 
 6   physical_activity_level  1000 non-null   object 
 7   family_history           1000 non-null   object 
 8   mental_stress_level      1000 non-null   object 
 9   sleep_hours              1000 non-null   float64
 10  regular_health_checkup   1000 non-null   object 
 11  prostate_exam_done       1000 non-null   object 
 12  risk_level               1000 non-null   object 
dtypes: float64(2), int64(2), object(9)
memory usage: 101.7+ KB


- No column has null values except `alcohol_consumption` (605 of 1000 entries are non-null), which is good news.
- Many columns have the `object` dtype. We need to encode all of them before modeling.

In [3]:
# Remove pandas' cap on displayed columns so wide frames are never elided.
pd.options.display.max_columns = None

# Preview the first 20 rows of the raw data.
df.head(n=20)

Unnamed: 0,id,age,bmi,smoker,alcohol_consumption,diet_type,physical_activity_level,family_history,mental_stress_level,sleep_hours,regular_health_checkup,prostate_exam_done,risk_level
0,1,39,24.7,No,,Fatty,Moderate,No,High,5.6,No,No,Medium
1,2,43,25.6,Yes,,Mixed,High,No,High,6.9,Yes,No,Low
2,3,72,22.4,No,Moderate,Mixed,Moderate,No,High,7.8,Yes,No,Low
3,4,60,25.6,Yes,,Fatty,Moderate,No,High,5.6,Yes,No,Medium
4,5,51,26.6,Yes,,Mixed,Low,No,Medium,5.9,No,No,Medium
5,6,68,29.4,Yes,Moderate,Mixed,Moderate,Yes,Medium,8.3,No,No,Medium
6,7,39,24.4,Yes,,Mixed,Moderate,No,High,6.0,No,No,Low
7,8,67,25.1,Yes,High,Fatty,High,No,High,6.4,No,No,High
8,9,58,20.6,Yes,Moderate,Healthy,High,No,Medium,8.3,No,No,Medium
9,10,78,26.9,Yes,Moderate,Healthy,Moderate,No,High,8.3,No,No,Medium


# Handle null-values

In [4]:
# `alcohol_consumption` is the only column with missing entries; label those
# rows with an explicit 'unknown' category instead of dropping them.
# NOTE(review): recent pandas versions parse the literal string "None" as NaN
# in read_csv by default. If the raw CSV uses "None" as a real category
# (i.e. no alcohol), these gaps are that category rather than unknown data.
# TODO confirm against the raw file.
alcohol_col = 'alcohol_consumption'
df[alcohol_col] = df[alcohol_col].fillna(value='unknown')


In [5]:
# Sanity check: after the fill, the column should contain no missing entries.
alcohol_missing_mask = df['alcohol_consumption'].isna()
num_nulls = alcohol_missing_mask.sum()
print(f"Number of null values in 'alcohol_consumption': {num_nulls}")


Number of null values in 'alcohol_consumption': 0


In [6]:
# Tally missing values per column and print only the columns that still have
# gaps; an empty Series means the whole frame is complete.
null_counts = df.isna().sum()
print(null_counts.loc[null_counts.gt(0)])


Series([], dtype: int64)


- Here we go. We are done with null values. Now let's encode all the categorical columns we have.

# Encoding Categorical Data

In [7]:
# Preview the first rows again to confirm the 'unknown' fill before encoding.
df.head(20)

Unnamed: 0,id,age,bmi,smoker,alcohol_consumption,diet_type,physical_activity_level,family_history,mental_stress_level,sleep_hours,regular_health_checkup,prostate_exam_done,risk_level
0,1,39,24.7,No,unknown,Fatty,Moderate,No,High,5.6,No,No,Medium
1,2,43,25.6,Yes,unknown,Mixed,High,No,High,6.9,Yes,No,Low
2,3,72,22.4,No,Moderate,Mixed,Moderate,No,High,7.8,Yes,No,Low
3,4,60,25.6,Yes,unknown,Fatty,Moderate,No,High,5.6,Yes,No,Medium
4,5,51,26.6,Yes,unknown,Mixed,Low,No,Medium,5.9,No,No,Medium
5,6,68,29.4,Yes,Moderate,Mixed,Moderate,Yes,Medium,8.3,No,No,Medium
6,7,39,24.4,Yes,unknown,Mixed,Moderate,No,High,6.0,No,No,Low
7,8,67,25.1,Yes,High,Fatty,High,No,High,6.4,No,No,High
8,9,58,20.6,Yes,Moderate,Healthy,High,No,Medium,8.3,No,No,Medium
9,10,78,26.9,Yes,Moderate,Healthy,Moderate,No,High,8.3,No,No,Medium


In [8]:
def encode_with_map(series, mapping):
    """Encode a categorical Series with an explicit label -> integer mapping.

    `Series.map` silently converts any value that is missing from `mapping`
    into NaN. That hides typos and unexpected categories, and it wipes the
    column if this cell is re-run on already-encoded data. This helper makes
    that failure loud instead.

    Parameters
    ----------
    series : pandas.Series
        Column holding the raw category labels.
    mapping : dict
        Maps every expected label to its integer code.

    Returns
    -------
    pandas.Series
        The encoded column, exactly as `series.map(mapping)` produces it.

    Raises
    ------
    ValueError
        If at least one value in `series` (including NaN) has no entry in
        `mapping`.
    """
    encoded = series.map(mapping)
    unmapped = series[encoded.isna()].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Column {series.name!r} contains values with no encoding: {unmapped}. "
            f"Expected only {list(mapping)}. If the column is already encoded, "
            "re-run the notebook from the data-loading cell."
        )
    return encoded


# --- Binary encoding (Yes/No) ---
binary_map = {'Yes': 1, 'No': 0}

# --- Ordinal encoding (ordered categories) ---
# 'unknown' is the placeholder written by the missing-value step; -1 keeps it
# outside the Low < Moderate < High scale.
alcohol_map = {'Low': 0, 'Moderate': 1, 'High': 2, 'unknown': -1}
diet_map = {'Healthy': 0, 'Mixed': 1, 'Fatty': 2}
activity_map = {'Low': 0, 'Moderate': 1, 'High': 2}
stress_map = {'Low': 0, 'Medium': 1, 'High': 2}
risk_map = {'Low': 0, 'Medium': 1, 'High': 2}

column_maps = {
    'smoker': binary_map,
    'family_history': binary_map,
    'regular_health_checkup': binary_map,
    'prostate_exam_done': binary_map,
    'alcohol_consumption': alcohol_map,
    'diet_type': diet_map,
    'physical_activity_level': activity_map,
    'mental_stress_level': stress_map,
    'risk_level': risk_map,
}

# Validate and encode every column BEFORE touching `df`, so a bad value in one
# column cannot leave the frame half-encoded.
encoded_columns = {
    _col: encode_with_map(df[_col], _mapping)
    for _col, _mapping in column_maps.items()
}
for _col, _encoded in encoded_columns.items():
    df[_col] = _encoded



- I used the `map` function because all of the categorical features have a natural order (the `unknown` alcohol category is encoded as -1, outside that order).

In [9]:
# Inspect the first 10 rows to check the integer codes written by the encoding step.
df.head(10)

Unnamed: 0,id,age,bmi,smoker,alcohol_consumption,diet_type,physical_activity_level,family_history,mental_stress_level,sleep_hours,regular_health_checkup,prostate_exam_done,risk_level
0,1,39,24.7,0,-1,2,1,0,2,5.6,0,0,1
1,2,43,25.6,1,-1,1,2,0,2,6.9,1,0,0
2,3,72,22.4,0,1,1,1,0,2,7.8,1,0,0
3,4,60,25.6,1,-1,2,1,0,2,5.6,1,0,1
4,5,51,26.6,1,-1,1,0,0,1,5.9,0,0,1
5,6,68,29.4,1,1,1,1,1,1,8.3,0,0,1
6,7,39,24.4,1,-1,1,1,0,2,6.0,0,0,0
7,8,67,25.1,1,2,2,2,0,2,6.4,0,0,2
8,9,58,20.6,1,1,0,2,0,1,8.3,0,0,1
9,10,78,26.9,1,1,0,1,0,2,8.3,0,0,1
