#### Dataset source: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

#### Problem Statement: Given the heart disease dataset, our objective is to develop a machine learning model that can predict the presence of heart disease in patients based on various medical and demographic features. This could help in early diagnosis and inform medical professionals to take preventative or remedial action.

### data preparation and cleaning

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import stats

# Load the dataset and drop the 'id' and 'dataset' columns, which are not
# used as predictors in this notebook.
heart_disease_data = pd.read_csv('heart_disease_uci.csv').drop(['id', 'dataset'], axis=1)


# 1. Missing-value handling
# Numeric features: fill gaps with the column median.
numeric_features = ['trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
numeric_imputer = SimpleImputer(strategy='median')
heart_disease_data[numeric_features] = numeric_imputer.fit_transform(heart_disease_data[numeric_features])

# Categorical features: fill gaps with the most frequent category.
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
categorical_imputer = SimpleImputer(strategy='most_frequent')
heart_disease_data[categorical_features] = categorical_imputer.fit_transform(heart_disease_data[categorical_features])

# 2. Outlier detection and handling (1.5 * IQR fences)
for feature in numeric_features:
    Q1 = heart_disease_data[feature].quantile(0.25)
    Q3 = heart_disease_data[feature].quantile(0.75)
    IQR = Q3 - Q1

    # A zero IQR means Q1 == Q3, so both fences would equal Q1 and clipping
    # would overwrite every value in the column with that single constant
    # (this can happen for a heavily imputed column). Leave such columns as-is.
    if IQR == 0:
        continue

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Replace outliers with the nearest fence value.
    heart_disease_data[feature] = heart_disease_data[feature].clip(lower=lower_bound, upper=upper_bound)

# 3. Feature encoding
# One-hot encode the non-ordinal categorical features.
heart_disease_data_encoded = pd.get_dummies(heart_disease_data, columns=categorical_features)

# 4. Standardisation
# Numeric columns to scale: the imputed numeric features plus 'age'.
numeric_features_updated = ['trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'age']
scaler = StandardScaler()
# The scaler is fitted on the un-encoded frame; the scaled values are written
# into the matching columns of the encoded frame.
heart_disease_data_encoded[numeric_features_updated] = scaler.fit_transform(heart_disease_data[numeric_features_updated])

# Inspect the processed data.
heart_disease_data_encoded.info()
heart_disease_data_encoded.head()

FileNotFoundError: [Errno 2] No such file or directory: 'heart_disease_uci.csv'

Exploratory data analysis/visualization to gather relevant insights

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Exploratory plots for `heart_disease_data`, which must already be loaded by
# an earlier cell. Confirm that the path 'data/heart_disease_uci.csv' is
# correct and that the target column is named 'num'.


# Plot style
sns.set(style="whitegrid")

# Part 1: univariate analysis - distribution of each numeric feature with
# its mean and median marked.
numeric_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
plt.figure(figsize=(18, 12))
for i, feature in enumerate(numeric_features):
    plt.subplot(3, 2, i+1)
    sns.histplot(heart_disease_data[feature], kde=True, color='skyblue')
    plt.title(f'Distribution of {feature}', fontsize=14)
    plt.axvline(heart_disease_data[feature].mean(), color='red', linestyle='--', label='Mean')
    plt.axvline(heart_disease_data[feature].median(), color='green', linestyle='-', label='Median')
    plt.legend()
plt.tight_layout()

# Distribution of each categorical feature.
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(18, 24))
axes = axes.flatten()  # flatten the grid so it can be indexed by position
for i, feature in enumerate(categorical_features):
    sns.countplot(x=feature, data=heart_disease_data, ax=axes[i], palette="Set2")
    axes[i].set_title(f'Distribution of {feature}', fontsize=14)
# Remove the grid slots that were not used. Slicing by the feature count
# avoids depending on the loop variable leaking out of the loop above.
for ax in axes[len(categorical_features):]:
    ax.remove()
plt.tight_layout()

# Part 2: multivariate analysis - scatter-plot matrix of the numeric
# features, coloured by the target variable.
sns.pairplot(heart_disease_data[numeric_features + ['num']], hue='num')
plt.show()

# Categorical features vs. heart disease: 100% stacked bar charts.
for feature in categorical_features:
    # normalize='index' turns each row (one category level) into proportions.
    proportions = pd.crosstab(heart_disease_data[feature], heart_disease_data['num'], normalize='index')
    ax = proportions.plot(kind='bar', stacked=True, colormap='Set2')
    ax.set_title(f'{feature} vs Heart Disease')
    # Each feature gets its own figure, so the layout is fixed per figure
    # rather than once after the loop (which only affects the last one).
    ax.figure.tight_layout()
plt.show()

# Correlation heat map between the numeric features and the target.
plt.figure(figsize=(12, 8))
corr_matrix = heart_disease_data[numeric_features + ['num']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', fmt=".2f", center=0)
plt.title('Correlation Matrix with Heart Disease', fontsize=16)
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Expects 'heart_disease_data' to already hold the prepared data.
# One-hot encode the categorical columns, dropping the first level of each.
heart_disease_data_encoded = pd.get_dummies(heart_disease_data, drop_first=True)

# The target is 'num'; every other encoded column is a feature.
y = heart_disease_data_encoded['num']
X = heart_disease_data_encoded.drop(columns='num')

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Build a 100-tree random forest and fit it on the training split.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Pair each importance score with its feature name, most important first.
feature_importances = pd.DataFrame(
    {'importance': rf.feature_importances_},
    index=X_train.columns,
).sort_values('importance', ascending=False)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(10, 8))
sns.barplot(
    x=feature_importances['importance'],
    y=feature_importances.index,
)
plt.title('Feature Importances')
plt.show()


The use of machine learning techniques to solve a specific problem

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Expects the data to be loaded into the heart_disease_data DataFrame already.
# If it is not, load it first, e.g.:
#   heart_disease_data = pd.read_csv('/path/to/your/data.csv')

# Convert the target into a binary variable: 1 when 'num' > 0, else 0.
# The new column is stored on the shared frame for use by later cells.
heart_disease_data['heart_disease'] = (heart_disease_data['num'] > 0).astype(int)

# Features exclude both the binary target and the column it was derived from.
X = heart_disease_data.drop(['heart_disease', 'num'], axis=1)
y = heart_disease_data['heart_disease']

# Split the feature columns by dtype.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Per-type preprocessing transformers.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine them. sparse_threshold=0 makes the ColumnTransformer always return
# a dense array: OneHotEncoder emits sparse output, and depending on the
# overall density the stacked result could otherwise be sparse, which PCA in
# older scikit-learn releases rejects.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Preprocess, then project onto the first two principal components.
pipeline = make_pipeline(preprocessor, PCA(n_components=2))
X_pca = pipeline.fit_transform(X)

# Scatter plot of the two components, coloured by the binary target.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='viridis')
plt.title('PCA of Heart Disease Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Heart Disease', loc='best')
plt.show()


#### trestbps (resting blood pressure): The distribution is slightly right-skewed, suggesting that a subset of the population has high blood pressure, which is a known risk factor for heart disease.

#### Actionable Insight: Raise awareness about the importance of monitoring blood pressure and the potential need for lifestyle changes or medication among those with elevated readings.

#### chol (cholesterol levels): The distribution shows that high cholesterol levels are quite common in the dataset.

#### Actionable Insight: Screen for high cholesterol, especially in individuals with other risk factors, and promote dietary changes that can help manage cholesterol levels.

In [None]:
!pip install xgboost

#### fbs (fasting blood sugar): There are significantly more individuals with fasting blood sugar below 120 mg/dl, which is a healthy range.
#### Actionable Insight: Continue to educate the public on the importance of diet and exercise in managing blood sugar levels.

#### exang (exercise-induced angina): A smaller proportion of individuals experience exercise-induced chest pain.
#### Actionable Insight: Encourage those with exercise-induced angina to seek further evaluation, as this can be an indicator of underlying heart disease.

In [None]:
# Environment diagnostics: report the interpreter path and version first, so
# they are printed even if the xgboost import below fails.
import sys
print(sys.executable, sys.version, sep="\n")

# Then confirm that xgboost is importable and report its version.
import xgboost as xgb
print(xgb.__version__)



#### Positive correlations with num (diagnosis of heart disease) include cp_asymptomatic, oldpeak (ST depression induced by exercise), and exang. These variables show a moderate relationship with the presence of heart disease.
#### Actionable Insight: For patients who report asymptomatic chest pain or those who exhibit ST depression and exercise-induced angina, a more thorough cardiac evaluation may be warranted.

#### Negative correlations with num are seen with thalch (maximum heart rate achieved) and cp_atypical angina. Higher heart rates achieved without chest pain during exercise seem to be associated with a lower incidence of heart disease.
#### Actionable Insight: Encourage regular physical activity, as it appears to be protective against heart disease. Individuals able to achieve higher heart rates without pain may be at lower risk and have better cardiovascular health.

#### Surprisingly, chol shows a negative correlation with heart disease in this analysis, which is contrary to common medical understanding. It could be due to confounding factors or the particular characteristics of the dataset.
#### Actionable Insight: Investigate individual cases where cholesterol levels are high but the risk of heart disease is low to understand other protective factors that may be present.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# StandardScaler is used below but is not among this cell's own imports.
from sklearn.preprocessing import StandardScaler

# Expects heart_disease_data to be loaded and to contain the binary target
# column 'heart_disease'.

# Separate features and target. 'heart_disease' is derived from 'num'
# (num > 0), so 'num' must be removed from the features as well; leaving it
# in leaks the label into the model. errors='ignore' keeps the cell working
# if 'num' has already been dropped from the frame.
X = heart_disease_data.drop(columns=['heart_disease', 'num'], errors='ignore')
y = heart_disease_data['heart_disease']

# Split the feature columns by dtype. Boolean columns are treated as
# categorical; columns matched by neither selector would be dropped by the
# ColumnTransformer's default remainder handling.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Preprocessing: scale numeric columns, one-hot encode categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Training pipeline: preprocessing followed by a seeded random forest so
# that repeated runs give the same result.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter search space ('classifier__' targets the pipeline step).
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

# 5-fold cross-validated grid search on the training split.
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Evaluate the refitted best estimator on the held-out test set.
y_pred = grid_search.predict(X_test)
print("Test Set Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


#### Analysis for Cholesterol

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
import shap
import matplotlib.pyplot as plt



# Remove the 'id' column and the original target 'num' from the shared frame
# if they are still present (errors='ignore' makes this a no-op otherwise).
heart_disease_data.drop(columns=['id', 'num'], errors='ignore', inplace=True)

# Split features and target, one-hot encoding the categorical columns.
# dtype=float keeps the dummy columns numeric: pandas >= 2.0 would otherwise
# create bool dummies, giving XGBoost and SHAP a mixed bool/float matrix.
X = pd.get_dummies(heart_disease_data.drop('heart_disease', axis=1), dtype=float)
y = heart_disease_data['heart_disease']

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model.
# NOTE(review): use_label_encoder is deprecated in recent xgboost releases;
# kept as-is for compatibility with the version this notebook was written for.
model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Explain the model's test-set predictions with SHAP.
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

# Waterfall plot of the SHAP values for the first test sample.
shap.initjs()  # initialise the JavaScript environment
shap.plots.waterfall(shap_values[0], max_display=14)

# To inspect more samples, plot shap_values[i] for other row positions i of
# X_test with the same shap.plots.waterfall call.



In [None]:

# Uses X and y from the previous cell; hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model.
model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Model-reported feature importances, most important first.
feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y=feature_importances.index, data=feature_importances)
plt.title('Feature Importance')
plt.show()

# SHAP values for the training split.
explainer = shap.Explainer(model)
shap_values = explainer(X_train)

# SHAP summary plot.
shap.summary_plot(shap_values, X_train)

# Mean absolute SHAP value per feature. Building the frame from a dict keeps
# 'shap_importance' a numeric column; a transposed list of lists would make
# both columns object dtype.
shap_sum = np.abs(shap_values.values).mean(axis=0)
importance_df = pd.DataFrame({
    'feature': X_train.columns.tolist(),
    'shap_importance': shap_sum,
})
importance_df = importance_df.sort_values('shap_importance', ascending=False)
plt.figure(figsize=(12, 8))
sns.barplot(x='shap_importance', y='feature', data=importance_df)
plt.title('Average SHAP Importance')
plt.show()


### General Actionable Insights:

#### Set up community health screenings focusing on blood pressure, cholesterol, and glucose levels, targeting individuals who fall into higher-risk categories based on age and reported symptoms.
#### Develop personalized intervention programs that include diet, exercise, and medication where necessary, particularly for those with higher resting blood pressure, cholesterol, and asymptomatic chest pain.
#### Initiate educational programs on recognizing signs and symptoms of heart disease, emphasizing the importance of regular health checks.

### Actionable Insights:
#### Regular Checkups: Individuals aged 50 and above should undergo regular cholesterol checkups, as cholesterol levels can vary significantly within this age range and are crucial for heart disease prevention.
#### Healthy Lifestyle: Promote a healthy lifestyle across all age groups, but especially target educational campaigns for middle-aged individuals, where an increase in cholesterol levels is more pronounced.
#### Targeted Interventions: For individuals in the age group where cholesterol levels start rising (based on the boxplot), healthcare providers should consider targeted interventions, such as dietary modifications or physical activity enhancements.
#### These insights are derived from the data and should be validated with clinical expertise for real-world applications.

### Data Preparation for Modeling

### Model Building and Evaluation

## Classification Report Insights:
### Class 0 (No Heart Disease) Prediction is Relatively Good:
#### The precision, recall, and F1-score for class 0 are reasonably high. This indicates that the model is good at identifying patients without heart disease.

### Class 1 (Slight Heart Disease) Prediction is Moderate:
#### Moderate scores across precision, recall, and F1 suggest the model has room for improvement in identifying patients with slight heart disease.

### Classes 2, 3, and 4 (Moderate to Severe Heart Disease) Prediction is Poor:
#### Low precision and recall values for classes 2, 3, and 4 suggest that the model struggles to correctly identify patients with moderate to severe heart disease.

### Imbalance in Dataset:
#### The 'support' column, which indicates the number of true instances for each class, shows that the dataset may be imbalanced, with fewer instances of more severe heart disease (classes 3 and 4).

## Confusion Matrix Insights:
### High True Negatives for Class 0:
#### A significant number of true negatives for class 0 (no heart disease) indicate that the model can distinguish non-cases well.

### Misclassifications Among Classes:
#### There are misclassifications, particularly between classes 0 and 1, and between classes 2, 3, and 4, indicating potential confusion in distinguishing different levels of heart disease severity.

### Limited Data for Severe Cases:
#### There are very few instances for class 4, which might lead to poor model training for this class due to insufficient data.

## Actionable Insights:
### Model Improvement:
#### Investigate feature engineering or more complex models to improve classification of classes 1 to 4, as the current model performance decreases with increasing severity of heart disease.

### Data Collection:
#### Additional data for underrepresented classes (especially severe heart disease cases) should be collected to improve the model's learning capability for these classes.

### Cost-sensitive Learning:
#### Given that misclassifications of actual heart disease cases as 'no disease' can be dangerous, consider employing cost-sensitive learning techniques that penalize these types of errors more heavily.

### Clinical Corroboration:
#### Work closely with clinicians to validate the model's predictions and incorporate clinical expertise into the decision-making process.

### Further Diagnostic Testing:
#### For patients who are borderline between classes, consider recommending further diagnostic tests to confirm the severity of heart disease.

### Machine Learning Techniques

### 1. Identifying the predictor variables and building the regression model: preparing the data, building the preprocessing pipeline, and training the linear regression model.

### The accuracy of a machine learning model is a measure of its ability to correctly classify instances from the dataset. In this case, we trained a machine learning model using various medical and demographic features to predict the presence of heart disease in patients. The accuracy of the model, which is approximately 54.71%, indicates that it correctly predicts the presence or absence of heart disease in about 54.71% of the cases in the test dataset.

### Decision Tree
### Finding out which variables are the most impactful in predicting the heart disease

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sb

# Evaluate the fitted decision tree on the training split.
y_train_pred = dectree.predict(X_train)

# Overall classification accuracy.
print("Train Data")
print("Accuracy  :\t", dectree.score(X_train, y_train))
print()

# Confusion matrix, indexed as [actual][predicted]. Only the cells for
# classes 0 and 1 are read.
cmTrain = confusion_matrix(y_train, y_train_pred)
tnTrain, fpTrain = cmTrain[0][0], cmTrain[0][1]  # actual 0: predicted 0 / predicted 1
fnTrain, tpTrain = cmTrain[1][0], cmTrain[1][1]  # actual 1: predicted 0 / predicted 1

# Rates over the actual-1 rows (TPR, FNR) and the actual-0 rows (TNR, FPR).
actualPositiveTrain = tpTrain + fnTrain
actualNegativeTrain = tnTrain + fpTrain

print("TPR Train :\t", (tpTrain / actualPositiveTrain))
print("TNR Train :\t", (tnTrain / actualNegativeTrain))
print()

print("FPR Train :\t", (fpTrain / actualNegativeTrain))
print("FNR Train :\t", (fnTrain / actualPositiveTrain))

# Heat map of the full confusion matrix.
sb.heatmap(cmTrain, annot=True, fmt=".0f", annot_kws={"size": 18})
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
# Evaluate the fitted decision tree on the held-out test split.
y_test_pred = dectree.predict(X_test)

# Overall classification accuracy.
print("Test Data")
print("Accuracy  :\t", dectree.score(X_test, y_test))
print()

# Confusion matrix, indexed as [actual][predicted]. Only the cells for
# classes 0 and 1 are read.
cmTest = confusion_matrix(y_test, y_test_pred)
tnTest, fpTest = cmTest[0][0], cmTest[0][1]  # actual 0: predicted 0 / predicted 1
fnTest, tpTest = cmTest[1][0], cmTest[1][1]  # actual 1: predicted 0 / predicted 1

# Rates over the actual-1 rows (TPR, FNR) and the actual-0 rows (TNR, FPR).
actualPositiveTest = tpTest + fnTest
actualNegativeTest = tnTest + fpTest

print("TPR Test :\t", (tpTest / actualPositiveTest))
print("TNR Test :\t", (tnTest / actualNegativeTest))
print()

print("FPR Test :\t", (fpTest / actualNegativeTest))
print("FNR Test :\t", (fnTest / actualPositiveTest))

# Heat map of the full confusion matrix.
sb.heatmap(cmTest, annot=True, fmt=".0f", annot_kws={"size": 18})
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


### Random Forest Classifier

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Load the dataset (replace "heart_disease_uci.csv" with the actual file path).
heart_disease_df = pd.read_csv("heart_disease_uci.csv")

# Select predictors and target. Besides the target 'num', the 'id' and
# 'dataset' columns are excluded so the feature set matches the one used by
# the other modelling cells of this notebook; errors='ignore' tolerates
# files that do not carry those columns.
X = heart_disease_df.drop(columns=['num', 'id', 'dataset'], errors='ignore')
y = heart_disease_df['num']

# Hold out 30% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Object-dtype columns are treated as categorical, everything else as numeric.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# Preprocessing: mean-impute numeric columns; impute categorical columns
# with the most frequent value and one-hot encode them.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace NaNs with most frequent value
            ('onehot', OneHotEncoder(handle_unknown='ignore'))    # Encode categorical features
        ]), categorical_cols)
    ])

# Random forest, seeded so cross-validation and test scores are reproducible.
rforest = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=42)

# Full pipeline: preprocessing followed by the classifier.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', rforest)])

# Stratified 5-fold cross-validation on the training split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
cv_results = cross_val_score(pipeline, X_train, y_train, cv=skf)
print("Cross-validation results:", cv_results)
print("Mean CV score:", cv_results.mean())

# Fit on the whole training split.
pipeline.fit(X_train, y_train)

# Predict on the test split.
y_pred = pipeline.predict(X_test)

# Test-set accuracy.
accuracy = pipeline.score(X_test, y_test)
print("Accuracy on test data:", accuracy)

# Confusion matrix and per-class report; zero_division=0 reports 0 instead
# of warning for classes that are never predicted.
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred, zero_division=0))


FileNotFoundError: [Errno 2] No such file or directory: 'heart_disease_uci.csv'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Load the heart disease dataset (replace "heart_disease_uci.csv" with your actual file path).
heart_disease_df = pd.read_csv("heart_disease_uci.csv")

# Select predictor variables and target variable.
X = heart_disease_df[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 
                      'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal']]
y = heart_disease_df['num']

# Hold out 30% of the rows as a test set. The split is seeded, like every
# other split in this notebook, so results are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Categorical columns are listed explicitly; the rest are numeric.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
numerical_cols = X.columns.drop(categorical_cols)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_cols),  # Impute missing values for numerical columns
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute most frequent value for categorical columns
            ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_cols)  # One-hot encode categorical columns
    ])

# Create the Random Forest object
rforest = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=42)

# Pipeline with preprocessing and classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', rforest)])

# Stratified 5-fold cross-validation on the training split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
cv_results = cross_val_score(pipeline, X_train, y_train, cv=skf)
print("Cross-validation results:", cv_results)
print("Mean CV score:", cv_results.mean())

# Fit Random Forest on Train Data
pipeline.fit(X_train, y_train)

# Predict on Test Data; y_pred is left for later evaluation and analysis.
y_pred = pipeline.predict(X_test)

# Now you can evaluate the model and perform further analysis as needed


### Trying to improve accuracy by hyperparameter tuning

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Split the feature columns of X by dtype.
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['number']).columns

# Numeric columns: mean imputation only.
numeric_transformer = SimpleImputer(strategy='mean')

# Categorical columns: most-frequent imputation, then one-hot encoding.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Preprocessing followed by a seeded random forest.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Search space: 100..1000 trees in steps of 100, tree depth 2..10.
param_grid = {
    'classifier__n_estimators': np.arange(100, 1001, 100),
    'classifier__max_depth': np.arange(2, 11),
}

# Exhaustive 5-fold grid search scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best estimator found by the search.
best_model = grid_search.best_estimator_

# Mean cross-validated accuracy of that best estimator.
best_score = grid_search.best_score_
print("Best Score:", best_score)
