In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

import researchpy as rp

In [2]:
url = "http://yustiks.ru/dataset/cardio_train.csv"
data = pd.read_csv(url,sep=";")
data.tail()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1
69999,99999,20540,1,170,72.0,120,80,2,1,0,0,1,0


In [3]:
y = data.iloc[:,-1].values
X = data.iloc[:,1:-2].values
f_names = data.columns[1:-2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
sc = MinMaxScaler(feature_range=(-1,1))
X_train_norm = sc.fit_transform(X_train)
X_test_norm = sc.transform(X_test)

In [5]:
model = RandomForestClassifier(n_estimators=100, n_jobs = 4)
model.fit(X_train_norm,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [6]:
model_pred = model.predict(X_test_norm)

print(classification_report(y_test, model_pred))
# model.score(X_test_norm, y_test)

              precision    recall  f1-score   support

           0       0.71      0.73      0.72      6978
           1       0.73      0.70      0.71      7022

    accuracy                           0.72     14000
   macro avg       0.72      0.72      0.72     14000
weighted avg       0.72      0.72      0.72     14000



In [7]:
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(X.shape[1]):
    print(f"{f+1}. feature '{f_names[f]}' ({importances[indices[f]]})")

Feature ranking:
1. feature 'age' (0.322203858162854)
2. feature 'gender' (0.17690681025828406)
3. feature 'height' (0.17439080605360305)
4. feature 'weight' (0.15498756384203327)
5. feature 'ap_hi' (0.08506560537893809)
6. feature 'ap_lo' (0.03815454086997226)
7. feature 'cholesterol' (0.016862032982243702)
8. feature 'gluc' (0.01601734251654094)
9. feature 'smoke' (0.008175722516253007)
10. feature 'alco' (0.007235717419277697)


In [8]:
col_name = "age"
descriptives, results = rp.ttest(data[data["cardio"]==0][col_name], data[data["cardio"]==1][col_name])
print(descriptives)
print(results)

   Variable        N          Mean           SD         SE     95% Conf.  \
0       age  35021.0  18881.623711  2473.925337  13.219722  18855.712637   
1       age  34979.0  20056.813031  2315.927546  12.382869  20032.542214   
2  combined  70000.0  19468.865814  2467.251667   9.325335  19450.588178   

       Interval  
0  18907.534786  
1  20081.083848  
2  19487.143451  
          Independent t-test     results
0  Difference (age - age) =   -1175.1893
1      Degrees of freedom =   69998.0000
2                       t =     -64.8769
3   Two side test p value =       0.0000
4  Difference < 0 p value =       0.0000
5  Difference > 0 p value =       1.0000
6               Cohen's d =      -0.4904
7               Hedge's g =      -0.4904
8           Glass's delta =      -0.4750
9                       r =       0.2382


**ВЫВОД**

P значение меньше порогового (0.05). Следовательно выборки статически различны.
Исходя из значения средних по выборкам, можно предположить, что с увеличением возраста возрастает вероятность болезней сердца