In [297]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import autograd.numpy as np_   # Thinly-wrapped version of Numpy
from autograd import grad
import numpy as np

In [298]:
# Read the dataset from the CSV file and preview its first two rows.
csv_path = 'dados_avc.csv'
df = pd.read_csv(csv_path)
df.head(2)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1


In [299]:
# Fill missing 'bmi' values with the column mean.
# The result is assigned back to df['bmi'] instead of calling a method with
# `inplace=True` on the selected column: that form is chained assignment,
# which pandas deprecates and which leaves `df` unchanged under Copy-on-Write.
# `errors='coerce'` turns any non-numeric marker such as 'N/A' into NaN, so
# the mean is always computed on a numeric column.
bmi = pd.to_numeric(df['bmi'], errors='coerce')
df['bmi'] = bmi.fillna(bmi.mean())

# One-hot encode the categorical columns. `drop_first=True` omits the first
# category of each column, so that category is represented by all-False dummies.
df = pd.get_dummies(df, columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], drop_first=True)
df.head(2)


Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,True,False,True,False,True,False,False,True,True,False,False
1,51676,61.0,0,0,202.21,28.893237,1,False,False,True,False,False,True,False,False,False,True,False


In [300]:
# Target is the 'stroke' column; the features are every remaining column
# except the record id and the numeric age / bmi / glucose measurements.
Y = df['stroke']
dropped_columns = ['stroke', 'id', 'age', 'bmi', 'avg_glucose_level']
X = df.drop(columns=dropped_columns)


In [301]:
# Preview the first row of the feature matrix.
X.head(1)

Unnamed: 0,hypertension,heart_disease,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,1,True,False,True,False,True,False,False,True,True,False,False


In [302]:
# Re-encode every feature and the target from {0, 1} / {False, True} to {-1, 1}.
# The one-hot boolean columns are cast to integers first so that a single,
# purely integer replacement (0 -> -1) covers all columns. This avoids relying
# on how `replace` matches 0/1 against booleans and on the version-dependent
# dtype handling of replacing booleans with integers.
X = X.astype('int64').replace(0, -1)
Y = Y.replace(0, -1)

print(X.head(1))
print(Y.tail(10))

   hypertension  heart_disease  gender_Male  gender_Other  ever_married_Yes   
0            -1              1            1            -1                 1  \

   work_type_Never_worked  work_type_Private  work_type_Self-employed   
0                      -1                  1                       -1  \

   work_type_children  Residence_type_Urban  smoking_status_formerly smoked   
0                  -1                     1                               1  \

   smoking_status_never smoked  smoking_status_smokes  
0                           -1                     -1  
5100   -1
5101   -1
5102   -1
5103   -1
5104   -1
5105   -1
5106   -1
5107   -1
5108   -1
5109   -1
Name: stroke, dtype: int64


In [303]:
# Show the dtypes of the feature matrix and then of the target.
for data in (X, Y):
    print(data.dtypes)

hypertension                      int64
heart_disease                     int64
gender_Male                       int64
gender_Other                      int64
ever_married_Yes                  int64
work_type_Never_worked            int64
work_type_Private                 int64
work_type_Self-employed           int64
work_type_children                int64
Residence_type_Urban              int64
smoking_status_formerly smoked    int64
smoking_status_never smoked       int64
smoking_status_smokes             int64
dtype: object
int64


In [304]:
# Cast both the features and the target from int64 to float64.
X, Y = X.astype('float64'), Y.astype('float64')
print(Y)

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
5105   -1.0
5106   -1.0
5107   -1.0
5108   -1.0
5109   -1.0
Name: stroke, Length: 5110, dtype: float64


In [305]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
split = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train, X_test, Y_train, Y_test = split


In [306]:
def loss( parametros ):
    """Mean squared error of the linear model `w.T @ x + b` against `y`.

    Parameters
    ----------
    parametros : tuple
        `(w, b, x, y)` packed into one argument: weights `w`, bias `b`,
        inputs `x` (multiplied as `w.T @ x`) and targets `y`.

    Returns
    -------
    The mean of the squared differences between the estimates and `y`.
    """
    weights, bias, inputs, targets = parametros
    residuals = (weights.T @ inputs + bias) - targets
    return np_.mean(residuals**2)

# Training data as NumPy arrays; the feature matrix is transposed so that
# `w.T @ x` in `loss` yields one estimate per sample.
x, y = np.array(X_train).T, np.array(Y_train)
# Gradient function of `loss` with respect to its (w, b, x, y) tuple argument.
g = grad(loss)


In [307]:
# Seed the random initial weights so that a fresh kernel run reproduces the
# same starting point (the original draw was unseeded).
rng = np.random.default_rng(42)
w = rng.standard_normal((x.shape[0], 1))
b = 0.0
alpha = 10**-2  # learning rate
n_steps = 15000  # number of gradient-descent iterations

# Batch gradient descent: `g` returns the gradient for every element of the
# (w, b, x, y) tuple; only the entries for `w` (index 0) and `b` (index 1)
# are used to update the parameters.
for n in range(n_steps):
    grad_ = g( (w, b, x, y) )
    w -= alpha*grad_[0]
    b -= alpha*grad_[1]


In [316]:
# Reference model: scikit-learn's LinearRegression fitted on the same split.
model = LinearRegression()
model.fit(X_train, Y_train)
# Convert the continuous regression output into labels by thresholding at 0.
# The targets are encoded as -1 / 1, so rounding would turn every prediction
# in (-0.5, 0.5) into 0, which is not a valid label and is always counted as
# an error by `accuracy_score`.
Y_pred = np.where(model.predict(X_test) >= 0, 1.0, -1.0)




In [317]:
# Show the coefficients learned by the scikit-learn model, in feature-column order.
print(model.coef_)

[ 6.80388952e-02  1.03230459e-01 -2.88057690e-03 -4.16333634e-17
  1.92982989e-02 -1.88026012e-02  1.61393472e-03  7.47992421e-03
 -1.66429928e-02  9.47675898e-03  7.01765851e-03 -5.84214650e-03
 -1.98371923e-03]


In [324]:
## create tuples with value of w and X column name
w_tuples = []
w_lin_model = []
for i in range(len(w)):
    w_tuples.append((w[i][0], X.columns[i]))
    w_lin_model.append((model.coef_[i], X.columns[i]))

## sort the tuples
w_tuples.sort(reverse=True)
w_lin_model.sort(reverse=True)


In [325]:
def accuracy(y_test, y_est):
    """Return the fraction of estimates whose sign matches the label's sign.

    Both inputs are flattened before they are compared, so `y_est` may be a
    1-D array, a row vector of shape (1, n) or a column vector of shape
    (n, 1). Without flattening, comparing an (n,) array with an (n, 1) array
    broadcasts to an (n, n) matrix and silently produces a wrong score.

    Parameters
    ----------
    y_test : array-like
        True labels; only their sign is used.
    y_est : array-like
        Model outputs; only their sign is used.

    Returns
    -------
    float
        Share of positions where `sign(y_test) == sign(y_est)`.
    """
    labels = np.sign(np.ravel(y_test))
    predictions = np.sign(np.ravel(y_est))
    return np.mean(labels == predictions)

In [326]:
print(w_tuples)

# Evaluate on dedicated test arrays. The global `x` / `y` hold the training
# data used by the gradient-descent cell; rebinding them to the test set here
# would make a later re-run of that cell train on the test data.
x_test = np.array(X_test).T
y_test = np.array(Y_test)

y_est = w.T @ x_test + b
print(f"Acurácia do modelo: {accuracy(y_test, y_est) * 100:.2f}%")

print(w_lin_model)

print(accuracy_score(Y_test, Y_pred))

[(0.10343058608609072, 'heart_disease'), (0.06814854628850556, 'hypertension'), (0.021516793594549197, 'gender_Other'), (0.019451066378811967, 'ever_married_Yes'), (0.009461622971978499, 'Residence_type_Urban'), (0.007866744183314674, 'work_type_Self-employed'), (0.007239549491757728, 'smoking_status_formerly smoked'), (0.001997292968922665, 'work_type_Private'), (-0.0017315315654562846, 'smoking_status_smokes'), (-0.0029128404897056648, 'gender_Male'), (-0.00568551604886342, 'smoking_status_never smoked'), (-0.012713086346552467, 'work_type_Never_worked'), (-0.01591282390547233, 'work_type_children')]
Acurácia do modelo: 94.19%
[(0.10323045935489358, 'heart_disease'), (0.06803889521842481, 'hypertension'), (0.019298298933304286, 'ever_married_Yes'), (0.009476758984370287, 'Residence_type_Urban'), (0.007479924208770146, 'work_type_Self-employed'), (0.00701765851320971, 'smoking_status_formerly smoked'), (0.0016139347151004155, 'work_type_Private'), (-4.163336342344337e-17, 'gender_Othe

In [327]:
from sklearn.tree import DecisionTreeClassifier

# Train the decision tree on the training split.
# `Y_train` is the target produced by the train/test split; the lowercase
# `y_train` used previously is never defined in the notebook and only worked
# through leftover kernel state.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, Y_train)

# Feature importances computed by the fitted tree.
importances = tree.feature_importances_

# Pair each feature name with its importance.
feature_importances = list(zip(X_train.columns, importances))

# Order the features from most to least important.
sorted_feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)

print("Importância das características:")
feature_importance_df = pd.DataFrame(sorted_feature_importances, columns=['Feature', 'Importance'])

feature_importance_df.head(15)


Importância das características:


Unnamed: 0,Feature,Importance
0,Residence_type_Urban,0.168211
1,gender_Male,0.138954
2,ever_married_Yes,0.137741
3,heart_disease,0.116868
4,smoking_status_never smoked,0.097313
5,smoking_status_formerly smoked,0.089234
6,hypertension,0.079136
7,work_type_Self-employed,0.070123
8,work_type_Private,0.054938
9,smoking_status_smokes,0.04365
