In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the stroke dataset and one-hot encode its categorical columns.
# drop_first=True keeps k-1 indicator columns for a category with k levels.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df = pd.get_dummies(pd.read_csv('dados_avc.csv'), columns=categorical_columns, drop_first=True)

# The target is the `stroke` column; `id`, `age`, `bmi` and `avg_glucose_level`
# are left out of this first feature matrix.
y = df['stroke']
X = df.drop(columns=['stroke', 'id', 'age', 'bmi', 'avg_glucose_level'])

X.head(15)


In [None]:
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling: the scaler must learn its mean/std from the training rows
# only. Fitting it on the full dataset lets statistics of the held-out rows leak
# into the features the models are trained on.
X_train, X_test, y_train, y_test = train_test_split(
    X.astype(float), y.astype(float), test_size=0.3, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics come from the training split only
X_test = scaler.transform(X_test)        # held-out rows reuse the training statistics

print(X_train.shape)
print(y_train.shape)


In [None]:
import autograd.numpy as np_  # Thinly-wrapped version of Numpy
from autograd import grad
import numpy as np


def loss(parametros):
    """Mean squared error of a linear model.

    Parameters
    ----------
    parametros : tuple
        ``(w, b, pontos, val)``: weight column vector, bias, input matrix and
        the 1-D target values.

    Returns
    -------
    The mean of the squared differences between ``pontos @ w + b`` and the
    targets.
    """
    weights, bias, inputs, targets = parametros
    # Turn the targets into a float column so they line up with the (n, 1) estimates.
    target_column = targets.reshape(-1, 1).astype(float)
    residuals = (np_.dot(inputs, weights) + bias) - target_column
    return np_.mean(residuals ** 2)

# Gradient of the loss with respect to every entry of the parameter tuple.
g = grad(loss)

# Hand plain float ndarrays to autograd. y_train comes out of train_test_split as
# a pandas Series, which does not offer the ndarray `.reshape` used inside `loss`
# and is not an array type autograd works with.
pontos = np.asarray(X_train, dtype=float)
alvos = np.asarray(y_train, dtype=float)

# Seeded generator so the initial weights, and therefore the fitted
# coefficients, are the same on every run.
rng = np.random.default_rng(42)
w = rng.standard_normal((pontos.shape[1], 1))
b = np.float64(0.0)

alpha = 10 ** -3  # learning rate

# Batch gradient descent: only the first two gradient entries (w and b) are used.
for n in range(10000):
    grad_ = g((w, b, pontos, alvos))
    w -= alpha * grad_[0]
    b -= alpha * grad_[1]

print(w)
print(b)





In [None]:
# Pair every gradient-descent weight with the name of its feature.
coef_df = pd.DataFrame(w, columns=['coeficiente'], index=X.columns)

# Rank by magnitude: sort on a temporary |coefficient| column, then discard it.
coef_df = (
    coef_df
    .assign(coef_abs=coef_df['coeficiente'].abs())
    .sort_values(by='coef_abs', ascending=False)
    .drop(columns='coef_abs')
)

coef_df.head(20)


In [None]:
from sklearn.linear_model import LinearRegression

# Cast the training data to float before fitting.
X_train, y_train = X_train.astype(float), y_train.astype(float)

# Fit scikit-learn's linear regression on the same training split.
lin_reg = LinearRegression().fit(X_train, y_train)

# Show the fitted coefficients, then the intercept.
print(lin_reg.coef_)
print(lin_reg.intercept_)




In [None]:
# Pair every scikit-learn coefficient with the name of its feature.
coef_df_sklearn = pd.DataFrame(lin_reg.coef_, columns=['coeficiente'], index=X.columns)

# Rank by magnitude: sort on a temporary |coefficient| column, then discard it.
coef_df_sklearn = (
    coef_df_sklearn
    .assign(coef_abs=coef_df_sklearn['coeficiente'].abs())
    .sort_values(by='coef_abs', ascending=False)
    .drop(columns='coef_abs')
)

coef_df_sklearn.head()


In [None]:
def calc_accuracy(y_true, y_pred, threshold=0.5):
    """Accuracy of continuous predictions after binarizing them.

    Parameters
    ----------
    y_true : array-like
        Reference class labels.
    y_pred : array-like
        Continuous model outputs; values strictly above ``threshold`` become
        class 1, everything else class 0.
    threshold : float, default 0.5
        Cut-off used to binarize ``y_pred``.

    Returns
    -------
    The value returned by ``accuracy_score`` for the binarized predictions.
    """
    above_cutoff = y_pred > threshold
    return accuracy_score(y_true, above_cutoff.astype(int))

# Linear regression fitted by the hand-written gradient descent (weights w, bias b)
y_pred_numpy = X_test.astype(float) @ w + b
accuracy_numpy = calc_accuracy(y_true=y_test.astype(int), y_pred=y_pred_numpy)
print(f"Acurácia do modelo de regressão linear com numpy: {accuracy_numpy * 100:.2f}%")

In [None]:
# Linear regression fitted with scikit-learn, evaluated on the same held-out rows
y_pred_sklearn = lin_reg.predict(X_test.astype(float))
accuracy_sklearn = calc_accuracy(y_true=y_test.astype(int), y_pred=y_pred_sklearn)
print(f"Acurácia do modelo de regressão linear com scikit-learn: {accuracy_sklearn * 100:.2f}%")

In [None]:
# Logistic regression classifier; max_iter is set to 1000 (raise it further if needed).
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model


In [None]:
# Accuracy of the classifier on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Acurácia do modelo: {accuracy * 100:.2f}%")


In [None]:
# Coefficients (weights) of the trained model: first row of coef_
coeficientes = model.coef_[0]

print(model.classes_)

# One row per feature with its coefficient and the coefficient's absolute value,
# ordered from the largest absolute value to the smallest.
coeficientes_features = (
    pd.DataFrame({"Feature": X.columns, "Coeficiente": coeficientes})
    .assign(Coeficiente_abs=lambda frame: frame["Coeficiente"].abs())
    .sort_values("Coeficiente_abs", ascending=False)
)

# Show the features with the largest coefficients (in absolute value)
coeficientes_features.head(40)


In [None]:
# Does dropping the "lives in the city" column change the model's accuracy?
# NOTE(review): unlike the first feature matrix, this X keeps `id`, `age`, `bmi`
# and `avg_glucose_level` and is not standardized, so the accuracy computed from
# it differs from the earlier run by more than the removed column. Confirm this
# is intended.
X = df.drop(columns=['stroke', 'Residence_type_Urban'])
y = df['stroke']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# max_iter is set to 1000 (raise it further if needed)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model


In [None]:
# Accuracy of the refitted classifier on its held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Acurácia do modelo: {accuracy * 100:.2f}%")

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train the decision tree on the current training split
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Importance score the fitted tree assigns to each feature
importances = tree.feature_importances_

# (feature name, importance) pairs, in column order
feature_importances = [(name, score) for name, score in zip(X_train.columns, importances)]

# Highest importance first; list.sort is stable, so ties keep their column order
sorted_feature_importances = list(feature_importances)
sorted_feature_importances.sort(key=lambda pair: pair[1], reverse=True)

print("Importância das características:")
feature_importance_df = pd.DataFrame(
    sorted_feature_importances,
    columns=['Feature', 'Importance'],
)

feature_importance_df.head(15)
