In [1]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ucimlrepo
# The package is spelled `ucimlrepo` (as in the import above); the previous
# `ucmlrepo` spelling aborted this cell before the remaining imports ran.
from ucimlrepo import fetch_ucirepo
import numpy as np
from sklearn.datasets import fetch_openml
import csv

ModuleNotFoundError: No module named 'pandas'

In [None]:

# Read the local data file. It has no header row, so pandas numbers the
# columns; column 0 is used as the target below and the rest as features.
data = pd.read_csv('./agaricus-lepiota.data', header=None)

# The OpenML copy of the dataset is fetched only for its feature names.
mushroom = fetch_openml(name='mushroom', version=1)
names = pd.Index(mushroom.feature_names)

# Drop every row that contains a '?' in any column
# (presumably the file's missing-value marker -- TODO confirm).
data = data[~data.isin(['?']).any(axis=1)]

two_dimensional_array = data.values.tolist()

y = [row[0] for row in two_dimensional_array]
X = [row[1:] for row in two_dimensional_array]

# Preview a handful of rows instead of dumping the whole dataset to the output.
for x_val, y_val in zip(X[:5], y[:5]):
    print(x_val, y_val)
print(f"{len(X)} rows kept")

print(names)




In [None]:
import numpy as np

def entropy(labels):
    """Return the Shannon entropy (in bits) of the values in ``labels``.

    The probability of each distinct value is its number of occurrences
    divided by ``len(labels)``.
    """
    _, occurrences = np.unique(labels, return_counts=True)
    p = occurrences / len(labels)
    log_p = np.log2(p)
    return -np.sum(p * log_p)

def gain_ratio(features, target):
    """Return the gain ratio of one feature column with respect to ``target``.

    The gain ratio is the information gain (entropy of ``target`` minus the
    count-weighted entropy of ``target`` within each distinct feature value)
    divided by the split information (the entropy of ``features`` itself).

    Parameters
    ----------
    features : array-like, 1-D
        Values of a single feature, one per sample.
    target : array-like, 1-D
        Class labels aligned with ``features``.

    Returns
    -------
    float
        The gain ratio; ``0.0`` when the split information is zero (the
        feature has a single distinct value, so it cannot split the data).
    """
    # Coerce to arrays so the boolean-mask indexing below also works for lists.
    features = np.asarray(features)
    target = np.asarray(target)

    split_info = entropy(features)
    if split_info == 0:
        # A constant feature carries no information; avoid dividing by zero.
        return 0.0

    total_entropy = entropy(target)
    unique_values, counts = np.unique(features, return_counts=True)
    weighted_entropy = np.sum([
        (count / len(features)) * entropy(target[features == value])
        for value, count in zip(unique_values, counts)
    ])
    information_gain = total_entropy - weighted_entropy
    return information_gain / split_info

# Array views of the feature rows and labels, so columns can be sliced out.
X_array = np.array(X)
y_array = np.array(y)

# One gain ratio per feature: each row of the transpose is one feature column.
gain_ratios = [gain_ratio(column, y_array) for column in X_array.T]

# Pair every feature name with its score, highest score first.
sorted_data = sorted(
    zip(names, gain_ratios),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

for feature_name, gain_ratio_value in sorted_data:
    print(f"{feature_name}: {gain_ratio_value}")

In [None]:
# Column position (within X_array) of every feature, in descending gain-ratio order.
index = [names.get_loc(ranked_name) for ranked_name, _ in sorted_data]

# Positions in the gain-ratio ranking of the features to keep; chosen so that
# as many rows as possible remain unique.
selected_ranks = [0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 17, 18, 19, 20]
X_array_selected = X_array[:, [index[rank] for rank in selected_ranks]]

# Append the label as the last column so features and label are deduplicated together.
X_and_y = np.hstack((X_array_selected, y_array.reshape(-1, 1)))

# Remove duplicate rows.
X_and_y = np.unique(X_and_y, axis=0)

# np.unique returns the rows sorted, and the train/test split further down is
# positional (first 1600 rows vs. the rest). Shuffle with a fixed seed so both
# parts are representative while the run stays reproducible.
shuffle_rng = np.random.default_rng(1)
X_and_y = X_and_y[shuffle_rng.permutation(len(X_and_y))]

print(X_and_y[0])
print(X_array_selected.shape)

y_array_filtered = X_and_y[:, -1].astype(y_array.dtype)
X_array_selected = X_and_y[:, :-1]

print(y_array_filtered.shape)
print(X_array_selected.shape)


# Data for the neural network: preview a few rows rather than printing all of them.
for x_val, y_val in zip(X_array_selected[:5], y_array_filtered[:5]):
    print(x_val, y_val)


In [None]:
# Convert letters to numbers: each character becomes its offset from 'a'.
X_numeric = np.array([[ord(char) - ord('a') for char in row] for row in X_array_selected])
# Standardise using a single mean and standard deviation taken over the whole
# matrix (not per column).
X_numeric = (X_numeric - X_numeric.mean()) / X_numeric.std()

# Label column vector: 'e' -> 1, every other value (including 'p') -> 0.
y_numeric = (y_array_filtered == 'e').astype(int).reshape(-1, 1)

# Preview a few encoded rows rather than printing all of them.
for x_val, y_val in zip(X_numeric[:5], y_numeric[:5]):
    print(x_val, y_val)


In [None]:
# Train the network

def sigmoid(x):
    """Apply the logistic function ``1 / (1 + e**-x)`` element-wise to ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def sigmoid_derivative(x):
    """Return ``x * (1 - x)``.

    This equals the derivative of the sigmoid when ``x`` is already a sigmoid
    *output* (an activation), which is how backward_propagation calls it.
    """
    complement = 1 - x
    return complement * x


def initialize_weights_and_biases(layer_sizes, seed=1):
    """Create uniformly random weights and biases for a fully connected network.

    Parameters
    ----------
    layer_sizes : sequence of int
        Number of units in each layer, input layer first.
    seed : int or None, default 1
        Seed passed to ``np.random.seed`` before drawing. Note that this
        reseeds NumPy's *global* random state. The default reproduces the
        previously hard-coded seed.

    Returns
    -------
    (weights, biases) : tuple of lists
        ``weights[i]`` has shape ``(layer_sizes[i], layer_sizes[i + 1])`` and
        ``biases[i]`` has shape ``(1, layer_sizes[i + 1])``; values are drawn
        from ``np.random.uniform`` with its default range. Both lists are
        empty when fewer than two layer sizes are given.
    """
    np.random.seed(seed)
    weights = []
    biases = []
    # Draw order (weights, then biases, layer by layer) determines the values
    # obtained for a given seed, so keep it stable.
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        weights.append(np.random.uniform(size=(fan_in, fan_out)))
        biases.append(np.random.uniform(size=(1, fan_out)))
    return weights, biases


def forward_propagation(X, weights, biases):
    """Run ``X`` through every layer and return all activations.

    Each layer computes ``sigmoid(previous_output . weights[i] + biases[i])``.

    Returns a list whose first element is ``X`` itself, followed by the output
    of each layer in order; the last element is the network output.
    """
    activations = [X]
    for layer, layer_weights in enumerate(weights):
        pre_activation = np.dot(activations[-1], layer_weights) + biases[layer]
        activations.append(sigmoid(pre_activation))
    return activations


def backward_propagation(X, y, layer_outputs, weights, biases, learning_rate):
    """Perform one gradient step from the activations of a forward pass.

    Parameters
    ----------
    X : ndarray
        Input batch; only ``X.shape[0]`` (the sample count) is used here.
    y : ndarray
        Targets, broadcast-compatible with ``layer_outputs[-1]``.
    layer_outputs : list of ndarray
        Activations as returned by ``forward_propagation`` (input first).
    weights, biases : list of ndarray
        Layer parameters. They are updated **in place** and also returned.
    learning_rate : float
        Step size; the summed gradient is additionally divided by the number
        of samples.

    Returns
    -------
    (weights, biases)
        The same (mutated) lists that were passed in.
    """
    num_layers = len(weights)
    num_samples = X.shape[0]

    # Phase 1: compute every layer's delta with the weights that produced the
    # forward pass. Updating a layer before propagating the error through it
    # would mix old activations with new weights.
    output_layer_output = layer_outputs[-1]
    output_error = y - output_layer_output
    layer_delta = output_error * sigmoid_derivative(output_layer_output)
    deltas = [None] * num_layers
    for i in range(num_layers - 1, -1, -1):
        if i != num_layers - 1:
            layer_error = layer_delta.dot(weights[i + 1].T)
            layer_delta = layer_error * sigmoid_derivative(layer_outputs[i + 1])
        deltas[i] = layer_delta

    # Phase 2: apply the updates. The error is (target - output), so adding
    # the term moves the output toward the target.
    for i in range(num_layers):
        weights[i] += layer_outputs[i].T.dot(deltas[i]) * learning_rate / num_samples
        biases[i] += np.sum(deltas[i], axis=0, keepdims=True) * learning_rate / num_samples
    return weights, biases


def train_network(X, y, layer_sizes, learning_rate, epochs, preview_rows=5):
    """Train a sigmoid network with full-batch gradient steps.

    Parameters
    ----------
    X, y : ndarray
        Training inputs and targets.
    layer_sizes : sequence of int
        Units per layer, input layer first.
    learning_rate : float
        Step size passed to ``backward_propagation``.
    epochs : int
        Number of forward/backward passes over the whole of ``X``.
    preview_rows : int or None, default 5
        How many ``target, output`` pairs to print after training;
        ``None`` prints every row.

    Returns
    -------
    (weights, biases)
        The trained parameters.
    """
    weights, biases = initialize_weights_and_biases(layer_sizes)
    for _ in range(epochs):
        layer_outputs = forward_propagation(X, weights, biases)
        weights, biases = backward_propagation(X, y, layer_outputs, weights, biases, learning_rate)

    # Use a fresh forward pass for the report: it reflects the final weights
    # and is also defined when epochs == 0.
    final_outputs = forward_propagation(X, weights, biases)[-1]
    shown_outputs = final_outputs if preview_rows is None else final_outputs[:preview_rows]
    for target_value, output_value in zip(y, shown_outputs):
        print(target_value, output_value)
    print("Training complete!")
    return weights, biases


# Training portion: the first 1600 rows of the encoded data.
train_rows = slice(1600)
X_train, y_train = X_numeric[train_rows], y_numeric[train_rows]

# Hyperparameters for the training run.
learning_rate, epochs = 0.7, 1000

# One input unit per feature column, a hidden layer of 15 units, one output unit.
layer_sizes = [X_train.shape[1], 15, 1]

weights, biases = train_network(X_train, y_train, layer_sizes, learning_rate, epochs)



In [None]:
# Hold-out portion: every row from position 1600 onward (the rows not used
# for training above).
holdout_rows = slice(1600, None)
X_test, y_test = X_numeric[holdout_rows], y_numeric[holdout_rows]


def predict(X, y, weights, biases):
    """Run the network on ``X`` and pair its outputs with the labels ``y``.

    Parameters
    ----------
    X : ndarray
        Inputs to classify.
    y : ndarray
        True labels, one row per sample; stacked next to the predictions.
    weights, biases : list of ndarray
        Trained parameters as returned by ``train_network``.

    Returns
    -------
    result : ndarray
        ``y`` column-stacked with the thresholded prediction
        (0 if the output is < 0.5, otherwise 1).
    preds : ndarray
        ``y`` column-stacked with the raw network output.
    """
    outputs = forward_propagation(X, weights, biases)[-1]
    # Threshold the whole output array at once instead of row by row.
    predictions = np.where(outputs < 0.5, 0, 1)
    result = np.column_stack((y, predictions))
    preds = np.column_stack((y, outputs))
    return result, preds

# `predictions` holds (true label, thresholded prediction) per row;
# `p` holds (true label, raw network output).
predictions, p = predict(X_test, y_test, weights, biases)

# Number of test rows whose predicted class matches the true label.
count = sum(int(row[0] == row[1]) for row in predictions)


In [None]:
# Column 0 holds the true label, column 1 the thresholded prediction.
actual = predictions[:, 0]
predicted = predictions[:, 1]

# Confusion-matrix counts with label 1 treated as the positive class.
TP = np.sum(np.logical_and(actual == 1, predicted == 1))
FP = np.sum(np.logical_and(actual == 0, predicted == 1))
FN = np.sum(np.logical_and(actual == 1, predicted == 0))
TN = np.sum(np.logical_and(actual == 0, predicted == 0))

# Guard each denominator: with no predicted (or no actual) positives the plain
# division would give nan plus a RuntimeWarning; report 0.0 instead.
precision = TP / (FP + TP) if (FP + TP) > 0 else 0.0
recall = TP / (FN + TP) if (FN + TP) > 0 else 0.0
f1 = 2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0.0

print("Precision = ", precision)
print("Recall = ", recall)
print("F1-score = ", f1)