In [1]:
import os
import math
import json
import gzip
import time
import calendar
from datetime import datetime
import numpy as np
import pandas as pd
from datetime import date
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
SETS = './sets/' # папка с файлами с наборами

In [2]:
def default_serializer(obj): # для правильного преобразования даты в ISO формат
    if isinstance(obj, (date)):
        return obj.isoformat()
    raise TypeError(f'Object of type {obj.__class__.__name__} is not JSON serializable')
    

def load_dataset(filename): 
    if os.path.exists(SETS + filename + '.gz'):
        with gzip.open(SETS + filename + '.gz', 'rb') as gzip_ref:
            return pd.DataFrame(json.load(gzip_ref))
        
    
def save_dataset(filename):
    data = globals().get(filename)
    if data is not None:
        data = data.to_dict(orient='records')
        json_data = json.dumps(data, ensure_ascii=False, default=default_serializer)
        with gzip.open(SETS + filename + '.gz', 'wb') as gzip_file:
            gzip_file.write(json_data.encode('utf-8'))
        print(f'Сохранено {len(data)} записей в {filename}.gz')
        
pd.set_option('display.max_colwidth', None) # для отображения полного текста в ячейках
pd.set_option('display.max_columns', None) 

In [3]:
exams_ecology = load_dataset('exams_ecology')
exams_ecology['year'] = pd.to_datetime(exams_ecology['start']).dt.year

In [4]:
%%time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.utils.data import DataLoader, TensorDataset

# Load data
data = exams_ecology

# Define features and target variable
X = data[['global_id', 'year', 'stroi', 'roads']]
y = data['stupid']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert data to tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Create datasets and data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define a more complex neural network
class ImprovedNeuralNetwork(nn.Module):
    def __init__(self):
        super(ImprovedNeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(4, 512)  # Increased neurons in the first layer
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.4)  # Increased dropout rate

        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.4)

        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.dropout3 = nn.Dropout(0.3)

        self.fc4 = nn.Linear(128, 64)
        self.bn4 = nn.BatchNorm1d(64)
        self.dropout4 = nn.Dropout(0.3)

        self.fc5 = nn.Linear(64, 32)
        self.fc6 = nn.Linear(32, 16)
        self.fc7 = nn.Linear(16, 8)
        self.fc8 = nn.Linear(8, 1)

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)

        x = torch.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)

        x = torch.relu(self.bn4(self.fc4(x)))
        x = self.dropout4(x)

        x = torch.relu(self.fc5(x))
        x = torch.relu(self.fc6(x))
        x = torch.relu(self.fc7(x))
        x = self.fc8(x)
        
        return x

# Initialize the improved model
model = ImprovedNeuralNetwork()

# Hyperparameters
learning_rate = 0.0001  # Lower learning rate
epochs = 500  # Increase the number of epochs

# Initialize loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(epochs):
    model.train()
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

# Predict on the test set
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)

# Convert predictions to numpy for metric evaluation
y_pred = y_pred_tensor.numpy()

# Evaluate the model
mse_improved = mean_squared_error(y_test, y_pred)
mae_improved = mean_absolute_error(y_test, y_pred)
r2_improved = r2_score(y_test, y_pred)

print(f'MSE: {mse_improved}')
print(f'MAE: {mae_improved}')
print(f'R2: {r2_improved}')

MSE: 2.1224281170699344
MAE: 0.8747404421361281
R2: -0.014977280786252267


In [5]:
# %%time
# import tensorflow as tf
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# # Load data
# data = exams_ecology

# # Define features and target variable
# X = data[['global_id', 'year', 'stroi', 'roads']]
# y = data['stupid']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Scale the data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Build the TensorFlow model
# def build_model():
#     model = tf.keras.Sequential([
#         tf.keras.layers.Dense(512, input_shape=(4,), activation='relu'),
#         tf.keras.layers.BatchNormalization(),
#         tf.keras.layers.Dropout(0.4),
        
#         tf.keras.layers.Dense(256, activation='relu'),
#         tf.keras.layers.BatchNormalization(),
#         tf.keras.layers.Dropout(0.4),
        
#         tf.keras.layers.Dense(128, activation='relu'),
#         tf.keras.layers.BatchNormalization(),
#         tf.keras.layers.Dropout(0.3),
        
#         tf.keras.layers.Dense(64, activation='relu'),
#         tf.keras.layers.BatchNormalization(),
#         tf.keras.layers.Dropout(0.3),
        
#         tf.keras.layers.Dense(32, activation='relu'),
#         tf.keras.layers.Dense(16, activation='relu'),
#         tf.keras.layers.Dense(8, activation='relu'),
#         tf.keras.layers.Dense(1)  # Output layer
#     ])
    
#     optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
#     model.compile(optimizer=optimizer, loss='mse')
    
#     return model

# # Initialize the model
# model = build_model()

# # Callbacks
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# # Train the model
# history = model.fit(
#     X_train_scaled, y_train,
#     epochs=200,
#     batch_size=32,
#     validation_split=0.2,
#     callbacks=[early_stopping, reduce_lr],
#     verbose=1
# )

# # Predict on the test set
# y_pred = model.predict(X_test_scaled)

# # Evaluate the model
# mse_tf = mean_squared_error(y_test, y_pred)
# mae_tf = mean_absolute_error(y_test, y_pred)
# r2_tf = r2_score(y_test, y_pred)

# print(f'MSE (TensorFlow): {mse_tf}')
# print(f'MAE (TensorFlow): {mae_tf}')
# print(f'R2 (TensorFlow): {r2_tf}')