Загрузка файлов.

In [None]:
!pip install gdown
!gdown --id 1Ab-fWMNXnaEmZITs0tocuYNf5BJreqtG
!unzip -q cu-advance-project.zip -d /content/data

Downloading...
From (original): https://drive.google.com/uc?id=1Ab-fWMNXnaEmZITs0tocuYNf5BJreqtG
From (redirected): https://drive.google.com/uc?id=1Ab-fWMNXnaEmZITs0tocuYNf5BJreqtG&confirm=t&uuid=b32e2210-8624-45f9-ae94-584e03f13ca7
To: /content/cu-advance-project.zip
100% 68.7M/68.7M [00:00<00:00, 234MB/s]
replace /content/data/submission_example.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/data/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/data/train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
import pandas as pd

# Read the train, test and example-submission tables from /content/data
# in a single pass over their paths.
train, test, submission_example = map(
    pd.read_csv,
    (
        '/content/data/train.csv',
        '/content/data/test.csv',
        '/content/data/submission_example.csv',
    ),
)

Основная информация

In [None]:
# Working aliases for the loaded frames (same objects, no copy is made).
train_df, test_df, submission_df = train, test, submission_example

# Print the table dimensions and the list of training columns, one per line.
for caption, value in (
    ("Train shape:", train_df.shape),
    ("Test shape:", test_df.shape),
    ("Train columns:", train_df.columns.to_list()),
):
    print(caption, value)

Train shape: (53478, 28)
Test shape: (16712, 26)
Train columns: ['id', 'Release date', 'Required age', 'Price', 'DLC count', 'About the game', 'Supported languages', 'Full audio languages', 'Reviews', 'Header image', 'Windows', 'Mac', 'Linux', 'Achievements', 'Recommendations', 'Notes', 'Average playtime forever', 'Average playtime two weeks', 'Median playtime forever', 'Median playtime two weeks', 'Developers', 'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Positive', 'Negative']


TRAIN

Подготовка данных

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch.utils.data import Dataset

In [None]:
# df is an alias of train_df, so the columns assigned below are written into
# the same DataFrame object.
df = train_df

# === Cleaning ===
# Missing or unparseable release dates end up as year 0.
df["Release year"] = pd.to_datetime(df["Release date"], errors='coerce').dt.year.fillna(0).astype(int)
df["About the game"] = df["About the game"].fillna("")
# Developers / Publishers are filled here but are not used as features below.
df["Developers"] = df["Developers"].fillna("unknown")
df["Publishers"] = df["Publishers"].fillna("unknown")

# === Numeric and binary features ===
numeric_features = ["Required age", "Price", "DLC count", "Achievements",
                    "Recommendations", "Average playtime forever",
                    "Average playtime two weeks", "Median playtime forever",
                    "Median playtime two weeks", "Release year"]

binary_features = ["Windows", "Mac", "Linux"]

X_numeric = df[numeric_features + binary_features].fillna(0).values

# === Target ===
y = df[["Positive", "Negative"]].values.astype(np.float32)

# === Train / validation split ===
# Split row positions BEFORE fitting any preprocessing. Fitting the vectorizer
# and the scaler on all rows would let validation rows influence the
# vocabulary, IDF weights, means and variances, which makes the validation
# score optimistic (data leakage).
train_idx, val_idx = train_test_split(np.arange(len(df)), test_size=0.1, random_state=42)

# === TF-IDF over the description ===
# Vocabulary and IDF weights are learned from the training rows only; the
# fitted vectorizer is then applied to every row.
tfidf = TfidfVectorizer(max_features=256, stop_words='english')
tfidf.fit(df["About the game"].iloc[train_idx])
about_tfidf = tfidf.transform(df["About the game"]).toarray()

# === Scaling ===
# Mean / variance are estimated on the training rows only as well.
scaler = StandardScaler()
scaler.fit(X_numeric[train_idx])
X_numeric_scaled = scaler.transform(X_numeric)

# === Final feature matrix: scaled numeric/binary columns followed by TF-IDF ===
X_full = np.hstack([X_numeric_scaled, about_tfidf])

# === Materialise the two splits ===
X_train, X_val = X_full[train_idx], X_full[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

# === Dataset ===
class SteamDataset(Dataset):
    """Map-style dataset over a feature matrix and optional targets.

    Both ``X`` and ``y`` (when given) are converted to float32 tensors once,
    at construction time. Indexing yields ``(features, target)`` when targets
    were supplied and just ``features`` otherwise.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        if y is None:
            # No targets supplied: items will be bare feature tensors.
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Number of rows in the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the item at ``idx``; a pair only when targets exist."""
        features = self.X[idx]
        if self.y is None:
            return features
        return features, self.y[idx]

# === Build the datasets ===
# Wrap each (features, targets) split in a SteamDataset: training first,
# validation second.
train_dataset, val_dataset = (
    SteamDataset(features, targets)
    for features, targets in ((X_train, y_train), (X_val, y_val))
)

Построение и обучение модели

In [None]:
import torch.nn as nn
from torch.utils.data import DataLoader

# === Модель ===
class SteamNet(nn.Module):
    """Fully connected regressor: input_dim -> 256 -> 128 -> 2.

    Each hidden layer is followed by ReLU and dropout (0.2 after the first,
    0.1 after the second). The final layer has two outputs.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are listed in forward order and wrapped in one Sequential,
        # stored as ``self.net``.
        layers = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 2),  # two outputs: positive and negative
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return the two-column output."""
        prediction = self.net(x)
        return prediction

# === Training setup ===
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# The input width is taken from the number of columns in the training matrix.
model = SteamNet(input_dim=X_train.shape[1]).to(device)
criterion = nn.L1Loss()  # MAE
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# === Training ===
EPOCHS = 15
for epoch in range(EPOCHS):
    # Training mode (dropout layers are active during this pass).
    model.train()
    train_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = criterion(pred, yb)

        # Standard update step: clear old gradients, backpropagate, apply.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that the division by the
        # dataset size below gives a per-sample average even when the last
        # batch is smaller.
        train_loss += loss.item() * xb.size(0)

    # Validation: evaluation mode and no gradient tracking.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            val_loss += loss.item() * xb.size(0)

    # Turn the accumulated sums into per-sample averages and report them.
    # Note: the train figure is measured in train mode and while the weights
    # are still changing, so it is not directly comparable to the val figure.
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    print(f"[{epoch+1}/{EPOCHS}] Train MAE: {train_loss:.2f} | Val MAE: {val_loss:.2f}")

[1/15] Train MAE: 494.59 | Val MAE: 406.90
[2/15] Train MAE: 365.54 | Val MAE: 210.24
[3/15] Train MAE: 225.12 | Val MAE: 185.06
[4/15] Train MAE: 216.80 | Val MAE: 185.37
[5/15] Train MAE: 217.61 | Val MAE: 184.79
[6/15] Train MAE: 216.70 | Val MAE: 180.82
[7/15] Train MAE: 216.07 | Val MAE: 181.58
[8/15] Train MAE: 214.52 | Val MAE: 180.93
[9/15] Train MAE: 214.20 | Val MAE: 180.12
[10/15] Train MAE: 214.44 | Val MAE: 184.36
[11/15] Train MAE: 217.45 | Val MAE: 179.84
[12/15] Train MAE: 213.18 | Val MAE: 181.24
[13/15] Train MAE: 211.76 | Val MAE: 179.37
[14/15] Train MAE: 214.49 | Val MAE: 179.51
[15/15] Train MAE: 213.86 | Val MAE: 179.27


TEST

In [None]:
# === Test data ===
# test_df is an alias of test, so the columns assigned below are written into
# the same DataFrame object.
test_df = test

# Same cleaning steps as for the training frame.
test_df["Release year"] = pd.to_datetime(test_df["Release date"], errors='coerce').dt.year.fillna(0).astype(int)
test_df["About the game"] = test_df["About the game"].fillna("")
test_df["Developers"] = test_df["Developers"].fillna("unknown")
test_df["Publishers"] = test_df["Publishers"].fillna("unknown")

# TF-IDF: transform only, never re-fit on the test set.
about_tfidf_test = tfidf.transform(test_df["About the game"]).toarray()

# Reuse the already-fitted scaler for the numeric/binary columns.
X_numeric_test = test_df[numeric_features + binary_features].fillna(0).values
X_numeric_test_scaled = scaler.transform(X_numeric_test)

# Same column layout as in training: scaled numeric block, then TF-IDF block.
X_test_full = np.hstack([X_numeric_test_scaled, about_tfidf_test])

# PyTorch dataset without targets: items are bare feature tensors.
test_dataset = SteamDataset(X_test_full)

# === Prediction ===
# No shuffling, so predictions come out in the row order of test_df.
test_loader = DataLoader(test_dataset, batch_size=32)
model.eval()
all_preds = []

with torch.no_grad():
    for xb in test_loader:
        xb = xb.to(device)
        preds = model(xb).cpu().numpy()
        all_preds.append(preds)

all_preds = np.vstack(all_preds)

# The final linear layer is unbounded and can emit negative values.
# Positive / Negative are presumably review counts and therefore never below
# zero (TODO confirm); under that assumption clipping at 0 can only move a
# prediction closer to the true value, so it cannot worsen MAE.
all_preds = np.clip(all_preds, 0, None)

# === Submission ===
# NOTE(review): this assigns predictions positionally, i.e. it assumes the rows
# of submission_df are in the same order as the rows of test_df - confirm.
submission = submission_df
submission[["Positive", "Negative"]] = all_preds
submission_path = "submission_the_lastest.csv"
submission.to_csv(submission_path, index=False)

# Report the file that was actually written.
print(f"{submission_path} создан!")

Submission.csv создан!
