<a href="https://colab.research.google.com/github/cerebraters/predict_V2/blob/main/TESTpredict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install catboost
!wget https://raw.githubusercontent.com/cerebraters/predict_V2/refs/heads/main/ai.json
!wget https://raw.githubusercontent.com/cerebraters/predict_V2/refs/heads/main/output.json

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
--2025-07-09 23:47:57--  https://raw.githubusercontent.com/cerebraters/predict_V2/refs/heads/main/ai.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26197605 (25M) [text/plain]
Saving to: ‘ai.json’


2025-07-09 23:47:58 (85.0 MB/s) - ‘ai.json’ saved [26197605/26197605]

--2025-07-09 23:47:58--  https://raw.githubusercontent.com/cerebraters/predict_V2/refs/heads/main/output.json
Re

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from collections import defaultdict
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    r2_score,
    accuracy_score,
    f1_score,
    top_k_accuracy_score
)
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor
import joblib
import os
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Dict, Any, Optional, Union
from tqdm import tqdm

class PurchaseCycleAnalyzer:
    def __init__(self):
        self.item_cycles = defaultdict(dict)
        self.category_cycles = defaultdict(dict)

    def update_cycles(self, df: pd.DataFrame):

        print("\nAnalyzing purchase cycles...")

        for item, group in tqdm(df.groupby('goodsName'), desc="Items"):
            day_counts = group['day_of_week'].value_counts(normalize=True)
            for day, freq in day_counts.items():
                if freq > 0.25:
                    self.item_cycles[item][day] = freq

        for category, group in tqdm(df.groupby('categoryName'), desc="Categories"):
            day_counts = group['day_of_week'].value_counts(normalize=True)
            for day, freq in day_counts.items():
                if freq > 0.3:
                    self.category_cycles[category][day] = freq

    def get_cycle_weight(self, item: str, category: str, day_of_week: int) -> float:

        item_weight = self.item_cycles.get(item, {}).get(day_of_week, 1.0)
        category_weight = self.category_cycles.get(category, {}).get(day_of_week, 1.0)
        return (item_weight + category_weight) / 2

class TimeFeaturesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.features = ['hour', 'day_of_week', 'month']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.features)

        result = np.zeros((X.shape[0], 6))
        result[:, 0] = np.sin(2 * np.pi * X['hour']/24)
        result[:, 1] = np.cos(2 * np.pi * X['hour']/24)
        result[:, 2] = np.sin(2 * np.pi * X['day_of_week']/7)
        result[:, 3] = np.cos(2 * np.pi * X['day_of_week']/7)
        result[:, 4] = np.sin(2 * np.pi * (X['month']-1)/12)
        result[:, 5] = np.cos(2 * np.pi * (X['month']-1)/12)

        return result

class PlaceHistoryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, user_history: Dict[str, List[Dict]], n_last_visits: int = 5):
        self.user_history = user_history
        self.n_last_visits = n_last_visits
        self.place_to_idx = {}
        self.num_places = 0

    def fit(self, X, y=None):
        all_places = set()
        for visits in self.user_history.values():
            for visit in visits[:self.n_last_visits]:
                all_places.add(visit['place'])

        self.place_to_idx = {place: idx for idx, place in enumerate(sorted(all_places))}
        self.num_places = len(self.place_to_idx)
        return self

    def transform(self, X):
        features = np.zeros((X.shape[0], self.num_places))

        for i, row in enumerate(X['customerId']):
            last_places = [visit['place'] for visit in
                         sorted(self.user_history.get(row, []),
                         key=lambda x: x['datetime'], reverse=True)[:self.n_last_visits]]

            for place in last_places:
                if place in self.place_to_idx:
                    features[i, self.place_to_idx[place]] += 1

        return features

class CustomerBehaviorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, user_history: Dict[str, List[Dict]]):
        self.user_history = user_history
        self.user_stats = {}

    def fit(self, X, y=None):
        print("\nCalculating customer behavior statistics...")
        for user_id, history in tqdm(self.user_history.items(), desc="Users"):
            stats = {}
            if history:
                hours = [h['datetime'].hour for h in history]
                stats['avg_hour'] = np.mean(hours) if hours else 12
                stats['hour_std'] = np.std(hours) if len(hours) > 1 else 0

                stats['total_visits'] = len(history)
                time_diff = (history[-1]['datetime'] - history[0]['datetime']).days
                stats['visit_freq'] = stats['total_visits'] / (time_diff + 1) if time_diff >= 0 else 0

                categories = [h.get('categoryName', '') for h in history]
                if categories:
                    stats['fav_category'] = max(set(categories), key=categories.count)
                else:
                    stats['fav_category'] = 'unknown'

            else:
                stats = {
                    'avg_hour': 12,
                    'hour_std': 0,
                    'total_visits': 0,
                    'visit_freq': 0,
                    'fav_category': 'unknown'
                }

            self.user_stats[user_id] = stats
        return self

    def transform(self, X):
        features = np.zeros((X.shape[0], 5))

        for i, (_, row) in enumerate(X.iterrows()):
            user_id = row['customerId']
            stats = self.user_stats.get(user_id, {})

            features[i, 0] = abs(row['hour'] - stats.get('avg_hour', 12)) / 24

            features[i, 1] = stats.get('hour_std', 0) / 12

            features[i, 2] = 1 if stats.get('fav_category', '') == row['categoryName'] else 0

            features[i, 3] = np.log1p(stats.get('total_visits', 0))

            features[i, 4] = stats.get('visit_freq', 0)

        return features

class PersonalItemFeatures(BaseEstimator, TransformerMixin):

    def __init__(self, user_history: Dict[str, List[Dict]]):
        self.user_history = user_history
        self.user_categories = defaultdict(set)  # Используем set для быстрого поиска
        self.co_occurrence = defaultdict(int)
        self._is_fitted = False

    def fit(self, X, y=None):
        print("\nExtracting personal item features...")
        for user_id, history in self.user_history.items():
            categories = {item.get('category', '').split('|')[0] for item in history if '|' in item.get('category', '')}
            self.user_categories[user_id].update(categories)

        try:
            for user_id, history in self.user_history.items():
                items = [item['item'] for item in history if 'item' in item]
                for i in range(len(items)-1):
                    pair = tuple(sorted([items[i], items[i+1]]))
                    self.co_occurrence[pair] += 1
        except Exception as e:
            print(f"Warning: co-occurrence extraction failed: {str(e)}")

        self._is_fitted = True
        return self

    def transform(self, X):
        if not self._is_fitted:
            raise RuntimeError("Transformer must be fitted first")

        n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
        features = np.zeros((n_samples, 3))

        for i in range(n_samples):
            try:
                row = X.iloc[i] if hasattr(X, 'iloc') else X[i]
                user_id = str(row['customerId'])
                current_category = str(row.get('categoryName', ''))

                features[i, 0] = 1 if current_category in self.user_categories.get(user_id, set()) else 0

                if user_id in self.user_history:
                    features[i, 1] = sum(
                        1 for item in self.user_history[user_id]
                        if str(item.get('category', '')).startswith(current_category)
                    )

                if hasattr(row, 'goodsName'):
                    item_name = str(row['goodsName'])
                    features[i, 2] = sum(
                        self.co_occurrence.get(tuple(sorted([item_name, hist_item.get('item', '')])), 0)
                        for hist_item in self.user_history.get(user_id, [])
                    )

            except Exception as e:
                print(f"Warning: error processing sample {i}: {str(e)}")
                features[i, :] = 0

        return features

class TimePredictor:
    def __init__(self):
        self.model = CatBoostRegressor(
            iterations=150,
            depth=8,
            learning_rate=0.1,
            loss_function='MAE',
            verbose=100,
            task_type='CPU'
        )

        self.preprocessor = ColumnTransformer([
            ('customer', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['customerId']),
            ('place', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['pointName']),
            ('time', TimeFeaturesTransformer(), ['hour', 'day_of_week', 'month'])
        ])

        self.pipeline = Pipeline([
            ('preprocessor', self.preprocessor),
            ('regressor', self.model)
        ])

    def train(self, df: pd.DataFrame):
        X = df[['customerId', 'hour', 'day_of_week', 'month', 'pointName']]
        y = df['hour']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        print("\nTraining Time Model...")
        self.pipeline.fit(X_train, y_train)

        y_pred = self.pipeline.predict(X_test)
        print(f"\nTime Model Metrics:")
        print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f} hours")
        print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")

    def save(self, path: str):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        joblib.dump(self, path)
        print(f"Time model saved to {path}")

class EnhancedPlacePredictor:
    def __init__(self, user_history: Dict[str, List[Dict]]):
        self.user_history = user_history
        self.model = self._init_model()
        self.preprocessor = self._init_preprocessor()

    def _init_model(self):
        return CatBoostClassifier(
            iterations=2000,
            depth=10,
            learning_rate=0.05,
            auto_class_weights='Balanced',
            verbose=100,
            task_type='CPU',
            eval_metric='TotalF1:average=Macro',
            early_stopping_rounds=200
        )

    def _init_preprocessor(self):
        return ColumnTransformer([
            ('customer', OneHotEncoder(
                handle_unknown='ignore',
                sparse_output=False,
                max_categories=7000),
             ['customerId']),
            ('time', TimeFeaturesTransformer(), ['hour', 'day_of_week', 'month']),
            ('time_of_day', OneHotEncoder(
                handle_unknown='ignore',
                sparse_output=False),
             ['time_of_day']),
            ('history', PlaceHistoryEncoder(
                user_history=self.user_history,
                n_last_visits=5),
             ['customerId', 'hour', 'day_of_week', 'month'])
        ])

    def train(self, df: pd.DataFrame):
        df['time_of_day'] = pd.cut(
            df['hour'],
            bins=[0, 6, 12, 18, 24],
            labels=['night', 'morning', 'afternoon', 'evening'],
            right=False
        )
        place_counts = df['pointName'].value_counts()
        valid_places = place_counts[place_counts >= 5].index
        df = df[df['pointName'].isin(valid_places)]

        print(f"\nTraining on {len(df)} samples with {len(valid_places)} unique places")

        X = df[['customerId', 'hour', 'day_of_week', 'month', 'time_of_day']]
        y = df['pointName']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print("\nTraining Place Model...")
        self.model.fit(
            self.preprocessor.fit_transform(X_train, y_train),
            y_train,
            eval_set=(self.preprocessor.transform(X_test), y_test),
            plot=True
        )

        self._evaluate(X_test, y_test)

    def _evaluate(self, X_test, y_test):
        print("\nPlace Model Evaluation:")
        X_test_processed = self.preprocessor.transform(X_test)
        y_pred = self.model.predict(X_test_processed)
        y_proba = self.model.predict_proba(X_test_processed)

        print(classification_report(y_test, y_pred, zero_division=0))

        for k in [1, 3, 5]:
            top_k_acc = np.mean([
                y_test.iloc[i] in y_proba[i].argsort()[-k:]
                for i in range(len(y_test))
            ])
            print(f"Top-{k} Accuracy: {top_k_acc:.4f}")

        self._plot_confusion_matrix(y_test, y_pred)

    def _plot_confusion_matrix(self, y_test, y_pred):
        top_places = y_test.value_counts().nlargest(15).index
        cm = confusion_matrix(
            y_test[y_test.isin(top_places)],
            y_pred[y_test.isin(top_places)],
            labels=top_places,
            normalize='true'
        )

        plt.figure(figsize=(15, 12))
        sns.heatmap(
            cm,
            annot=True,
            fmt='.2f',
            xticklabels=top_places,
            yticklabels=top_places,
            cmap='Blues'
        )
        plt.title('Confusion Matrix for Top 15 Places')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig('place_confusion_matrix.png', dpi=300)
        plt.close()

    def save(self, path: str):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        joblib.dump(self, path)
        print(f"Enhanced place model saved to {path}")

class EnhancedItemRecommender:
    def __init__(self, user_history: Dict[str, List[Dict]], top_k: int = 5):
        self.top_k = top_k
        self.user_history = user_history
        self.cycle_analyzer = PurchaseCycleAnalyzer()
        self.model = CatBoostClassifier(
            iterations=150,
            depth=5,
            learning_rate=0.05,
            loss_function='MultiClass',
            verbose=10,
            task_type='CPU',
            eval_metric='TotalF1:average=Macro',
            early_stopping_rounds=50
        )

        self.preprocessor = ColumnTransformer([
            ('customer', OneHotEncoder(handle_unknown='ignore', sparse_output=False, max_categories=7000), ['customerId']),
            ('category', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['categoryName']),
            ('time', TimeFeaturesTransformer(), ['hour', 'day_of_week', 'month']),
            ('behavior', CustomerBehaviorTransformer(user_history=self.user_history), ['customerId', 'hour', 'categoryName']),
            ('personal', PersonalItemFeatures(user_history=self.user_history), ['customerId', 'categoryName'])
        ])

    def train(self, df: pd.DataFrame):
        self.cycle_analyzer.update_cycles(df)

        df = self._filter_data(df)

        X = df[['customerId', 'hour', 'day_of_week', 'month', 'categoryName']]
        y = df['goodsName']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print("\nTraining Item Model...")
        self.model.fit(
            self.preprocessor.fit_transform(X_train, y_train),
            y_train,
            eval_set=(self.preprocessor.transform(X_test), y_test),
            plot=True
        )

        self._evaluate(X_test, y_test)

    def _filter_data(self, df):

        mask = ~df['goodsName'].str.contains('пакет', case=False, na=False)
        df = df[mask]

        mask = ~df['categoryName'].str.contains('пакет', case=False, na=False)
        df = df[mask]

        item_counts = df['goodsName'].value_counts()
        valid_items = item_counts[item_counts >= 5].index
        return df[df['goodsName'].isin(valid_items)]

    def _evaluate(self, X_test, y_test):
        print("\nItem Model Evaluation:")
        X_test_processed = self.preprocessor.transform(X_test)
        y_pred = self.model.predict(X_test_processed)
        y_proba = self.model.predict_proba(X_test_processed)

        print("\nOverall Metrics:")
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        print(f"Top-1 Accuracy: {top_k_accuracy_score(y_test, y_proba, k=1):.4f}")
        print(f"Top-3 Accuracy: {top_k_accuracy_score(y_test, y_proba, k=3):.4f}")
        print(f"Top-5 Accuracy: {top_k_accuracy_score(y_test, y_proba, k=5):.4f}")

        self._plot_confusion_matrix(y_test, y_pred)

    def _plot_confusion_matrix(self, y_test, y_pred):
        top_n = 10
        top_classes = y_test.value_counts().nlargest(top_n).index
        mask = y_test.isin(top_classes)

        if sum(mask) == 0:
            print("No samples available for confusion matrix")
            return

        cm = confusion_matrix(
            y_test[mask],
            y_pred[mask],
            labels=top_classes,
            normalize='true'
        )

        plt.figure(figsize=(12, 10))
        sns.heatmap(
            cm,
            annot=True,
            fmt='.2f',
            xticklabels=top_classes,
            yticklabels=top_classes,
            cmap='Blues'
        )
        plt.title(f'Confusion Matrix for Top {top_n} Items')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig('item_confusion_matrix.png', dpi=300)
        plt.close()

    def predict_top_items(self, X: pd.DataFrame, day_of_week: int) -> List[List[str]]:
        X_processed = self.preprocessor.transform(X)
        probs = self.model.predict_proba(X_processed)

        for i, (item, category) in enumerate(zip(X['goodsName'], X['categoryName'])):
            for j, class_name in enumerate(self.model.classes_):
                weight = self.cycle_analyzer.get_cycle_weight(
                    item=class_name,
                    category=category,
                    day_of_week=day_of_week
                )
                probs[i, j] *= weight

        probs = probs / probs.sum(axis=1, keepdims=True)
        top_indices = np.argsort(-probs, axis=1)[:, :self.top_k]
        return [[str(self.model.classes_[i]) for i in row] for row in top_indices]

    def save(self, path: str):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        joblib.dump(self, path)
        print(f"Enhanced item model saved to {path}")

class PurchasePredictor:
    def __init__(self, top_k_items: int = 5):
        self.time_model = TimePredictor()
        self.place_model = None
        self.item_model = None
        self.user_history = defaultdict(list)
        self.general_stats = {
            'top_items': [],
            'common_place': "",
            'common_hour': 12,
            'common_hour_window': "12:00-14:00",
            'total_users': 0,
            'total_transactions': 0
        }

    def load_data(self, json_data: List[Dict]):
        df = pd.DataFrame(json_data)

        required_cols = ['customerId', 'timeCheck', 'pointName', 'goodsName', 'categoryName']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        df['datetime'] = pd.to_datetime(df['timeCheck'])
        df['hour'] = df['datetime'].dt.hour
        df['day_of_week'] = df['datetime'].dt.dayofweek
        df['month'] = df['datetime'].dt.month

        self.general_stats.update({
            'top_items': df['goodsName'].value_counts().head(10).index.tolist(),
            'common_place': df['pointName'].mode()[0] if not df['pointName'].empty else "unknown",
            'common_hour': df['hour'].mode()[0] if not df['hour'].empty else 12,
            'common_hour_window': f"{self.general_stats['common_hour']:02d}:00-{(self.general_stats['common_hour']+2)%24:02d}:00",
            'total_users': df['customerId'].nunique(),
            'total_transactions': len(df)
        })

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Loading data"):
            self.user_history[str(row['customerId'])].append({
                'datetime': row['datetime'],
                'item': row['goodsName'],
                'place': row['pointName'],
                'category': row['categoryName']
            })

        self.place_model = EnhancedPlacePredictor(self.user_history)
        self.item_model = EnhancedItemRecommender(self.user_history, top_k=5)

        return df

    def train(self, df: pd.DataFrame):
        print("\nStarting model training...")
        print(f"Dataset stats: {self.general_stats}")

        print("\n=== Training Item Model ===")
        self.item_model.train(df)

        print("\n=== Training Time Model ===")
        self.time_model.train(df)

        print("\n=== Training Place Model ===")
        self.place_model.train(df)

        print("\nAll models trained successfully!")

    def predict_for_user(self, user_id: str, current_date: Optional[datetime] = None) -> Dict[str, Any]:
        current_date = current_date or datetime.now()
        user_id = str(user_id)

        result = {
            'customerId': user_id,
            'predicted_time': self.general_stats['common_hour_window'],
            'time_confidence': 0.5,
            'predicted_place': self.general_stats['common_place'],
            'place_confidence': 0.5,
            'top_items': self.general_stats['top_items'][:self.item_model.top_k],
            'items_confidence': [0.5] * self.item_model.top_k,
            'model_version': 'enhanced-v1.0',
            'generated_at': datetime.now().isoformat(),
            'user_history_count': len(self.user_history.get(user_id, []))
        }

        try:

            time_pred = self._predict_time(user_id, current_date)
            result.update(time_pred)

            place_pred = self._predict_place(user_id, current_date, result['predicted_hour'])
            result.update(place_pred)

            item_pred = self._predict_items(user_id, current_date,
                                         result['predicted_hour'],
                                         result['predicted_place'])
            result.update(item_pred)

        except Exception as e:
            print(f"\nPrediction error for user {user_id}: {str(e)}")
            result['error'] = str(e)

        return result

    def _predict_time(self, user_id: str, current_date: datetime) -> Dict[str, Any]:
        last_visits = sorted(
            self.user_history.get(user_id, []),
            key=lambda x: x['datetime'],
            reverse=True
        )[:3]

        if not last_visits:
            return {
                'predicted_hour': self.general_stats['common_hour'],
                'predicted_time': self.general_stats['common_hour_window'],
                'time_confidence': 0.7
            }

        weighted_hour = sum(
            visit['datetime'].hour * (0.6 ** i)
            for i, visit in enumerate(last_visits)
        )
        sum_weights = sum(0.6 ** i for i in range(len(last_visits)))
        avg_hour = round(weighted_hour / sum_weights)

        time_input = pd.DataFrame([{
            'customerId': user_id,
            'hour': avg_hour,
            'day_of_week': current_date.weekday(),
            'month': current_date.month,
            'pointName': last_visits[0]['place']
        }])

        predicted_hour = int(round(self.time_model.pipeline.predict(time_input)[0]))
        predicted_hour = max(8, min(22, predicted_hour))

        return {
            'predicted_hour': predicted_hour,
            'predicted_time': f"{predicted_hour:02d}:00-{(predicted_hour+2)%24:02d}:00",
            'time_confidence': 1.0 - min(1.0, abs(self.time_model.pipeline.predict(time_input)[0] - predicted_hour))
        }

    def _predict_place(self, user_id: str, current_date: datetime, hour: int) -> Dict[str, Any]:
        default_result = {
            'predicted_place': self.general_stats['common_place'],
            'place_confidence': 0.5
        }

        try:
            place_input = pd.DataFrame([{
                'customerId': user_id,
                'hour': hour,
                'day_of_week': current_date.weekday(),
                'month': current_date.month,
                'time_of_day': self._get_time_of_day(hour)
            }])

            if not hasattr(self.place_model, 'model') or not hasattr(self.place_model, 'preprocessor'):
                print(f"Place model not properly initialized for user {user_id}")
                return default_result

            X_processed = self.place_model.preprocessor.transform(place_input)

            if not hasattr(self.place_model.model, 'classes_'):
                print(f"Place model not trained for user {user_id}")
                return default_result

            place = self.place_model.model.predict(X_processed)[0]
            place_proba = np.max(self.place_model.model.predict_proba(X_processed))

            return {
                'predicted_place': str(place),
                'place_confidence': float(place_proba)
            }

        except Exception as e:
            print(f"Place prediction error for user {user_id}: {str(e)}")
            return default_result

    def predict_for_user(self, user_id: str, current_date: Optional[datetime] = None) -> Dict[str, Any]:
        current_date = current_date or datetime.now()
        user_id = str(user_id)

        result = {
            'customerId': user_id,
            'predicted_time': self.general_stats['common_hour_window'],
            'time_confidence': 0.5,
            'predicted_place': self.general_stats['common_place'],
            'place_confidence': 0.5,
            'top_items': self.general_stats['top_items'][:self.item_model.top_k],
            'items_confidence': [0.5] * self.item_model.top_k,
            'model_version': 'enhanced-v1.0',
            'generated_at': datetime.now().isoformat(),
            'user_history_count': len(self.user_history.get(user_id, []))
        }

        try:
            time_pred = self._predict_time(user_id, current_date)
            result.update(time_pred)

            place_pred = self._predict_place(user_id, current_date, result['predicted_hour'])
            result.update(place_pred)

            if result['user_history_count'] > 0:
                item_pred = self._predict_items(user_id, current_date,
                                            result['predicted_hour'],
                                            result['predicted_place'])
                result.update(item_pred)

        except Exception as e:
            print(f"\nPrediction error for user {user_id}: {str(e)}")
            result['error'] = str(e)

        return result

    def _predict_items(self, user_id: str, current_date: datetime, hour: int, place: str) -> Dict[str, Any]:
        user_history = self.user_history.get(user_id, [])
        if not user_history:
            return {
                'top_items': self.general_stats['top_items'][:self.item_model.top_k],
                'items_confidence': [0.5] * self.item_model.top_k
            }

        categories = []
        for item in user_history:
            try:
                if 'category' in item:
                    categories.append(item['category'])
                elif 'categoryName' in item:
                    categories.append(item['categoryName'])
            except:
                continue

        fav_category = max(set(categories), key=categories.count) if categories else 'unknown'

        item_input = pd.DataFrame([{
            'customerId': user_id,
            'hour': hour,
            'day_of_week': current_date.weekday(),
            'month': current_date.month,
            'categoryName': fav_category,
            'goodsName': 'unknown'
        }])

        try:
            top_items = self.item_model.predict_top_items(
                item_input,
                current_date.weekday()
            )[0]

            X_processed = self.item_model.preprocessor.transform(item_input)
            all_probs = self.item_model.model.predict_proba(X_processed)[0]

            item_indices = [
                np.where(self.item_model.model.classes_ == item)[0][0]
                for item in top_items
                if item in self.item_model.model.classes_
            ]
            item_confs = [float(all_probs[idx]) for idx in item_indices[:self.item_model.top_k]]

            return {
                'top_items': top_items[:self.item_model.top_k],
                'items_confidence': item_confs
            }
        except Exception as e:
            print(f"Item prediction error for user {user_id}: {str(e)}")
            return {
                'top_items': self.general_stats['top_items'][:self.item_model.top_k],
                'items_confidence': [0.5] * self.item_model.top_k
            }

    def _get_time_of_day(self, hour: int) -> str:
        if 0 <= hour < 6:
            return 'night'
        elif 6 <= hour < 12:
            return 'morning'
        elif 12 <= hour < 18:
            return 'afternoon'
        else:
            return 'evening'

    def save_models(self, model_dir: str = "models"):
        os.makedirs(model_dir, exist_ok=True)

        print("\nSaving models...")
        self.time_model.save(f"{model_dir}/time_model.joblib")
        self.place_model.save(f"{model_dir}/place_model.joblib")
        self.item_model.save(f"{model_dir}/item_model.joblib")

        print(f"\nAll models saved to {model_dir} directory")

    @classmethod
    def load_models(cls, model_dir: str = "models"):
        predictor = cls()

        print("\nLoading models...")
        predictor.time_model = joblib.load(f"{model_dir}/time_model.joblib")
        predictor.place_model = joblib.load(f"{model_dir}/place_model.joblib")
        predictor.item_model = joblib.load(f"{model_dir}/item_model.joblib")

        print(f"\nAll models loaded from {model_dir} directory")
        return predictor

def generate_reports(predictions: List[Dict], output_dir: str):
    os.makedirs(output_dir, exist_ok=True)

    with open(f"{output_dir}/predictions.json", 'w', encoding='utf-8') as f:
        json.dump(predictions, f, ensure_ascii=False, indent=2)

    df_pred = pd.DataFrame(predictions)

    try:
        item_counts = pd.Series([
            item for sublist in df_pred['top_items']
            for item in sublist
        ]).value_counts()

        plt.figure(figsize=(15, 8))
        item_counts.head(20).plot(kind='bar')
        plt.title("Top 20 Recommended Items")
        plt.ylabel("Recommendation Count")
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(f"{output_dir}/top_recommended_items.png", dpi=300)
        plt.close()
    except Exception as e:
        print(f"Error generating items plot: {str(e)}")

    try:
        plt.figure(figsize=(12, 6))
        for metric in ['time_confidence', 'place_confidence']:
            if metric in df_pred.columns:
                sns.kdeplot(df_pred[metric], label=metric.replace('_', ' ').title())

        if 'items_confidence' in df_pred.columns:
            avg_item_conf = df_pred['items_confidence'].apply(
                lambda x: np.mean(x) if isinstance(x, list) else x
            )
            sns.kdeplot(avg_item_conf, label='Avg Items Confidence')

        plt.title("Confidence Scores Distribution")
        plt.xlabel("Confidence Score")
        plt.legend()
        plt.savefig(f"{output_dir}/confidence_distribution.png", dpi=300)
        plt.close()
    except Exception as e:
        print(f"Error generating confidence plot: {str(e)}")

    metrics = {
        'statistics': {
            'total_users': len(predictions),
            'users_with_history': sum(
                1 for p in predictions
                if p['user_history_count'] > 0
            ),
            'avg_history_length': np.mean([
                p['user_history_count']
                for p in predictions
            ]),
            'generated_at': datetime.now().isoformat()
        },
        'confidence_metrics': {}
    }

    for metric in ['time_confidence', 'place_confidence']:
        if metric in df_pred.columns:
            metrics['confidence_metrics'][metric] = {
                'mean': float(df_pred[metric].mean()),
                'median': float(df_pred[metric].median()),
                'std': float(df_pred[metric].std())
            }

    if 'items_confidence' in df_pred.columns:
        avg_item_conf = df_pred['items_confidence'].apply(
            lambda x: np.mean(x) if isinstance(x, list) else x
        )
        metrics['confidence_metrics']['items_confidence'] = {
            'mean': float(avg_item_conf.mean()),
            'median': float(avg_item_conf.median()),
            'std': float(avg_item_conf.std())
        }

    with open(f"{output_dir}/metrics.json", 'w') as f:
        json.dump(metrics, f, indent=2)

    print(f"\nReports generated in {output_dir}")

def main(input_path: str = "ai.json",
         output_dir: str = "output",
         model_dir: str = "models",
         ):

    print("Loading data...")
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("\nInitializing predictor...")
    predictor = PurchasePredictor(top_k_items=5)
    df = predictor.load_data(data)

    print(f"\nData Statistics:")
    print(f"- Total transactions: {len(df):,}")
    print(f"- Unique users: {df['customerId'].nunique():,}")
    print(f"- Unique places: {df['pointName'].nunique():,}")
    print(f"- Unique items: {df['goodsName'].nunique():,}")
    print(f"- Time range: {df['datetime'].min()} to {df['datetime'].max()}")

    predictor.train(df)

    predictor.save_models(model_dir)

    print("\nGenerating predictions...")

    user_ids = df['customerId'].astype(str).unique()
    predictions = []

    for uid in tqdm(user_ids, desc="Predicting"):
        try:
            pred = predictor.predict_for_user(uid)
            predictions.append(pred)
        except Exception as e:
            print(f"\nError predicting for user {uid}: {str(e)}")
            continue

    if predictions:
        generate_reports(predictions, output_dir)

        print("\nSample predictions:")
        for pred in predictions[:3]:
            print(f"\nCustomer {pred['customerId']}:")
            print(f"- Time: {pred['predicted_time']} (confidence: {pred['time_confidence']:.2f})")
            print(f"- Place: {pred['predicted_place']} (confidence: {pred['place_confidence']:.2f})")
            print("- Top items:")
            for item, conf in zip(pred['top_items'], pred['items_confidence']):
                print(f"  • {item} (confidence: {conf:.2f})")
            print(f"- History: {pred['user_history_count']} past purchases")
    else:
        print("\nNo predictions generated due to errors")

    return predictions



if __name__ == "__main__":
    predictions = main(
        input_path="ai.json",
        output_dir="output",
        model_dir="models"
    )

Loading data...

Initializing predictor...


Loading data: 100%|██████████| 40000/40000 [00:02<00:00, 14448.82it/s]



Data Statistics:
- Total transactions: 40,000
- Unique users: 6,094
- Unique places: 67
- Unique items: 813
- Time range: 2025-06-17 19:42:12 to 2025-06-20 11:55:53

Starting model training...
Dataset stats: {'top_items': ['Пакет  фирменный  майка  Новый  дизайн,  черный', 'Пакет  фирменный  майка', 'ПЭТ  1,5л  (1/60)  Полимер', 'Контейнер  500  мл  РКСП  ОП  (137*132*62)  480шт', 'Пиво  разливное  АЯН  Абаканское  светлое  /50л', 'ПЭТ  3,0л  Акция    (1/35)  Полимер', 'ПЭТ  1,5л  (1/104)(УУ)', 'ПЭТ  1,5л  (1/50)  (И)', 'ПЭТ  1,0л  (1/80)  Полимер', 'ПЭТ  1,5л  (1/120)(АН)'], 'common_place': '60 лет Образования СССР, д. 21/2', 'common_hour': np.int32(20), 'common_hour_window': '12:00-14:00', 'total_users': 6094, 'total_transactions': 40000}

=== Training Item Model ===

Analyzing purchase cycles...


Items: 100%|██████████| 813/813 [00:00<00:00, 3248.30it/s]
Categories: 100%|██████████| 81/81 [00:00<00:00, 2079.01it/s]



Training Item Model...

Calculating customer behavior statistics...


Users: 100%|██████████| 6094/6094 [00:00<00:00, 24731.48it/s]



Extracting personal item features...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.0040726	test: 0.0040573	best: 0.0040573 (0)	total: 6.53s	remaining: 16m 13s
10:	learn: 0.0156043	test: 0.0154333	best: 0.0154333 (9)	total: 1m 16s	remaining: 16m 7s
20:	learn: 0.0203596	test: 0.0202012	best: 0.0202175 (17)	total: 2m 25s	remaining: 14m 51s
30:	learn: 0.0235675	test: 0.0234524	best: 0.0234524 (30)	total: 3m 36s	remaining: 13m 50s
40:	learn: 0.0270804	test: 0.0267586	best: 0.0267586 (38)	total: 4m 46s	remaining: 12m 41s
50:	learn: 0.0294323	test: 0.0289447	best: 0.0289839 (47)	total: 5m 56s	remaining: 11m 31s
60:	learn: 0.0313199	test: 0.0308647	best: 0.0308647 (56)	total: 7m 5s	remaining: 10m 20s
70:	learn: 0.0348565	test: 0.0338860	best: 0.0338860 (69)	total: 8m 14s	remaining: 9m 10s
80:	learn: 0.0359810	test: 0.0347050	best: 0.0347050 (80)	total: 9m 23s	remaining: 7m 59s
90:	learn: 0.0364776	test: 0.0346158	best: 0.0347815 (81)	total: 10m 32s	remaining: 6m 49s
100:	learn: 0.0368911	test: 0.0350205	best: 0.0350205 (100)	total: 11m 42s	remaining: 5m 40s
110:	

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.0166371	test: 0.0139801	best: 0.0139801 (0)	total: 1.93s	remaining: 25m 42s
100:	learn: 0.0835172	test: 0.0714140	best: 0.0714140 (100)	total: 3m 16s	remaining: 22m 38s
200:	learn: 0.1182710	test: 0.0980892	best: 0.0988579 (196)	total: 6m 23s	remaining: 19m 2s
300:	learn: 0.1622723	test: 0.1330840	best: 0.1330840 (300)	total: 9m 33s	remaining: 15m 49s
400:	learn: 0.2002745	test: 0.1640469	best: 0.1640469 (397)	total: 12m 42s	remaining: 12m 39s
500:	learn: 0.2301258	test: 0.1902181	best: 0.1902181 (500)	total: 15m 57s	remaining: 9m 31s
600:	learn: 0.2581241	test: 0.2156247	best: 0.2157216 (599)	total: 19m 19s	remaining: 6m 23s
700:	learn: 0.2749920	test: 0.2293645	best: 0.2293645 (700)	total: 22m 40s	remaining: 3m 12s
799:	learn: 0.2913781	test: 0.2439316	best: 0.2439316 (797)	total: 25m 53s	remaining: 0us

bestTest = 0.243931601
bestIteration = 797

Shrink model to first 798 iterations.

Place Model Evaluation:
                                              precision    reca

Predicting: 100%|██████████| 6094/6094 [11:46<00:00,  8.62it/s]



Reports generated in output

Sample predictions:

Customer 1327337:
- Time: 12:00-14:00 (confidence: 0.51)
- Place: ['Красномосковская, д. 1а'] (confidence: 0.06)
- Top items:
  • Пиво  разливное  АЯН  Абаканское  светлое  /50л (confidence: 0.16)
  • Пиво  разливное  Минусинское  Жигулевское  /50  л (confidence: 0.05)
  • Пиво  разливное  Адмирал  Колчак  светлое  (Фирменное)  /50л (confidence: 0.05)
  • Пиво  разливное  Венское/30л  (МПЗ) (confidence: 0.04)
  • Пиво  разливное  Бочкарев  Чешское  светлое/50л (confidence: 0.04)
- History: 2 past purchases

Customer 1717391:
- Time: 12:00-14:00 (confidence: 1.00)
- Place: ['Красномосковская, д. 1а'] (confidence: 0.06)
- Top items:
  • Сигареты  Camel  Compact (confidence: 0.03)
  • Сигареты  Chesterfield  Selection  Compact  MT (confidence: 0.02)
  • Пиво  "АЯН"  светлое  4,8%  ст.  0,5л  (20шт) (confidence: 0.01)
  • Пиво  разливное  АЯН  Абаканское  светлое  /50л (confidence: 0.01)
  • ПЭТ  2,0л  Акция  (1/50)  (И) (confidence: 0.01)

In [4]:
import json
import pandas as pd
import numpy as np
from datetime import datetime
from collections import defaultdict
from tqdm import tqdm
import os
from typing import Dict, List, Any

class EnhancedJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (pd.Timestamp, datetime)):
            return obj.isoformat()
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

class PredictionComparator:
    def __init__(self):
        self.comparison_results = []
        self.summary_stats = {
            'time': defaultdict(int),
            'place': defaultdict(int),
            'items': defaultdict(int),
            'users': defaultdict(int)
        }
        self.user_comparisons = []

    def _clean_data(self, data: Any) -> Any:
        if isinstance(data, (pd.Timestamp, datetime)):
            return data.isoformat()
        elif isinstance(data, dict):
            return {k: self._clean_data(v) for k, v in data.items()}
        elif isinstance(data, (list, tuple)):
            return [self._clean_data(item) for item in data]
        elif isinstance(data, (np.integer, np.floating)):
            return int(data) if isinstance(data, np.integer) else float(data)
        elif isinstance(data, np.ndarray):
            return data.tolist()
        return data

    def load_data(self, predictions_path: str, actual_path: str):
        try:
            with open(predictions_path, 'r', encoding='utf-8') as f:
                self.predictions = json.load(f)

            if actual_path.endswith('.json'):
                with open(actual_path, 'r', encoding='utf-8') as f:
                    actual_data = json.load(f)
            elif actual_path.endswith('.parquet'):
                actual_data = pd.read_parquet(actual_path).to_dict('records')

            self.actual_df = pd.DataFrame(actual_data)
            self.actual_df['datetime'] = pd.to_datetime(self.actual_df['timeCheck'])
            self.actual_df['hour'] = self.actual_df['datetime'].dt.hour
            self.actual_df['customerId'] = self.actual_df['customerId'].astype(str)

            self.predictions = self._clean_data(self.predictions)
            self.actual_data = self._clean_data(actual_data)

            self.pred_dict = {p['customerId']: p for p in self.predictions}
            self.actual_dict = defaultdict(list)

            for _, row in self.actual_df.iterrows():
                row_dict = row.to_dict()
                row_dict = self._clean_data(row_dict)
                self.actual_dict[row['customerId']].append(row_dict)

        except Exception as e:
            raise ValueError(f"Error loading data: {str(e)}")

    def save_results(self, output_dir: str = "comparison_results"):
        os.makedirs(output_dir, exist_ok=True)

        results = {
            'metadata': {
                'generated_at': datetime.now().isoformat(),
                'num_users': len(self.user_comparisons),
                'num_visits': len(self.comparison_results)
            },
            'summary_stats': self.summary_stats,
            'user_comparisons_sample': self.user_comparisons[:100]
        }

        try:
            with open(f"{output_dir}/results.json", 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, cls=EnhancedJSONEncoder, ensure_ascii=False)
        except Exception as e:
            print(f"Failed to save results: {str(e)}")
            with open(f"{output_dir}/results_fallback.json", 'w', encoding='utf-8') as f:
                json.dump({
                    'metadata': results['metadata'],
                    'summary_stats': results['summary_stats']
                }, f, indent=2, ensure_ascii=False)

def compare_predictions(predictions_file: str, actual_file: str, output_dir: str = "comparison_results"):
    try:
        print(f"Starting comparison at {datetime.now()}")
        comparator = PredictionComparator()

        print("Loading data...")
        comparator.load_data(predictions_file, actual_file)

        if not hasattr(comparator, 'pred_dict') or not hasattr(comparator, 'actual_dict'):
            raise ValueError("Data loading failed - no dictionaries created")

        print("Comparing data...")
        common_users = set(comparator.pred_dict.keys()) & set(comparator.actual_dict.keys())
        print(f"Found {len(common_users)} common users")

        if not common_users:
            raise ValueError("No common users found between predictions and actual data")

        print("Saving results...")
        comparator.save_results(output_dir)

        print(f"Successfully completed at {datetime.now()}")
        return True

    except Exception as e:
        print(f"\nError during comparison: {str(e)}")
        return False

if __name__ == "__main__":
    success = compare_predictions(
        predictions_file="/content/output/predictions.json",
        actual_file="/content/ai020725.json",
        output_dir="comparison_results"
    )



Starting comparison at 2025-07-10 03:46:10.090367
Loading data...
Comparing data...
Found 2614 common users
Saving results...
Successfully completed at 2025-07-10 03:46:14.620967


In [11]:
import json
import pandas as pd
import numpy as np
from datetime import datetime
from collections import defaultdict
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
from typing import Dict, List, Any, Tuple

class PredictionComparator:
    def __init__(self):
        self.comparison_results = []
        self.metrics = {
            'time': {'correct': 0, 'total': 0, 'accuracy': 0.0},
            'place': {'correct': 0, 'total': 0, 'accuracy': 0.0},
            'items': {'correct': 0, 'total': 0, 'precision@3': 0.0}
        }
        self.user_stats = []

    def _clean_data(self, data: Any) -> Any:
        if isinstance(data, (pd.Timestamp, datetime)):
            return data.isoformat()
        elif isinstance(data, (np.integer, np.floating)):
            return int(data) if isinstance(data, np.integer) else float(data)
        elif isinstance(data, np.ndarray):
            return data.tolist()
        elif isinstance(data, dict):
            return {k: self._clean_data(v) for k, v in data.items()}
        elif isinstance(data, (list, tuple)):
            return [self._clean_data(item) for item in data]
        return data

    def load_data(self, predictions_path: str, actual_path: str):
        try:
            with open(predictions_path, 'r', encoding='utf-8') as f:
                self.predictions = json.load(f)

            if actual_path.endswith('.json'):
                with open(actual_path, 'r', encoding='utf-8') as f:
                    actual_data = json.load(f)
            elif actual_path.endswith('.parquet'):
                actual_data = pd.read_parquet(actual_path)
            else:
                actual_data = pd.read_csv(actual_path)

            self.actual_df = pd.DataFrame(actual_data)
            self.actual_df['datetime'] = pd.to_datetime(self.actual_df['timeCheck'])
            self.actual_df['hour'] = self.actual_df['datetime'].dt.hour
            self.actual_df['customerId'] = self.actual_df['customerId'].astype(str)

            self.pred_dict = {p['customerId']: p for p in self.predictions}
            self.actual_dict = defaultdict(list)

            for _, row in self.actual_df.iterrows():
                self.actual_dict[row['customerId']].append(row.to_dict())

            print(f"Loaded {len(self.predictions)} predictions and {len(self.actual_df)} actual records")

        except Exception as e:
            raise ValueError(f"Data loading error: {str(e)}")

    def _compare_time(self, pred: Dict, actual: Dict) -> bool:
        pred_hour = pred.get('predicted_hour', -1)
        actual_hour = actual['datetime'].hour

        if pred_hour == -1:
            return False

        is_correct = abs(pred_hour - actual_hour) <= 1
        self.metrics['time']['correct'] += int(is_correct)
        self.metrics['time']['total'] += 1

        return is_correct

    def _compare_place(self, pred: Dict, actual: Dict) -> bool:
        pred_place = pred.get('predicted_place', '')
        actual_place = actual.get('pointName', '')

        is_correct = pred_place == actual_place
        self.metrics['place']['correct'] += int(is_correct)
        self.metrics['place']['total'] += 1

        return is_correct

    def _compare_items(self, pred: Dict, actual: Dict) -> bool:
        pred_items = pred.get('top_items', [])
        actual_item = actual.get('goodsName', '')

        is_correct = actual_item in pred_items[:3]
        self.metrics['items']['correct'] += int(is_correct)
        self.metrics['items']['total'] += 1

        return is_correct

    def compare_all(self):
        common_users = set(self.pred_dict.keys()) & set(self.actual_dict.keys())
        print(f"Found {len(common_users)} common users for comparison")

        for user_id in tqdm(common_users, desc="Comparing users"):
            pred = self.pred_dict[user_id]
            actual_visits = self.actual_dict[user_id]

            if not actual_visits:
                continue

            last_visit = actual_visits[-1]

            time_correct = self._compare_time(pred, last_visit)
            place_correct = self._compare_place(pred, last_visit)
            items_correct = self._compare_items(pred, last_visit)

            self.user_stats.append({
                'user_id': user_id,
                'time_correct': time_correct,
                'place_correct': place_correct,
                'items_correct': items_correct,
                'actual_time': last_visit['datetime'].hour,
                'predicted_time': pred.get('predicted_hour', None),
                'actual_place': last_visit.get('pointName', ''),
                'predicted_place': pred.get('predicted_place', ''),
                'actual_item': last_visit.get('goodsName', ''),
                'predicted_items': pred.get('top_items', [])
            })

    def calculate_metrics(self):

        if self.metrics['time']['total'] > 0:
            self.metrics['time']['accuracy'] = round(
                self.metrics['time']['correct'] / self.metrics['time']['total'], 3)

        if self.metrics['place']['total'] > 0:
            self.metrics['place']['accuracy'] = round(
                self.metrics['place']['correct'] / self.metrics['place']['total'], 3)

        if self.metrics['items']['total'] > 0:
            self.metrics['items']['precision@3'] = round(
                self.metrics['items']['correct'] / self.metrics['items']['total'], 3)

    def print_summary(self):
        """Вывод статистики в консоль"""
        print("\n=== Comparison Summary ===")
        print(f"Total users compared: {len(self.user_stats)}")

        print("\nTime Prediction:")
        print(f"Accuracy (±1 hour): {self.metrics['time']['accuracy']} "
              f"({self.metrics['time']['correct']}/{self.metrics['time']['total']})")

        print("\nPlace Prediction:")
        print(f"Accuracy: {self.metrics['place']['accuracy']} "
              f"({self.metrics['place']['correct']}/{self.metrics['place']['total']})")

        print("\nItem Recommendations:")
        print(f"Precision@3: {self.metrics['items']['precision@3']} "
              f"({self.metrics['items']['correct']}/{self.metrics['items']['total']})")

        success_example = next((u for u in self.user_stats if u['time_correct'] and u['place_correct'] and u['items_correct']), None)
        fail_example = next((u for u in self.user_stats if not (u['time_correct'] and u['place_correct'] and u['items_correct'])), None)

        if success_example:
            print("\nSuccessful Prediction Example:")
            print(f"User: {success_example['user_id']}")
            print(f"Time: Predicted {success_example['predicted_time']}, Actual {success_example['actual_time']}")
            print(f"Place: Predicted '{success_example['predicted_place']}', Actual '{success_example['actual_place']}'")
            print(f"Item: Actual '{success_example['actual_item']}' in top-3: {success_example['actual_item'] in success_example['predicted_items'][:3]}")

        if fail_example:
            print("\nFailed Prediction Example:")
            print(f"User: {fail_example['user_id']}")
            print(f"Time: Predicted {fail_example['predicted_time']}, Actual {fail_example['actual_time']}")
            print(f"Place: Predicted '{fail_example['predicted_place']}', Actual '{fail_example['actual_place']}'")
            print(f"Item: Actual '{fail_example['actual_item']}' in top-3: {fail_example['actual_item'] in fail_example['predicted_items'][:3]}")

    def save_results(self, output_dir: str):
        os.makedirs(output_dir, exist_ok=True)

        result = {
            'metadata': {
                'generated_at': datetime.now().isoformat(),
                'total_users': len(self.user_stats),
                'time_window_used': '±1 hour'
            },
            'metrics': self.metrics,
            'user_stats_sample': self.user_stats[:1000],  # Сохраняем выборку
            'comparison_summary': {
                'perfect_predictions': sum(
                    1 for u in self.user_stats
                    if u['time_correct'] and u['place_correct'] and u['items_correct']
                ),
                'failed_predictions': sum(
                    1 for u in self.user_stats
                    if not (u['time_correct'] or u['place_correct'] or u['items_correct'])
                )
            }
        }

        with open(f"{output_dir}/comparison_results.json", 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False, cls=EnhancedJSONEncoder)

        print(f"\nFull results saved to {output_dir}/comparison_results.json")

def run_comparison(predictions_path: str, actual_path: str, output_dir: str):
    print("Starting prediction comparison...")
    comparator = PredictionComparator()

    try:
        print("Loading data...")
        comparator.load_data(predictions_path, actual_path)

        print("Running comparisons...")
        comparator.compare_all()

        comparator.calculate_metrics()

        comparator.print_summary()

        comparator.save_results(output_dir)

        print("\nComparison completed successfully!")
        return True
    except Exception as e:
        print(f"\nError during comparison: {str(e)}")
        return False

if __name__ == "__main__":


    predictions_file = "/content/output/predictions.json"
    actual_file = "/content/ai020725.json"
    output_dir = "/content/comparison_results"

    run_comparison(
        predictions_path=predictions_file,
        actual_path=actual_file,
        output_dir=output_dir
    )

Starting prediction comparison...
Loading data...
Loaded 6094 predictions and 50000 actual records
Running comparisons...
Found 2614 common users for comparison


Comparing users: 100%|██████████| 2614/2614 [00:00<00:00, 166579.21it/s]


=== Comparison Summary ===
Total users compared: 2614

Time Prediction:
Accuracy (±1 hour): 0.453 (1185/2614)

Place Prediction:
Accuracy: 0.0 (0/2614)

Item Recommendations:
Precision@3: 0.076 (198/2614)

Failed Prediction Example:
User: 1319037
Time: Predicted 14, Actual 18
Place: Predicted '['Ангарск 6, 86 кв-л, д. 40, пом. 57']', Actual 'Улан-Удэ 4, ул.Смолина, д. 54'
Item: Actual 'Пиво  "Крушовице  Velvet"  светлое  5%  жб  0,43л  (24шт)' in top-3: False

Full results saved to /content/comparison_results/comparison_results.json

Comparison completed successfully!



