In [None]:
# Install the libraries used by the rest of the notebook
!pip install psycopg2-binary scikit-learn pandas numpy

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10


In [None]:
import psycopg2
import pandas as pd

def fetch_books_from_db(limit=1000):
    """Load book records with aggregated authors, subjects and summary text.

    One row is produced per (book, summary text) pair: author and subject
    names are de-duplicated and joined with ", ", and the summary text is
    exposed as ``description``. LEFT JOINs are used, so books without
    authors, subjects or a summary are still returned (with NULLs).

    Parameters
    ----------
    limit : int, default 1000
        Maximum number of rows to fetch.

    Returns
    -------
    pandas.DataFrame
        Columns: ``book_id``, ``title``, ``authors``, ``subjects``,
        ``description``.

    Notes
    -----
    Connection settings can be overridden through the environment variables
    ``GUTENDEX_DB_HOST``, ``GUTENDEX_DB_NAME``, ``GUTENDEX_DB_USER`` and
    ``GUTENDEX_DB_PASSWORD``; the previously hard-coded values remain the
    defaults so existing runs behave as before.
    NOTE(review): prefer supplying real credentials via the environment
    rather than committing them to the notebook.
    """
    import os
    from contextlib import closing

    conn = psycopg2.connect(
        host=os.environ.get("GUTENDEX_DB_HOST", "91.184.241.61"),
        database=os.environ.get("GUTENDEX_DB_NAME", "gutendex"),
        user=os.environ.get("GUTENDEX_DB_USER", "gutendex"),
        password=os.environ.get("GUTENDEX_DB_PASSWORD", "gutendex"),
    )
    # ORDER BY makes the LIMITed subset reproducible between runs; without it
    # the database is free to return any `limit` rows in any order.
    query = """
    SELECT
        b.id AS book_id,
        b.title AS title,
        string_agg(DISTINCT p.name, ', ') AS authors,
        string_agg(DISTINCT s.name, ', ') AS subjects,
        sm.text AS description
    FROM books_book b
    LEFT JOIN books_book_authors ba ON b.id = ba.book_id
    LEFT JOIN books_person p ON ba.person_id = p.id
    LEFT JOIN books_book_subjects bsu ON b.id = bsu.book_id
    LEFT JOIN books_subject s ON bsu.subject_id = s.id
    LEFT JOIN books_summary sm ON b.id = sm.book_id
    GROUP BY b.id, sm.text
    ORDER BY b.id, sm.text
    LIMIT %(limit)s
    """
    # closing() guarantees conn.close() even when the query raises. A psycopg2
    # connection used directly as a context manager only ends the transaction;
    # it does not close the connection.
    with closing(conn):
        return pd.read_sql(query, conn, params={"limit": limit})

# Fetch the book table once and keep it in `df` for the following cells
df = fetch_books_from_db()
print(df.shape)  # (rows, columns) of the fetched frame
df.head()  # last expression in the cell, so the notebook displays the first rows

  df = pd.read_sql(query, conn)


(1000, 5)


Unnamed: 0,book_id,title,authors,subjects,description
0,1,The Declaration of Independence of the United ...,"Jefferson, Thomas","United States. Declaration of Independence, Un...","""The Declaration of Independence of the United..."
1,2,The United States Bill of Rights: The Ten Orig...,United States,"Civil rights -- United States -- Sources, Unit...","""The United States Bill of Rights"" by United S..."
2,3,John F. Kennedy's Inaugural Address,"Kennedy, John F. (John Fitzgerald)",Presidents -- United States -- Inaugural addre...,"""John F. Kennedy's Inaugural Address"" by John ..."
3,4,Lincoln's Gettysburg Address: Given November 1...,"Lincoln, Abraham",Consecration of cemeteries -- Pennsylvania -- ...,"""Lincoln's Gettysburg Address"" by Abraham Linc..."
4,5,The United States Constitution,United States,"United States. Constitution, United States -- ...","""The United States Constitution"" by United Sta..."


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# Fill missing values: empty text for descriptions, 'unknown' for the
# categorical columns
fill_values = {'description': '', 'authors': 'unknown', 'subjects': 'unknown'}
for column, placeholder in fill_values.items():
    df[column] = df[column].fillna(placeholder)

# TF-IDF representation of the description text
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
desc_tfidf = tfidf.fit_transform(df['description'])

# One-hot encoding of authors and subjects; the whole string in each column
# is treated as a single category
encoder = OneHotEncoder(handle_unknown='ignore')
categorical = encoder.fit_transform(df[['authors', 'subjects']])

# Combine the features: categorical columns first, then the TF-IDF columns
features = hstack([categorical, desc_tfidf])

# Persist the fitted transformers, the feature matrix and the book metadata
artifacts = [
    (tfidf, 'tfidf_vectorizer.pkl'),
    (encoder, 'onehot_encoder.pkl'),
    (features, 'book_features_matrix.pkl'),
]
for obj, path in artifacts:
    joblib.dump(obj, path)
# Kept as the final bare expression so the cell still displays its result
joblib.dump(df[['book_id', 'title']], 'books_metadata.pkl')

['books_metadata.pkl']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Step 4: recommendation function
def recommend_books(favorite_book_ids, top_n=10):
    """Recommend the books most similar to a reader's favourites.

    The saved feature rows of the favourite books are averaged into a single
    profile vector; every book is then scored by cosine similarity to that
    profile and the favourites themselves are removed from the result.

    Parameters
    ----------
    favorite_book_ids : list-like
        ``book_id`` values of the books the reader already likes.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The metadata columns plus a ``similarity`` column, sorted by
        descending similarity, at most ``top_n`` rows.

    Raises
    ------
    ValueError
        If none of ``favorite_book_ids`` occurs in the saved metadata (there
        is no profile vector to compare against).
    """
    # Function-scope imports keep the function usable without relying on
    # names that happen to be left over from earlier cells.
    import joblib
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    # NOTE: joblib.load unpickles its input, which can execute arbitrary
    # code. Only load artefacts that this notebook produced itself.
    metadata = joblib.load('books_metadata.pkl')
    features = joblib.load('book_features_matrix.pkl')

    # Keep a sparse matrix sparse (CSR supports row selection) instead of
    # densifying the whole matrix; plain arrays are used as they are.
    if hasattr(features, "tocsr"):
        features = features.tocsr()
    else:
        features = np.asarray(features)

    # Row *positions* of the favourites. Index labels are not used because
    # they only coincide with row positions for a default RangeIndex.
    favorite_mask = metadata['book_id'].isin(favorite_book_ids).to_numpy()
    favorite_positions = np.flatnonzero(favorite_mask)
    if favorite_positions.size == 0:
        raise ValueError(
            "None of the favorite_book_ids are present in books_metadata.pkl"
        )

    # Mean of the favourites' feature vectors. np.asarray turns the matrix
    # returned by a sparse mean into a regular array of shape (1, n_features).
    mean_vector = np.asarray(features[favorite_positions].mean(axis=0)).reshape(1, -1)

    # Cosine similarity of every book to the profile: shape (n_books, 1)
    similarities = cosine_similarity(features, mean_vector)

    # Build the result table
    results = metadata.copy()
    results['similarity'] = np.asarray(similarities).ravel()

    # Drop the books the reader already has and return the top N
    recommended = (
        results[~favorite_mask]
        .sort_values(by='similarity', ascending=False)
        .head(top_n)
    )

    return recommended

# Example: recommendations for a reader whose favourites are books 1, 2 and 3
favorite_books = [1, 2, 3]
recommendations = recommend_books(favorite_books)
print(recommendations)

     book_id                                              title  similarity
281      282          United States Declaration of Independence    0.338755
4          5                     The United States Constitution    0.290044
232      233      1995 United States Congressional Address Book    0.225561
237      238                   Motion Picture of Rotating Earth    0.224201
115      116     Motion Pictures of the Apollo 11 Lunar Landing    0.224201
220      221                     Radar Map of the United States    0.224201
877      878  The Jubilee of the Constitution: Delivered at ...    0.109814
593      594                    The Constitution of Japan, 1946    0.107186
594      595      The Constitution of the Empire of Japan, 1889    0.091984
146      147                                       Common Sense    0.089652


In [None]:
from google.colab import files

# Send each saved artefact to the browser as a download, in the order saved
for artifact_name in (
    'tfidf_vectorizer.pkl',
    'onehot_encoder.pkl',
    'book_features_matrix.pkl',
    'books_metadata.pkl',
):
    files.download(artifact_name)