In [1]:
# Environment setup cell.
# The pip lines below are disabled; uncomment the ones that are needed on a
# fresh runtime to install the dependency versions listed here.
# !pip install --upgrade transformers==4.49.0
# !pip install optuna==2.10.0
# !pip install numpy==1.26.4 gensim==4.3.2
# !pip install scipy==1.12.0
# !pip install --upgrade pandas==2.2.2
# !pip install h3
# !pip install mlflow
# !pip install 'protobuf<4'
# !pip install selenium
# !pip install natasha
# !pip install pymystem3
# !pip install symspellpy
# Start from a clean checkout: delete any previous copy of the project
# repository under /content, then clone it again.
# NOTE(review): `git clone` writes into the current working directory, so this
# assumes the notebook runs with /content as cwd — confirm.
!rm -rf /content/Price-prediction-with-textual-data
!git clone https://github.com/anna-k-00/Price-prediction-with-textual-data.git

Cloning into 'Price-prediction-with-textual-data'...
remote: Enumerating objects: 308, done.[K
remote: Counting objects: 100% (197/197), done.[K
remote: Compressing objects: 100% (190/190), done.[K
remote: Total 308 (delta 114), reused 6 (delta 6), pack-reused 111 (from 1)[K
Receiving objects: 100% (308/308), 1.80 MiB | 25.64 MiB/s, done.
Resolving deltas: 100% (156/156), done.


In [2]:
# Шаг 1: Проверка и настройка окружения
import os
import sys
import importlib

# Шаг 2: Клонирование/обновление репозитория
repo_url = 'https://github.com/anna-k-00/Price-prediction-with-textual-data.git'
repo_dir = 'Price-prediction-with-textual-data'

if not os.path.exists(repo_dir):
    !git clone {repo_url}
else:
    !cd {repo_dir} && git pull

# Шаг 3: Добавляем все нужные пути в sys.path
paths_to_add = [
    f'/content/{repo_dir}',                     # Для файлов в корне (parser_avito.py)
    f'/content/{repo_dir}/main_methods',        # Основные модули
    f'/content/{repo_dir}/embeddings_generation', # Генерация эмбеддингов
    f'/content/{repo_dir}/preprocessors'        # Препроцессоры
]

for path in paths_to_add:
    if os.path.exists(path) and path not in sys.path:
        sys.path.insert(0, path)
        print(f'Добавлен путь: {path}')

# Шаг 4: Собираем список всех модулей для импорта
all_modules = [
    # Основные модули
    'resource_monitor', 'ANN', 'predict', 'test_pipeline',

    # Модули из embeddings_generation
    'embeddings_generation.rubert_fine_tuning',
    'embeddings_generation.tfidf_generator',
    'embeddings_generation.w2v_generator',
    'embeddings_generation.gate',

    # Модули из preprocessors
    'preprocessors.preprocessor_params_hex',
    'preprocessors.preprocessor_text',

    # Отдельные файлы в корне
    'parser_avito'
]

# Шаг 5: Импортируем все модули
imported_modules = {}
failed_modules = {}

for module_name in all_modules:
    try:
        module = importlib.import_module(module_name)
        imported_modules[module_name] = module
        print(f'✅ {module_name} успешно импортирован')
    except Exception as e:
        failed_modules[module_name] = str(e)
        print(f'❌ Ошибка импорта {module_name}: {str(e)[:200]}')  # Обрезаем длинные сообщения

Already up to date.
Добавлен путь: /content/Price-prediction-with-textual-data
Добавлен путь: /content/Price-prediction-with-textual-data/main_methods
Добавлен путь: /content/Price-prediction-with-textual-data/embeddings_generation
Добавлен путь: /content/Price-prediction-with-textual-data/preprocessors
✅ resource_monitor успешно импортирован
✅ ANN успешно импортирован
✅ predict успешно импортирован
✅ test_pipeline успешно импортирован
✅ embeddings_generation.rubert_fine_tuning успешно импортирован
✅ embeddings_generation.tfidf_generator успешно импортирован
✅ embeddings_generation.w2v_generator успешно импортирован
✅ embeddings_generation.gate успешно импортирован
✅ preprocessors.preprocessor_params_hex успешно импортирован
✅ preprocessors.preprocessor_text успешно импортирован
✅ parser_avito успешно импортирован


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
# Mount Google Drive at /content/drive, forcing a remount if it is already mounted.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [4]:
from parser_avito import AvitoParser
import pandas as pd

# NOTE: the parser does not work when run from Google Colaboratory, so it is
# better to run the scraping step locally. The call is kept below, commented
# out, as a usage example.
# process = AvitoParser(
#                       target_types = ['house'], # if empty parses all categories
#                       target_highways = ['Фряновское шоссе'], # if empty parses all highways
#                        path_links='raw_links_house.txt',
#                       df_path='data_new_house.csv',
#                       drop_prev_files=False,
#                       parse_new_links=True
#                       )
# process.initializer()

# df = pd.read_csv('data_new_house.csv')

In [8]:

# Load the listings CSV from the mounted Google Drive.
raw_data_path = '/content/drive/MyDrive/thesis/data_new_house.csv'
df = pd.read_csv(raw_data_path)

In [5]:
# #preprocess text

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

# # Additionally, for the Russian language:
# try:
#     nltk.data.find('tokenizers/punkt/russian.pickle')
# except LookupError:
#     nltk.download('punkt_tab')  # Data specific to the Russian language

# # !wget http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz
# # !tar -xvzf mystem-3.1-linux-64bit.tar.gz
# # !chmod +x mystem
# # !mv mystem /usr/local/bin/

In [9]:
from preprocessor_text import TextPreprocessor

# Run the project's text preprocessor over the 'description' column.
# NOTE(review): the flag names suggest spelling correction, lemmatization,
# stop-word removal and punctuation removal — see TextPreprocessor for the
# exact behaviour.
process = TextPreprocessor(
    df,
    text_columns=['description'],
    fix_spelling=True,
    lemmatize_text=True,
    remove_stopwords=True,
    remove_punctuation=True,
)
df_text_prep = process.process_dataframe()


Processing description: 100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


In [11]:
# This is how the final df is gathered:
#   * 'description_raw' keeps the untouched text (left raw for RuBERT),
#   * 'description' holds the preprocessed text from df_text_prep.
#
# The rename is guarded so the cell is safe to re-run: without the guard, a
# second execution would rename the already-processed 'description' column
# into a second 'description_raw', leaving duplicate column labels in df.
if 'description_raw' not in df.columns:
    df = df.rename(columns={'description': 'description_raw'})
df['description'] = df_text_prep['description']


In [None]:
# Write the assembled frame to the thesis support folder on Google Drive.
sample_output_path = r'/content/drive/MyDrive/thesis/support/sample_100.csv'
df.to_csv(sample_output_path)