In [1]:
import pandas as pd
import re

# Load the raw export. low_memory=False has pandas infer each column's dtype
# from the whole file instead of chunk by chunk.
df = pd.read_csv('PG_GET_FULL_ALL_202405171656.csv', low_memory=False)

# Unparseable dates become NaT (errors='coerce'); NaT has no year, so those
# rows never pass the year filter below.
df['startSaleDate'] = pd.to_datetime(df['startSaleDate'], errors='coerce')

# Premises types that are left out of the analysis.
exclude_types = [
    'Коттедж', 'Пентхаус', 'Сплитхаус', 'Квадрохаус', 'Таунхаус', 'Дуплекс',
    'Участок', 'Кладовка', 'Паркинг', 'Офис', 'Наземный паркинг', 'Гараж',
    'Цоколь',
]

# Keep rows whose sale started in 2023 or 2024 and whose type is not excluded.
started_in_period = df['startSaleDate'].dt.year.isin([2023, 2024])
is_excluded_type = df['PomeshcheniyeType'].isin(exclude_types)

# .copy() makes filtered_df an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
filtered_df = df[started_in_period & ~is_excluded_type].copy()

In [2]:
# Sanity check: list the distinct premises types that survived the filtering.
filtered_df.loc[:, 'PomeshcheniyeType'].unique()

array(['Квартира'], dtype=object)

In [3]:
# "<prefix><one whitespace char><trailing run of Cyrillic letters>".
# Ё/ё are listed explicitly because they lie outside the А-я code point range.
# The trailing run may be empty, so a single trailing whitespace also matches.
# NOTE(review): Kazakh-specific letters (e.g. Ұ, Қ, Ә) are still not covered —
# confirm whether trailing words containing them should be stripped as well.
_TRAILING_CYRILLIC_WORD = re.compile(r'^(.*?)\s[А-Яа-яЁё]*$')


def extract_complex_name(name):
    """Return ``name`` without its trailing Cyrillic word, whitespace-trimmed.

    Args:
        name: Object name. Non-string values (such as NaN from a missing
            cell) are returned unchanged instead of raising ``TypeError``.

    Returns:
        The part of ``name`` before the last whitespace-separated word when
        that word consists only of Cyrillic letters; otherwise ``name``
        itself. Surrounding whitespace is stripped in both cases.

    Note:
        The last word is dropped whenever it is purely Cyrillic, so a name
        made only of Cyrillic words loses its final word
        ('Клубный Дом' -> 'Клубный').
    """
    if not isinstance(name, str):
        return name
    match = _TRAILING_CYRILLIC_WORD.match(name)
    return match.group(1).strip() if match else name.strip()

# Derive the complex name of every row from its ObjectName.
filtered_df['ComplexName'] = filtered_df['ObjectName'].apply(extract_complex_name)

# Total Area per complex, one row per complex, with report-friendly column names.
complex_areas = (
    filtered_df
    .groupby('ComplexName', as_index=False)['Area']
    .sum()
    .rename(columns={'ComplexName': 'Complex', 'Area': 'TotalArea'})
)

# Summary figures: number of complexes and the mean of their total areas.
num_complexes = len(complex_areas)
average_area = complex_areas['TotalArea'].mean()

description = f'The analysis covers {num_complexes} building complexes for the period of 2023-2024.'

print(complex_areas)
print(f'Average area for complexes (2023-2024): {average_area:.2f} m²')
print(description)

                         Complex  TotalArea
0     4YOU Shymkent Business 1-1   16513.56
1     4YOU Shymkent Business 1-2   11518.04
2    4YOU Shymkent Bussiness 1-3    1950.17
3                   ALA Park - 4   15732.61
4                      Aisar - 2   15840.22
..                           ...        ...
121               Клубный Дом 44    5395.23
122         Поколение Бизнес - 2   16064.80
123         Поколение Бизнес - 3   11582.80
124                   Ұлы Дала 1   17188.28
125                   Ұлы Дала 2   21065.69

[126 rows x 2 columns]
Average area for complexes (2023-2024): 14535.07 m²
The analysis covers 126 building complexes for the period of 2023-2024.
