# **Case Study по модулю Plotly**
**Исполнитель:** [Behzod Jumaev]  
**Дата исполнения:** [15-22.11.2024]

In [70]:
import numpy as np
import pandas as pd
import re
import plotly
import plotly.express as px

from datetime import datetime
from connector import connect_to, read_credentials
from sqlalchemy import text, MetaData, Table, Column, Integer,\
                       String, Date, Numeric, ForeignKey, create_engine
from IPython.display import display



## Блок 1. Знакомство со структурой данных
1.1 Проведите обзор таблиц, которые будут задействованы в вашем анализе, а именно:
  * отобразите выборку датафреймов таблиц (10 строк)
  * укажите размер каждой таблицы

1.2 Проверьте таблицы на наличие пустых значений, и решите что с ними делать. Напишите обоснование своего решения.  
1.3 Проверьте типы данных столбцов в таблицах. В случае несоответствия типа данных содержимому столбцов, приведите их к соответствующему типу.  
1.4 Покажите базовую статистику по каждой таблице - например количество значений, max, min, median, mode, количество уникальных значений и т.д.


In [71]:
tables = ["customers", "territory", "product_category", "product_subcategory", "products", "sales"]


with connect_to() as pg:
    pg.execute(text("set search_path to adv_works;"))
    pg.commit()
    df_customers = pd.read_sql('select * from customers', pg)
    df_territory = pd.read_sql('select * from territory', pg)
    df_product_category = pd.read_sql('select * from product_category', pg)
    df_product_subcategory = pd.read_sql('select * from product_subcategory', pg)
    df_products = pd.read_sql('select * from products', pg)
    df_sales = pd.read_sql('select * from sales', pg)




In [72]:
# file_path = 'adventure_works.xlsx'
# excel_data = pd.ExcelFile(file_path)


# df_customers = excel_data.parse('Customers')
# df_territory = excel_data.parse('Territory')
# df_product_category = excel_data.parse('ProductCategory')
# df_product_subcategory = excel_data.parse('ProductSubCategory')
# df_products = excel_data.parse('Products')
# df_sales = excel_data.parse('Sales')


In [73]:
datasets = {
    "customers": df_customers,
    "territory": df_territory,
    "product_category": df_product_category,
    "product_sub_category": df_product_subcategory,
    "products": df_products,
    "sales": df_sales
}

for name, df in datasets.items():
    print(f"\nТаблица: {name} (Размер: {len(df)} строк)")
    display(df.head(10))  


Таблица: customers (Размер: 18484 строк)


Unnamed: 0,customer_key,geography_key,name,birth_date,marital_status,gender,yearly_income,number_children_at_home,occupation,house_owner_flag,number_cars_owned,address_line1,address_line2,phone,date_first_purchase
0,11602,135,Larry Gill,1977-04-13,S,M,30000.0,0,Clerical,0,1,Am Gallberg 645,,1 (11) 500 555-0125,2004-01-11
1,11603,244,Geoffrey Gonzalez,1977-02-06,S,M,30000.0,0,Clerical,0,1,1538 Golden Meadow,,1 (11) 500 555-0131,2002-07-21
2,11610,269,Blake Collins,1975-04-23,S,M,30000.0,0,Clerical,0,1,4519 Lydia Lane,,1 (11) 500 555-0140,2002-07-13
3,12517,133,Alexa Watson,1977-08-25,S,F,30000.0,0,Clerical,0,1,Residenz Straße 98,,1 (11) 500 555-0191,2004-04-21
4,12518,161,Jacquelyn Dominguez,1977-09-27,S,F,30000.0,0,Clerical,0,1,Werftstr 544,,1 (11) 500 555-0134,2004-02-02
5,12519,265,Casey Gutierrez,1977-12-17,S,M,30000.0,0,Clerical,0,1,2035 Emmons Canyon Lane,,1 (11) 500 555-0115,2003-11-12
6,12714,157,Colleen Lu,1973-07-17,S,F,30000.0,0,Clerical,0,1,Hüttenstr 20995,,1 (11) 500 555-0174,2004-01-20
7,12728,131,Jeremiah Stewart,1979-06-26,S,M,30000.0,0,Clerical,0,1,Alte Landstr 9902,,1 (11) 500 555-0129,2003-10-09
8,12871,233,Leah Li,1976-10-06,S,F,30000.0,0,Clerical,0,1,9405 Curletto Dr.,,1 (11) 500 555-0122,2004-05-12
9,13671,173,Frank Ramos,1974-02-07,S,M,30000.0,0,Clerical,0,1,Carlsplatz 43,,1 (11) 500 555-0146,2004-07-31



Таблица: territory (Размер: 11 строк)


Unnamed: 0,territory_key,region,country,group_name
0,1,Northwest,United States,North America
1,2,Northeast,United States,North America
2,3,Central,United States,North America
3,4,Southwest,United States,North America
4,5,Southeast,United States,North America
5,6,Canada,Canada,North America
6,7,France,France,Europe
7,8,Germany,Germany,Europe
8,9,Australia,Australia,Pacific
9,10,United Kingdom,United Kingdom,Europe



Таблица: product_category (Размер: 4 строк)


Unnamed: 0,product_category_key,product_category_alternate_key,english_product_category_name,spanish_product_category_name,french_product_category_name
0,1,1,Bikes,Bicicleta,Vélo
1,2,2,Components,Componente,Composant
2,3,3,Clothing,Prenda,Vêtements
3,4,4,Accessories,Accesorio,Accessoire



Таблица: product_sub_category (Размер: 37 строк)


Unnamed: 0,product_subcategory_key,product_subcategory_alternate_key,english_product_subcategory_name,spanish_product_subcategory_name,french_product_subcategory_name,product_category_key
0,1,1,Mountain Bikes,Bicicleta de montaña,VTT,1
1,2,2,Road Bikes,Bicicleta de carretera,Vélo de route,1
2,3,3,Touring Bikes,Bicicleta de paseo,Vélo de randonnée,1
3,4,4,Handlebars,Barra,Barre d'appui,2
4,5,5,Bottom Brackets,Eje de pedalier,Axe de pédalier,2
5,6,6,Brakes,Frenos,Freins,2
6,7,7,Chains,Cadena,Chaîne,2
7,8,8,Cranksets,Bielas,Pédalier,2
8,9,9,Derailleurs,Desviador,Dérailleur,2
9,10,10,Forks,Horquilla,Fourche,2



Таблица: products (Размер: 397 строк)


Unnamed: 0,product_key,product_subcategory_key,product_name,standard_cost,color,safety_stock_level,list_price,size,size_range,weight,days_to_manufacture,product_line,dealer_price,class,model_name,description,start_date,end_date,status
0,362,1,"Mountain-200 Black, 46",1105.81,Black,100,2049.0982,46,42-46 CM,24.13,4,M,1229.4589,H,Mountain-200,Serious back-country riding. Perfect for all l...,2002-07-01,2003-06-30,
1,363,1,"Mountain-200 Black, 46",1251.9813,Black,100,2294.99,46,42-46 CM,24.13,4,M,1376.994,H,Mountain-200,Serious back-country riding. Perfect for all l...,2003-07-01,,Current
2,364,1,"Mountain-300 Black, 38",598.4354,Black,100,1079.99,38,38-40 CM,25.35,4,M,647.994,M,Mountain-300,For true trail addicts. An extremely durable ...,2002-07-01,2003-06-30,
3,365,1,"Mountain-300 Black, 40",598.4354,Black,100,1079.99,40,38-40 CM,25.77,4,M,647.994,M,Mountain-300,For true trail addicts. An extremely durable ...,2002-07-01,2003-06-30,
4,587,1,"Mountain-400-W Silver, 38",419.7784,Silver,100,769.49,38,38-40 CM,26.35,4,M,461.694,M,Mountain-400-W,This bike delivers a high-level of performance...,2003-07-01,,Current
5,588,1,"Mountain-400-W Silver, 40",419.7784,Silver,100,769.49,40,38-40 CM,26.77,4,M,461.694,M,Mountain-400-W,This bike delivers a high-level of performance...,2003-07-01,,Current
6,589,1,"Mountain-400-W Silver, 42",419.7784,Silver,100,769.49,42,42-46 CM,27.13,4,M,461.694,M,Mountain-400-W,This bike delivers a high-level of performance...,2003-07-01,,Current
7,590,1,"Mountain-400-W Silver, 46",419.7784,Silver,100,769.49,46,42-46 CM,27.42,4,M,461.694,M,Mountain-400-W,This bike delivers a high-level of performance...,2003-07-01,,Current
8,344,1,"Mountain-100 Silver, 38",1912.1544,Silver,100,3399.99,38,38-40 CM,20.35,4,M,2039.994,H,Mountain-100,Top-of-the-line competition mountain bike. Per...,2001-07-01,2002-06-30,
9,345,1,"Mountain-100 Silver, 42",1912.1544,Silver,100,3399.99,42,42-46 CM,20.77,4,M,2039.994,H,Mountain-100,Top-of-the-line competition mountain bike. Per...,2001-07-01,2002-06-30,



Таблица: sales (Размер: 60398 строк)


Unnamed: 0,product_key,order_date,order_date_key,customer_key,sales_territory_key,sales_order_number,sales_order_line_number,order_quantity,unit_price,extended_amount,unit_price_discount_pct,discount_amount,product_standard_cost,total_product_cost,sales_amount,tax_amt,freight,region_month_id
0,528,2003-09-29,20030929,16115,4,SO55161,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest9
1,528,2003-10-01,20031001,15307,4,SO55352,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
2,528,2003-10-05,20031005,16003,4,SO55578,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
3,528,2003-10-06,20031006,15883,4,SO55635,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
4,528,2003-10-08,20031008,15368,4,SO55767,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
5,528,2003-10-08,20031008,15273,4,SO55770,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
6,528,2003-10-08,20031008,11499,4,SO55782,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
7,528,2003-10-08,20031008,12960,4,SO55786,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
8,528,2003-10-10,20031010,15875,4,SO55892,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
9,528,2003-10-11,20031011,16238,4,SO55943,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10


In [74]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000) 

In [75]:
for name, df in datasets.items():
    print(f"\nТаблица: {name} (Размер: {len(df)} строк, {df.shape[1]} столбцов)")
    display(df.head(10))



Таблица: customers (Размер: 18484 строк, 15 столбцов)


Unnamed: 0,customer_key,geography_key,name,birth_date,marital_status,gender,yearly_income,number_children_at_home,occupation,house_owner_flag,number_cars_owned,address_line1,address_line2,phone,date_first_purchase
0,11602,135,Larry Gill,1977-04-13,S,M,30000.0,0,Clerical,0,1,Am Gallberg 645,,1 (11) 500 555-0125,2004-01-11
1,11603,244,Geoffrey Gonzalez,1977-02-06,S,M,30000.0,0,Clerical,0,1,1538 Golden Meadow,,1 (11) 500 555-0131,2002-07-21
2,11610,269,Blake Collins,1975-04-23,S,M,30000.0,0,Clerical,0,1,4519 Lydia Lane,,1 (11) 500 555-0140,2002-07-13
3,12517,133,Alexa Watson,1977-08-25,S,F,30000.0,0,Clerical,0,1,Residenz Straße 98,,1 (11) 500 555-0191,2004-04-21
4,12518,161,Jacquelyn Dominguez,1977-09-27,S,F,30000.0,0,Clerical,0,1,Werftstr 544,,1 (11) 500 555-0134,2004-02-02
5,12519,265,Casey Gutierrez,1977-12-17,S,M,30000.0,0,Clerical,0,1,2035 Emmons Canyon Lane,,1 (11) 500 555-0115,2003-11-12
6,12714,157,Colleen Lu,1973-07-17,S,F,30000.0,0,Clerical,0,1,Hüttenstr 20995,,1 (11) 500 555-0174,2004-01-20
7,12728,131,Jeremiah Stewart,1979-06-26,S,M,30000.0,0,Clerical,0,1,Alte Landstr 9902,,1 (11) 500 555-0129,2003-10-09
8,12871,233,Leah Li,1976-10-06,S,F,30000.0,0,Clerical,0,1,9405 Curletto Dr.,,1 (11) 500 555-0122,2004-05-12
9,13671,173,Frank Ramos,1974-02-07,S,M,30000.0,0,Clerical,0,1,Carlsplatz 43,,1 (11) 500 555-0146,2004-07-31



Таблица: territory (Размер: 11 строк, 4 столбцов)


Unnamed: 0,territory_key,region,country,group_name
0,1,Northwest,United States,North America
1,2,Northeast,United States,North America
2,3,Central,United States,North America
3,4,Southwest,United States,North America
4,5,Southeast,United States,North America
5,6,Canada,Canada,North America
6,7,France,France,Europe
7,8,Germany,Germany,Europe
8,9,Australia,Australia,Pacific
9,10,United Kingdom,United Kingdom,Europe



Таблица: product_category (Размер: 4 строк, 5 столбцов)


Unnamed: 0,product_category_key,product_category_alternate_key,english_product_category_name,spanish_product_category_name,french_product_category_name
0,1,1,Bikes,Bicicleta,Vélo
1,2,2,Components,Componente,Composant
2,3,3,Clothing,Prenda,Vêtements
3,4,4,Accessories,Accesorio,Accessoire



Таблица: product_sub_category (Размер: 37 строк, 6 столбцов)


Unnamed: 0,product_subcategory_key,product_subcategory_alternate_key,english_product_subcategory_name,spanish_product_subcategory_name,french_product_subcategory_name,product_category_key
0,1,1,Mountain Bikes,Bicicleta de montaña,VTT,1
1,2,2,Road Bikes,Bicicleta de carretera,Vélo de route,1
2,3,3,Touring Bikes,Bicicleta de paseo,Vélo de randonnée,1
3,4,4,Handlebars,Barra,Barre d'appui,2
4,5,5,Bottom Brackets,Eje de pedalier,Axe de pédalier,2
5,6,6,Brakes,Frenos,Freins,2
6,7,7,Chains,Cadena,Chaîne,2
7,8,8,Cranksets,Bielas,Pédalier,2
8,9,9,Derailleurs,Desviador,Dérailleur,2
9,10,10,Forks,Horquilla,Fourche,2



Таблица: products (Размер: 397 строк, 19 столбцов)


Unnamed: 0,product_key,product_subcategory_key,product_name,standard_cost,color,safety_stock_level,list_price,size,size_range,weight,days_to_manufacture,product_line,dealer_price,class,model_name,description,start_date,end_date,status
0,362,1,"Mountain-200 Black, 46",1105.81,Black,100,2049.0982,46,42-46 CM,24.13,4,M,1229.4589,H,Mountain-200,Serious back-country riding. Perfect for all l...,2002-07-01,2003-06-30,
1,363,1,"Mountain-200 Black, 46",1251.9813,Black,100,2294.99,46,42-46 CM,24.13,4,M,1376.994,H,Mountain-200,Serious back-country riding. Perfect for all l...,2003-07-01,,Current
2,364,1,"Mountain-300 Black, 38",598.4354,Black,100,1079.99,38,38-40 CM,25.35,4,M,647.994,M,Mountain-300,For true trail addicts. An extremely durable ...,2002-07-01,2003-06-30,
3,365,1,"Mountain-300 Black, 40",598.4354,Black,100,1079.99,40,38-40 CM,25.77,4,M,647.994,M,Mountain-300,For true trail addicts. An extremely durable ...,2002-07-01,2003-06-30,
4,587,1,"Mountain-400-W Silver, 38",419.7784,Silver,100,769.49,38,38-40 CM,26.35,4,M,461.694,M,Mountain-400-W,This bike delivers a high-level of performance...,2003-07-01,,Current
5,588,1,"Mountain-400-W Silver, 40",419.7784,Silver,100,769.49,40,38-40 CM,26.77,4,M,461.694,M,Mountain-400-W,This bike delivers a high-level of performance...,2003-07-01,,Current
6,589,1,"Mountain-400-W Silver, 42",419.7784,Silver,100,769.49,42,42-46 CM,27.13,4,M,461.694,M,Mountain-400-W,This bike delivers a high-level of performance...,2003-07-01,,Current
7,590,1,"Mountain-400-W Silver, 46",419.7784,Silver,100,769.49,46,42-46 CM,27.42,4,M,461.694,M,Mountain-400-W,This bike delivers a high-level of performance...,2003-07-01,,Current
8,344,1,"Mountain-100 Silver, 38",1912.1544,Silver,100,3399.99,38,38-40 CM,20.35,4,M,2039.994,H,Mountain-100,Top-of-the-line competition mountain bike. Per...,2001-07-01,2002-06-30,
9,345,1,"Mountain-100 Silver, 42",1912.1544,Silver,100,3399.99,42,42-46 CM,20.77,4,M,2039.994,H,Mountain-100,Top-of-the-line competition mountain bike. Per...,2001-07-01,2002-06-30,



Таблица: sales (Размер: 60398 строк, 18 столбцов)


Unnamed: 0,product_key,order_date,order_date_key,customer_key,sales_territory_key,sales_order_number,sales_order_line_number,order_quantity,unit_price,extended_amount,unit_price_discount_pct,discount_amount,product_standard_cost,total_product_cost,sales_amount,tax_amt,freight,region_month_id
0,528,2003-09-29,20030929,16115,4,SO55161,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest9
1,528,2003-10-01,20031001,15307,4,SO55352,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
2,528,2003-10-05,20031005,16003,4,SO55578,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
3,528,2003-10-06,20031006,15883,4,SO55635,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
4,528,2003-10-08,20031008,15368,4,SO55767,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
5,528,2003-10-08,20031008,15273,4,SO55770,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
6,528,2003-10-08,20031008,11499,4,SO55782,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
7,528,2003-10-08,20031008,12960,4,SO55786,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
8,528,2003-10-10,20031010,15875,4,SO55892,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
9,528,2003-10-11,20031011,16238,4,SO55943,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10


Выводы: ознакомление с данными показало, что таблицы имеют разную структуру и объем данных, что требует разных подходов для дальнейшего анализа.

**Работа с пустыми значниеями**

In [76]:
print("\nПроверка на наличие пустых значений:")
for name, df in datasets.items():
    print(f"\nТаблица: {name}")
    display(df.isnull().sum())


Проверка на наличие пустых значений:

Таблица: customers


customer_key                   0
geography_key                  0
name                           0
birth_date                     0
marital_status                 0
gender                         0
yearly_income                  0
number_children_at_home        0
occupation                     0
house_owner_flag               0
number_cars_owned              0
address_line1                  0
address_line2              18172
phone                          0
date_first_purchase            0
dtype: int64


Таблица: territory


territory_key    0
region           1
country          1
group_name       1
dtype: int64


Таблица: product_category


product_category_key              0
product_category_alternate_key    0
english_product_category_name     0
spanish_product_category_name     0
french_product_category_name      0
dtype: int64


Таблица: product_sub_category


product_subcategory_key              0
product_subcategory_alternate_key    0
english_product_subcategory_name     0
spanish_product_subcategory_name     0
french_product_subcategory_name      0
product_category_key                 0
dtype: int64


Таблица: products


product_key                  0
product_subcategory_key      0
product_name                 0
standard_cost                2
color                       56
safety_stock_level           0
list_price                   2
size                        98
size_range                  98
weight                     122
days_to_manufacture          0
product_line                17
dealer_price                 2
class                       85
model_name                   0
description                  1
start_date                   0
end_date                   197
status                     200
dtype: int64


Таблица: sales


product_key                0
order_date                 0
order_date_key             0
customer_key               0
sales_territory_key        0
sales_order_number         0
sales_order_line_number    0
order_quantity             0
unit_price                 0
extended_amount            0
unit_price_discount_pct    0
discount_amount            0
product_standard_cost      0
total_product_cost         0
sales_amount               0
tax_amt                    0
freight                    0
region_month_id            0
dtype: int64

In [77]:
def process_products(df):
    df = df.dropna(subset=['product_key', 'product_subcategory_key']).copy()

    for col in ['standard_cost', 'list_price', 'dealer_price', 'weight']:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())
    
    for col in ['product_name', 'color', 'product_line', 'class', 'description', 'status', 'size', 'size_range']:
        if col in df.columns:
            df[col] = df[col].fillna("Unknown")
    
    if 'end_date' in df.columns:
        df['end_date'] = df['end_date'].fillna("9999-12-31")

    return df


df_customers['address_line2'] = df_customers['address_line2'].fillna('Unknown')

df_territory = df_territory.drop(10, axis=0)

df_products = process_products(df_products)


In [78]:
datasets = {
    "customers": df_customers,
    "territory": df_territory,
    "product_category": df_product_category,
    "product_sub_category": df_product_subcategory,
    "products": df_products,
    "sales": df_sales
}

print("\nПроверка на наличие пустых значений:")
for name, df in datasets.items():
    print(f"\nТаблица: {name}")
    display(df.isnull().sum())


Проверка на наличие пустых значений:

Таблица: customers


customer_key               0
geography_key              0
name                       0
birth_date                 0
marital_status             0
gender                     0
yearly_income              0
number_children_at_home    0
occupation                 0
house_owner_flag           0
number_cars_owned          0
address_line1              0
address_line2              0
phone                      0
date_first_purchase        0
dtype: int64


Таблица: territory


territory_key    0
region           0
country          0
group_name       0
dtype: int64


Таблица: product_category


product_category_key              0
product_category_alternate_key    0
english_product_category_name     0
spanish_product_category_name     0
french_product_category_name      0
dtype: int64


Таблица: product_sub_category


product_subcategory_key              0
product_subcategory_alternate_key    0
english_product_subcategory_name     0
spanish_product_subcategory_name     0
french_product_subcategory_name      0
product_category_key                 0
dtype: int64


Таблица: products


product_key                0
product_subcategory_key    0
product_name               0
standard_cost              0
color                      0
safety_stock_level         0
list_price                 0
size                       0
size_range                 0
weight                     0
days_to_manufacture        0
product_line               0
dealer_price               0
class                      0
model_name                 0
description                0
start_date                 0
end_date                   0
status                     0
dtype: int64


Таблица: sales


product_key                0
order_date                 0
order_date_key             0
customer_key               0
sales_territory_key        0
sales_order_number         0
sales_order_line_number    0
order_quantity             0
unit_price                 0
extended_amount            0
unit_price_discount_pct    0
discount_amount            0
product_standard_cost      0
total_product_cost         0
sales_amount               0
tax_amt                    0
freight                    0
region_month_id            0
dtype: int64

Обоснование: если пропущенные значения незначительны и их количество мало, можно удалить эти строки. Если же количество велико или значения критичны, лучше заполнить их средними/медианными значениями (для строк).

### **Работа с не правильними типами данных**

In [79]:
df_customers.head(5)

Unnamed: 0,customer_key,geography_key,name,birth_date,marital_status,gender,yearly_income,number_children_at_home,occupation,house_owner_flag,number_cars_owned,address_line1,address_line2,phone,date_first_purchase
0,11602,135,Larry Gill,1977-04-13,S,M,30000.0,0,Clerical,0,1,Am Gallberg 645,Unknown,1 (11) 500 555-0125,2004-01-11
1,11603,244,Geoffrey Gonzalez,1977-02-06,S,M,30000.0,0,Clerical,0,1,1538 Golden Meadow,Unknown,1 (11) 500 555-0131,2002-07-21
2,11610,269,Blake Collins,1975-04-23,S,M,30000.0,0,Clerical,0,1,4519 Lydia Lane,Unknown,1 (11) 500 555-0140,2002-07-13
3,12517,133,Alexa Watson,1977-08-25,S,F,30000.0,0,Clerical,0,1,Residenz Straße 98,Unknown,1 (11) 500 555-0191,2004-04-21
4,12518,161,Jacquelyn Dominguez,1977-09-27,S,F,30000.0,0,Clerical,0,1,Werftstr 544,Unknown,1 (11) 500 555-0134,2004-02-02


In [80]:
df_customers['house_owner_flag'].unique()

array(['0', '1'], dtype=object)

In [81]:
def normalize_phone_number(phone):
    normalized = re.sub(r"[^\d+]", "", phone)
    
    if not normalized.startswith("+"):
        normalized = "+1" + normalized
    return normalized

df_customers["phone"] = df_customers["phone"].apply(normalize_phone_number)

df_customers['yearly_income'] = df_customers['yearly_income'].astype(np.int64)

df_customers['house_owner_flag'] = df_customers['house_owner_flag'].astype(np.int64)


print(df_customers.dtypes)
display(df_customers.head(5))

customer_key                int64
geography_key               int64
name                       object
birth_date                 object
marital_status             object
gender                     object
yearly_income               int64
number_children_at_home     int64
occupation                 object
house_owner_flag            int64
number_cars_owned           int64
address_line1              object
address_line2              object
phone                      object
date_first_purchase        object
dtype: object


Unnamed: 0,customer_key,geography_key,name,birth_date,marital_status,gender,yearly_income,number_children_at_home,occupation,house_owner_flag,number_cars_owned,address_line1,address_line2,phone,date_first_purchase
0,11602,135,Larry Gill,1977-04-13,S,M,30000,0,Clerical,0,1,Am Gallberg 645,Unknown,11115005550125,2004-01-11
1,11603,244,Geoffrey Gonzalez,1977-02-06,S,M,30000,0,Clerical,0,1,1538 Golden Meadow,Unknown,11115005550131,2002-07-21
2,11610,269,Blake Collins,1975-04-23,S,M,30000,0,Clerical,0,1,4519 Lydia Lane,Unknown,11115005550140,2002-07-13
3,12517,133,Alexa Watson,1977-08-25,S,F,30000,0,Clerical,0,1,Residenz Straße 98,Unknown,11115005550191,2004-04-21
4,12518,161,Jacquelyn Dominguez,1977-09-27,S,F,30000,0,Clerical,0,1,Werftstr 544,Unknown,11115005550134,2004-02-02


In [82]:
display(df_territory.dtypes)
display(df_territory.head(5))

territory_key     int64
region           object
country          object
group_name       object
dtype: object

Unnamed: 0,territory_key,region,country,group_name
0,1,Northwest,United States,North America
1,2,Northeast,United States,North America
2,3,Central,United States,North America
3,4,Southwest,United States,North America
4,5,Southeast,United States,North America


In [83]:
display(df_product_category.dtypes)
display(df_product_category.head(5))

product_category_key               int64
product_category_alternate_key     int64
english_product_category_name     object
spanish_product_category_name     object
french_product_category_name      object
dtype: object

Unnamed: 0,product_category_key,product_category_alternate_key,english_product_category_name,spanish_product_category_name,french_product_category_name
0,1,1,Bikes,Bicicleta,Vélo
1,2,2,Components,Componente,Composant
2,3,3,Clothing,Prenda,Vêtements
3,4,4,Accessories,Accesorio,Accessoire


In [84]:
display(df_product_subcategory.dtypes)
display(df_product_subcategory.head(5))

product_subcategory_key               int64
product_subcategory_alternate_key     int64
english_product_subcategory_name     object
spanish_product_subcategory_name     object
french_product_subcategory_name      object
product_category_key                  int64
dtype: object

Unnamed: 0,product_subcategory_key,product_subcategory_alternate_key,english_product_subcategory_name,spanish_product_subcategory_name,french_product_subcategory_name,product_category_key
0,1,1,Mountain Bikes,Bicicleta de montaña,VTT,1
1,2,2,Road Bikes,Bicicleta de carretera,Vélo de route,1
2,3,3,Touring Bikes,Bicicleta de paseo,Vélo de randonnée,1
3,4,4,Handlebars,Barra,Barre d'appui,2
4,5,5,Bottom Brackets,Eje de pedalier,Axe de pédalier,2


In [85]:
display(df_products.dtypes)
display(df_products.head(5))

product_key                  int64
product_subcategory_key      int64
product_name                object
standard_cost              float64
color                       object
safety_stock_level           int64
list_price                 float64
size                        object
size_range                  object
weight                     float64
days_to_manufacture          int64
product_line                object
dealer_price               float64
class                       object
model_name                  object
description                 object
start_date                  object
end_date                    object
status                      object
dtype: object

Unnamed: 0,product_key,product_subcategory_key,product_name,standard_cost,color,safety_stock_level,list_price,size,size_range,weight,days_to_manufacture,product_line,dealer_price,class,model_name,description,start_date,end_date,status
0,362,1,"Mountain-200 Black, 46",1105.81,Black,100,2049.0982,46,42-46 CM,24.13,4,M,1229.4589,H,Mountain-200,Serious back-country riding. Perfect for all l...,2002-07-01,2003-06-30,Unknown
1,363,1,"Mountain-200 Black, 46",1251.9813,Black,100,2294.99,46,42-46 CM,24.13,4,M,1376.994,H,Mountain-200,Serious back-country riding. Perfect for all l...,2003-07-01,9999-12-31,Current
2,364,1,"Mountain-300 Black, 38",598.4354,Black,100,1079.99,38,38-40 CM,25.35,4,M,647.994,M,Mountain-300,For true trail addicts. An extremely durable ...,2002-07-01,2003-06-30,Unknown
3,365,1,"Mountain-300 Black, 40",598.4354,Black,100,1079.99,40,38-40 CM,25.77,4,M,647.994,M,Mountain-300,For true trail addicts. An extremely durable ...,2002-07-01,2003-06-30,Unknown
4,587,1,"Mountain-400-W Silver, 38",419.7784,Silver,100,769.49,38,38-40 CM,26.35,4,M,461.694,M,Mountain-400-W,This bike delivers a high-level of performance...,2003-07-01,9999-12-31,Current


In [86]:
display(df_sales.dtypes)
display(df_sales.head(5))

product_key                  int64
order_date                  object
order_date_key               int64
customer_key                 int64
sales_territory_key          int64
sales_order_number          object
sales_order_line_number      int64
order_quantity               int64
unit_price                 float64
extended_amount            float64
unit_price_discount_pct    float64
discount_amount            float64
product_standard_cost      float64
total_product_cost         float64
sales_amount               float64
tax_amt                    float64
freight                    float64
region_month_id             object
dtype: object

Unnamed: 0,product_key,order_date,order_date_key,customer_key,sales_territory_key,sales_order_number,sales_order_line_number,order_quantity,unit_price,extended_amount,unit_price_discount_pct,discount_amount,product_standard_cost,total_product_cost,sales_amount,tax_amt,freight,region_month_id
0,528,2003-09-29,20030929,16115,4,SO55161,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest9
1,528,2003-10-01,20031001,15307,4,SO55352,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
2,528,2003-10-05,20031005,16003,4,SO55578,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
3,528,2003-10-06,20031006,15883,4,SO55635,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10
4,528,2003-10-08,20031008,15368,4,SO55767,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10


Выводы: типы данных были приведены в соответствие с содержимым, что позволит корректно выполнять дальнейшие преобразования и анализ.

**Базовые статистики**

In [87]:
print("\nБазовая статистика:")
for name, df in datasets.items():
    print(f"\nТаблица: {name}")
    numeric_summary = df.describe(include=[np.number])
    categorical_summary = df.describe(include=[object])

    display(numeric_summary)
    display(categorical_summary)


Базовая статистика:

Таблица: customers


Unnamed: 0,customer_key,geography_key,yearly_income,number_children_at_home,house_owner_flag,number_cars_owned
count,18484.0,18484.0,18484.0,18484.0,18484.0,18484.0
mean,20241.5,257.956287,57305.77797,1.004058,0.676369,1.502705
std,5336.015523,196.531062,32285.841703,1.52266,0.467874,1.138394
min,11000.0,2.0,10000.0,0.0,0.0,0.0
25%,15620.75,62.0,30000.0,0.0,0.0,1.0
50%,20241.5,240.0,60000.0,0.0,1.0,2.0
75%,24862.25,345.0,70000.0,2.0,1.0,2.0
max,29483.0,654.0,170000.0,5.0,1.0,4.0


Unnamed: 0,name,birth_date,marital_status,gender,occupation,address_line1,address_line2,phone,date_first_purchase
count,18484,18484,18484,18484,18484,18484,18484,18484,18484
unique,18400,8252,2,2,5,12797,167,8890,1124
top,Mohamed Pal,1967-05-14,M,M,Professional,Attaché de Presse,Unknown,11115005550118,2003-08-08
freq,3,12,10011,9351,5520,17,18172,205,56



Таблица: territory


Unnamed: 0,territory_key
count,10.0
mean,5.5
std,3.02765
min,1.0
25%,3.25
50%,5.5
75%,7.75
max,10.0


Unnamed: 0,region,country,group_name
count,10,10,10
unique,10,6,3
top,Northwest,United States,North America
freq,1,5,6



Таблица: product_category


Unnamed: 0,product_category_key,product_category_alternate_key
count,4.0,4.0
mean,2.5,2.5
std,1.290994,1.290994
min,1.0,1.0
25%,1.75,1.75
50%,2.5,2.5
75%,3.25,3.25
max,4.0,4.0


Unnamed: 0,english_product_category_name,spanish_product_category_name,french_product_category_name
count,4,4,4
unique,4,4,4
top,Bikes,Bicicleta,Vélo
freq,1,1,1



Таблица: product_sub_category


Unnamed: 0,product_subcategory_key,product_subcategory_alternate_key,product_category_key
count,37.0,37.0,37.0
mean,19.0,19.0,2.783784
std,10.824355,10.824355,1.003747
min,1.0,1.0,1.0
25%,10.0,10.0,2.0
50%,19.0,19.0,3.0
75%,28.0,28.0,4.0
max,37.0,37.0,4.0


Unnamed: 0,english_product_subcategory_name,spanish_product_subcategory_name,french_product_subcategory_name
count,37,37,37
unique,37,37,37
top,Mountain Bikes,Bicicleta de montaña,VTT
freq,1,1,1



Таблица: products


Unnamed: 0,product_key,product_subcategory_key,standard_cost,safety_stock_level,list_price,weight,days_to_manufacture,dealer_price
count,397.0,397.0,397.0,397.0,397.0,397.0,397.0,397.0
mean,408.0,12.128463,433.108949,282.851385,745.729411,36.907683,1.765743,447.437649
std,114.748275,9.423328,496.389279,220.417534,837.029572,123.794193,1.578913,502.217743
min,210.0,1.0,0.8565,4.0,2.29,2.12,0.0,1.374
25%,309.0,2.0,37.1209,100.0,69.99,2.85,1.0,41.994
50%,408.0,13.0,204.6251,500.0,364.09,15.13,1.0,218.454
75%,507.0,16.0,660.9142,500.0,1204.3248,19.42,4.0,722.5949
max,606.0,37.0,2171.2942,500.0,3578.27,1050.0,4.0,2146.962


Unnamed: 0,product_name,color,size,size_range,product_line,class,model_name,description,start_date,end_date,status
count,397,397,397,397,397,397,397,397,397,397,397
unique,295,10,19,11,5,4,119,115,4,3,2
top,"HL Mountain Frame - Black, 38",Black,Unknown,Unknown,R,H,LL Road Frame,"The LL Frame provides a safe comfortable ride,...",2003-07-01,9999-12-31,Unknown
freq,3,129,98,98,162,121,30,30,195,197,200



Таблица: sales


Unnamed: 0,product_key,order_date_key,customer_key,sales_territory_key,sales_order_line_number,order_quantity,unit_price,extended_amount,unit_price_discount_pct,discount_amount,product_standard_cost,total_product_cost,sales_amount,tax_amt,freight
count,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0,60398.0
mean,437.557932,20035220.0,18841.68542,6.244462,1.886321,1.0,486.086911,486.086911,0.0,0.0,286.065657,286.065657,486.086911,38.886954,12.152217
std,118.08839,6394.891,5432.430404,2.96115,1.016328,0.0,928.489892,928.489892,0.0,0.0,552.457641,552.457641,928.489892,74.279193,23.212248
min,214.0,20010700.0,11000.0,1.0,1.0,1.0,2.29,2.29,0.0,0.0,0.8565,0.8565,2.29,0.1832,0.0573
25%,359.0,20031000.0,14003.0,4.0,1.0,1.0,7.95,7.95,0.0,0.0,2.9733,2.9733,7.95,0.636,0.1988
50%,479.0,20040120.0,18143.0,7.0,2.0,1.0,29.99,29.99,0.0,0.0,11.2163,11.2163,29.99,2.3992,0.7498
75%,529.0,20040420.0,23429.75,9.0,2.0,1.0,539.99,539.99,0.0,0.0,294.5797,294.5797,539.99,43.1992,13.4998
max,606.0,20040730.0,29483.0,10.0,8.0,1.0,3578.27,3578.27,0.0,0.0,2171.2942,2171.2942,3578.27,286.2616,89.4568


Unnamed: 0,order_date,sales_order_number,region_month_id
count,60398,60398,60398
unique,1124,27659,105
top,2004-06-14,SO58845,Australia6
freq,263,8,1359


## 2. Анализ клиентской базы  
2.1 Визуализируйте распределение возраста клиентов с помощью гистограммы и коробочной диаграммы (boxplot). Опишите ваши наблюдения.  
2.2 Проверьте распределение на наличие выбросов и решите что с ними делать. Напишите обоснование своего решения.  
2.3 Создайте диаграммы для отображения разбивки клиентов по регионам и странам. Опишите ваши наблюдения.  
2.4 Визуализируйте распределение персонального дохода клиентов в целом и в разбивке по полу, семейному положению, сфере деятельности и регионам. Опишите ваши наблюдения.  


#### 2.1 Гистограмма и boxplot возраста

In [88]:
df_customers['birth_date'] = pd.to_datetime(df_customers['birth_date'], errors='coerce')
df_customers['age'] = (pd.Timestamp.now() - df_customers['birth_date']).dt.days // 365

fig_age_hist = px.histogram(df_customers, x='age', nbins=20, title="Распределение возраста клиентов (Гистограмма)")
fig_age_hist.show()

fig_age_box = px.box(df_customers, y='age', title="Распределение возраста клиентов (Boxplot)")
fig_age_box.show()




In [89]:
q1 = df_customers['age'].quantile(0.25)
q3 = df_customers['age'].quantile(0.75)
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

df_customers_cleaned = df_customers[(df_customers['age'] >= lower_bound) & (df_customers['age'] <= upper_bound)]


In [90]:
age_dist = df_customers_cleaned['age'].value_counts().reset_index()
age_dist.columns = ['age', 'count']
age_dist.sort_values(by='age').reset_index(drop=True).T


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51
age,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94
count,9,254,324,403,385,447,461,518,546,557,593,609,648,675,642,654,673,629,635,568,555,522,500,514,501,483,439,427,396,330,332,310,287,292,288,261,210,217,210,176,157,136,118,104,101,94,69,57,46,23,14,10


#### 2.2 Анализ выбросов возраста

In [91]:
age_q1 = df_customers['age'].quantile(0.25)
age_q3 = df_customers['age'].quantile(0.75)
age_iqr = age_q3 - age_q1
age_lower_bound = age_q1 - 1.5 * age_iqr
age_upper_bound = age_q3 + 1.5 * age_iqr

customers_cleaned = df_customers[(df_customers['age'] >= age_lower_bound) & (df_customers['age'] <= age_upper_bound)]
customers_cleaned.head()


Unnamed: 0,customer_key,geography_key,name,birth_date,marital_status,gender,yearly_income,number_children_at_home,occupation,house_owner_flag,number_cars_owned,address_line1,address_line2,phone,date_first_purchase,age
0,11602,135,Larry Gill,1977-04-13,S,M,30000,0,Clerical,0,1,Am Gallberg 645,Unknown,11115005550125,2004-01-11,47
1,11603,244,Geoffrey Gonzalez,1977-02-06,S,M,30000,0,Clerical,0,1,1538 Golden Meadow,Unknown,11115005550131,2002-07-21,47
2,11610,269,Blake Collins,1975-04-23,S,M,30000,0,Clerical,0,1,4519 Lydia Lane,Unknown,11115005550140,2002-07-13,49
3,12517,133,Alexa Watson,1977-08-25,S,F,30000,0,Clerical,0,1,Residenz Straße 98,Unknown,11115005550191,2004-04-21,47
4,12518,161,Jacquelyn Dominguez,1977-09-27,S,F,30000,0,Clerical,0,1,Werftstr 544,Unknown,11115005550134,2004-02-02,47


#### 2.3 Разбивка клиентов по регионам и странам

In [100]:
for name, data in datasets.items():
    print(f'{name}  ||  ||{data.columns}')

customers  ||  ||Index(['customer_key', 'geography_key', 'name', 'birth_date', 'marital_status', 'gender', 'yearly_income', 'number_children_at_home', 'occupation', 'house_owner_flag', 'number_cars_owned', 'address_line1', 'address_line2', 'phone', 'date_first_purchase', 'age'], dtype='object')
territory  ||  ||Index(['territory_key', 'region', 'country', 'group_name'], dtype='object')
product_category  ||  ||Index(['product_category_key', 'product_category_alternate_key', 'english_product_category_name', 'spanish_product_category_name', 'french_product_category_name'], dtype='object')
product_sub_category  ||  ||Index(['product_subcategory_key', 'product_subcategory_alternate_key', 'english_product_subcategory_name', 'spanish_product_subcategory_name', 'french_product_subcategory_name', 'product_category_key'], dtype='object')
products  ||  ||Index(['product_key', 'product_subcategory_key', 'product_name', 'standard_cost', 'color', 'safety_stock_level', 'list_price', 'size', 'size_ran

In [115]:
customers_with_territory = df_customers.merge(df_territory, left_on='geography_key', right_on='territory_key', how='left')

if 'region' in customers_with_territory.columns:
    fig_region_bar = px.bar(customers_with_territory.groupby('region').size().reset_index(name='count'), x='region', y='count', title="Разбивка клиентов по регионам")
    fig_region_bar.update_xaxes(title='Регион')
    fig_region_bar.update_yaxes(title='Количество клиентов')
    fig_region_bar.show()

if 'country' in customers_with_territory.columns:
    fig_country_bar = px.bar(customers_with_territory.groupby('country').size().reset_index(name='count'), x='country', y='count', title="Разбивка клиентов по странам")
    fig_country_bar.update_xaxes(title='Страна')
    fig_country_bar.update_yaxes(title='Количество клиентов')
    fig_country_bar.show()

In [26]:
fig_income_marital = px.box(df_customers, x='marital_status', y='yearly_income', title="Распределение дохода по семейному положению")
fig_income_marital.show()


Выводы: визуализация позволяет увидеть, в каких регионах и странах находится наибольшее количество клиентов, что полезно для таргетинга и маркетинга.

### 2.4 Визуализация распределения персонального дохода клиентов

In [105]:
if 'yearly_income' in df_customers.columns:
    fig = px.histogram(df_customers, x='yearly_income', nbins=20, title='Распределение дохода клиентов')
    fig.update_xaxes(title='Доход')
    fig.update_yaxes(title='Количество клиентов')
    fig.show()

if 'gender' in df_customers.columns:
    fig = px.histogram(df_customers, x='yearly_income', color='gender', nbins=20, title='Распределение дохода клиентов в разбивке по полу')
    fig.update_xaxes(title='Доход')
    fig.update_yaxes(title='Количество клиентов')
    fig.show()

if 'marital_status' in df_customers.columns:
    fig = px.histogram(df_customers, x='yearly_income', color='marital_status', nbins=20, title='Распределение дохода клиентов в разбивке по семейному положению')
    fig.update_xaxes(title='Доход')
    fig.update_yaxes(title='Количество клиентов')
    fig.show()

if 'occupation' in df_customers.columns:
    fig = px.histogram(df_customers, x='yearly_income', color='occupation', nbins=20, title='Распределение дохода клиентов в разбивке по сфере деятельности')
    fig.update_xaxes(title='Доход')
    fig.update_yaxes(title='Количество клиентов')
    fig.show()

if 'region' in customers_with_territory.columns:
    fig = px.histogram(customers_with_territory, x='yearly_income', color='region', nbins=20, title='Распределение дохода клиентов в разбивке по регионам')
    fig.update_xaxes(title='Доход')
    fig.update_yaxes(title='Количество клиентов')
    fig.show()

Выводы: визуализация доходов клиентов по различным категориям (пол, семейное положение, профессия и регион) помогает выявить потенциальные закономерности и различия в доходах.

## 3. Анализ продуктов и продаж   
3.1 Отобразите на диаграмме общую сумму продаж по месяцам и годам. Опишите наблюдается ли в данных сезональность (т.е. есть ли периоды в которых продажи регулярно падают или наоборот растут) и какой прослеживается тренд продаж.  
3.2 Создайте диаграммы, отображающие сумму продаж в разбивке по продуктам, категориям и годам. Определите топ 5 наиболее продаваемых продуктов.  Напишите обоснование своего решения.  
3.3 Визуализируйте распределение цены продуктов и коррелюцию цены и суммы продаж. Опишите свои наблюдения.  
3.4 Разбейте клиентскую базу на сегменты по частоте и общей сумме покупок. Визуализируйте полученные сегменты с помощью диаграмм. Напишите какой сегмент вы считаете наиболее приоритетным и обоснуйте своё решение.  


### 3.1. Общая сумма продаж по месяцам и годам


In [43]:
df_sales['order_date'] = pd.to_datetime(df_sales['order_date'])
df_sales['month'] = df_sales['order_date'].dt.to_period('M')
df_sales['year'] = df_sales['order_date'].dt.year

monthly_sales = df_sales.groupby('month')['sales_amount'].sum().reset_index()
monthly_sales['month'] = monthly_sales['month'].astype(str)

In [44]:
fig = px.line(monthly_sales, x='month', y='sales_amount', title='Общая сумма продаж по месяцам и годам')
fig.update_xaxes(title='Месяц')
fig.update_yaxes(title='Сумма продаж')
fig.show()

Выводы: визуализация продаж по месяцам и годам помогает выявить сезонность и долгосрочные тренды продаж.

### 3.2. Продажи по продуктам, категориям и годам


In [112]:
sales_product = df_sales.merge(df_products, left_on='product_key', right_on='product_key', how='left')

if 'product_category_key' in sales_product.columns and 'product_category_key' in df_product_category.columns:
    sales_product_category = sales_product.merge(df_product_category, left_on='product_category_key', right_on='product_category_key', how='left')
else:
    sales_product_category = sales_product

#### Сумма продаж по продуктам

In [111]:
product_sales = sales_product.groupby('product_name')['sales_amount'].sum().reset_index()
fig = px.bar(product_sales, x='product_name', y='sales_amount', title='Сумма продаж по продуктам')
fig.update_xaxes(title='Продукт')
fig.update_yaxes(title='Сумма продаж')
fig.show()

#### Сумма продаж по категориям

In [52]:
sales_product_category.head()

Unnamed: 0,product_key,order_date,order_date_key,customer_key,sales_territory_key,sales_order_number,sales_order_line_number,order_quantity,unit_price,extended_amount,unit_price_discount_pct,discount_amount,product_standard_cost,total_product_cost,sales_amount,tax_amt,freight,region_month_id,year,month,product_subcategory_key,product_name,standard_cost,color,safety_stock_level,list_price,size,size_range,weight,days_to_manufacture,product_line,dealer_price,class,model_name,description,start_date,end_date,status
0,528,2003-09-29,20030929,16115,4,SO55161,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest9,2003,2003-09,37,Mountain Tire Tube,1.8663,Unknown,500,4.99,Unknown,Unknown,15.13,0,M,2.994,Unknown,Mountain Tire Tube,Self-sealing tube.,2003-07-01,9999-12-31,Current
1,528,2003-10-01,20031001,15307,4,SO55352,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10,2003,2003-10,37,Mountain Tire Tube,1.8663,Unknown,500,4.99,Unknown,Unknown,15.13,0,M,2.994,Unknown,Mountain Tire Tube,Self-sealing tube.,2003-07-01,9999-12-31,Current
2,528,2003-10-05,20031005,16003,4,SO55578,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10,2003,2003-10,37,Mountain Tire Tube,1.8663,Unknown,500,4.99,Unknown,Unknown,15.13,0,M,2.994,Unknown,Mountain Tire Tube,Self-sealing tube.,2003-07-01,9999-12-31,Current
3,528,2003-10-06,20031006,15883,4,SO55635,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10,2003,2003-10,37,Mountain Tire Tube,1.8663,Unknown,500,4.99,Unknown,Unknown,15.13,0,M,2.994,Unknown,Mountain Tire Tube,Self-sealing tube.,2003-07-01,9999-12-31,Current
4,528,2003-10-08,20031008,15368,4,SO55767,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10,2003,2003-10,37,Mountain Tire Tube,1.8663,Unknown,500,4.99,Unknown,Unknown,15.13,0,M,2.994,Unknown,Mountain Tire Tube,Self-sealing tube.,2003-07-01,9999-12-31,Current


In [107]:
sales_price_correlation = sales_product.groupby('list_price')['sales_amount'].sum().reset_index()
fig = px.scatter(sales_price_correlation, x='list_price', y='sales_amount', title='Корреляция между ценой и суммой продаж')
fig.update_xaxes(title='Цена продукта')
fig.update_yaxes(title='Сумма продаж')
fig.show()

In [30]:
df_sales.head()

Unnamed: 0,product_key,order_date,order_date_key,customer_key,sales_territory_key,sales_order_number,sales_order_line_number,order_quantity,unit_price,extended_amount,unit_price_discount_pct,discount_amount,product_standard_cost,total_product_cost,sales_amount,tax_amt,freight,region_month_id,year,month
0,528,2003-09-29,20030929,16115,4,SO55161,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest9,2003,9
1,528,2003-10-01,20031001,15307,4,SO55352,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10,2003,10
2,528,2003-10-05,20031005,16003,4,SO55578,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10,2003,10
3,528,2003-10-06,20031006,15883,4,SO55635,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10,2003,10
4,528,2003-10-08,20031008,15368,4,SO55767,1,1,4.99,4.99,0.0,0.0,1.8663,1.8663,4.99,0.3992,0.1248,Southwest10,2003,10


In [31]:
df_product_category.head()

Unnamed: 0,product_category_key,product_category_alternate_key,english_product_category_name,spanish_product_category_name,french_product_category_name
0,1,1,Bikes,Bicicleta,Vélo
1,2,2,Components,Componente,Composant
2,3,3,Clothing,Prenda,Vêtements
3,4,4,Accessories,Accesorio,Accessoire


Выводы: анализ продаж по продуктам и категориям позволяет выявить наиболее популярные товары и категории для дальнейшего фокусирования маркетинговых усилий.

### 3.3. Распределение цен и корреляция с суммой продаж
#### Распределение цен

In [55]:
fig = px.histogram(df_products, x='list_price', nbins=20, title='Распределение цен продуктов')
fig.update_xaxes(title='Цена продукта')
fig.update_yaxes(title='Количество продуктов')
fig.show()

### Корреляция цены и суммы продаж

In [56]:
sales_price_correlation = sales_product.groupby('list_price')['sales_amount'].sum().reset_index()
fig = px.scatter(sales_price_correlation, x='list_price', y='sales_amount', title='Корреляция между ценой и суммой продаж')
fig.update_xaxes(title='Цена продукта')
fig.update_yaxes(title='Сумма продаж')
fig.show()

Выводы: анализ распределения цен и корреляции между ценой и объемом продаж помогает понять, как цена продукта влияет на спрос.

### 3.4. Сегментация клиентов

In [58]:
customer_sales = df_sales.groupby('customer_key').agg({'sales_amount': 'sum', 'order_date': 'count'}).reset_index()
customer_sales.columns = ['customer_key', 'total_sales', 'order_frequency']

fig = px.scatter(customer_sales, x='order_frequency', y='total_sales', title='Сегментация клиентов по частоте покупок и общей сумме')
fig.update_xaxes(title='Частота покупок')
fig.update_yaxes(title='Общая сумма покупок')
fig.show()

Выводы: сегментация клиентов по частоте и сумме покупок позволяет выделить наиболее ценных клиентов, на которых следует фокусировать усилия по удержанию и маркетингу.