<a href="https://colab.research.google.com/github/ancestor9/Data-Analyst-with-Gemini-/blob/main/9%EC%9D%BC%EC%B0%A8/AdventureSales_01_Obtain_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **[Tutorial: From Excel workbook to a Power BI report in Microsoft Teams](https://learn.microsoft.com/en-us/power-bi/create-reports/service-from-excel-to-stunning-report)**

## **1. 데이터 다운로드 하기**

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

>> #### 1.2. Linux Bash 명령어로 다운 받는 법

In [2]:
# Linux Bash 명령어
! wget https://github.com/microsoft/powerbi-desktop-samples/raw/main/AdventureWorks%20Sales%20Sample/AdventureWorks%20Sales.xlsx

--2025-07-03 03:57:23--  https://github.com/microsoft/powerbi-desktop-samples/raw/main/AdventureWorks%20Sales%20Sample/AdventureWorks%20Sales.xlsx
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/microsoft/powerbi-desktop-samples/main/AdventureWorks%20Sales%20Sample/AdventureWorks%20Sales.xlsx [following]
--2025-07-03 03:57:24--  https://raw.githubusercontent.com/microsoft/powerbi-desktop-samples/main/AdventureWorks%20Sales%20Sample/AdventureWorks%20Sales.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14322931 (14M) [application/octet-stream]
Saving to: ‘AdventureWorks Sales.xlsx’


2025-07

In [3]:
# prompt: AdventureWorks Sales.xlsx는 7개의 시트(고객, 상품, 지역, 판매 등)를 모두 읽고 사전형태로 데이터를 저장해 다오
# To read multiple sheets from a single Excel file, we will use the pandas library.
# Below is an example of how you can read all sheets from an Excel file into a dictionary of DataFrames.

import pandas as pd

# Function to read all sheets from an Excel file
def read_excel_sheets(excel_file):
    # Using sheet_name=None reads all sheets, each sheet as a DataFrame in a dictionary
    sheets_dict = pd.read_excel(excel_file, sheet_name=None)
    return sheets_dict

dfs = read_excel_sheets('/content/AdventureWorks Sales.xlsx')
dfs.keys()

dict_keys(['Sales Order_data', 'Sales Territory_data', 'Sales_data', 'Reseller_data', 'Date_data', 'Product_data', 'Customer_data'])

In [4]:
# prompt: dfs의 모든 시트를 csv파일로 각각 저장해줘

import os

# Create a directory to save the CSV files
output_dir = 'csv_sheets'
os.makedirs(output_dir, exist_ok=True)

# Iterate through the sheets and save each as a CSV file
for sheet_name, df in dfs.items():
  # Sanitize sheet names to be valid filenames
  sanitized_sheet_name = sheet_name.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')
  output_filename = os.path.join(output_dir, f'{sanitized_sheet_name}.csv')
  df.to_csv(output_filename, index=False)
  print(f"Saved sheet '{sheet_name}' to '{output_filename}'")

# List the created files (optional)
!ls csv_sheets

Saved sheet 'Sales Order_data' to 'csv_sheets/Sales Order_data.csv'
Saved sheet 'Sales Territory_data' to 'csv_sheets/Sales Territory_data.csv'
Saved sheet 'Sales_data' to 'csv_sheets/Sales_data.csv'
Saved sheet 'Reseller_data' to 'csv_sheets/Reseller_data.csv'
Saved sheet 'Date_data' to 'csv_sheets/Date_data.csv'
Saved sheet 'Product_data' to 'csv_sheets/Product_data.csv'
Saved sheet 'Customer_data' to 'csv_sheets/Customer_data.csv'
 Customer_data.csv   Reseller_data.csv	    'Sales Territory_data.csv'
 Date_data.csv	     Sales_data.csv
 Product_data.csv   'Sales Order_data.csv'


In [None]:
# prompt: tf의 자료형태, 결측치, 고유값, 대표갑 7개를 데이터프레임으로 보여주는 함수를 아주 간단하게 달라. 아주 간단하게, 주석은 뺴고

import pandas as pd
def summary(df):
    analysis_data = []
    for col in df.columns:
        dtype = df[col].dtype
        missing_count = df[col].isnull().sum()
        missing_percentage = missing_count / len(df) * 100
        unique_count = df[col].nunique()
        top_7_values = df[col].value_counts().nlargest(7).index.tolist()
        analysis_data.append([col, dtype, missing_count, missing_percentage, unique_count, top_7_values])
    analysis_df = pd.DataFrame(analysis_data, columns=['Column', 'DataType', 'MissingCount', 'missing_percentage', 'UniqueCount', 'Top7Values'])
    return analysis_df

In [10]:
# prompt: dfs의 모든 데이터프레임에  analyze_dataframe()함수를 적용하여 차례대로 보여줘

from IPython.display import display

for sheet_name, df in dfs.items():
    print(f"Analyzing DataFrame for sheet: {sheet_name}")
    analysis_result = summary(df)
    display(analysis_result)
    print("-" * 100) # Separator for better readability
    print("-" * 100) # Separator for better readability

Analyzing DataFrame for sheet: Sales Order_data


Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,Channel,object,0,0.0,2,"[Reseller, Internet]"
1,SalesOrderLineKey,int64,0,0.0,121253,"[75123003, 43659001, 43659002, 43659003, 43659..."
2,Sales Order,object,0,0.0,31455,"[SO51721, SO51739, SO51160, SO53465, SO47355, ..."
3,Sales Order Line,object,0,0.0,121253,"[SO75123 - 3, SO43659 - 1, SO43659 - 2, SO4365..."


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Analyzing DataFrame for sheet: Sales Territory_data


Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,SalesTerritoryKey,int64,0,0.0,11,"[1, 2, 3, 4, 5, 6, 7]"
1,Region,object,0,0.0,11,"[Northwest, Northeast, Central, Southwest, Sou..."
2,Country,object,0,0.0,7,"[United States, Canada, France, Germany, Austr..."
3,Group,object,0,0.0,4,"[North America, Europe, Pacific, Corporate HQ]"


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Analyzing DataFrame for sheet: Sales_data


Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,SalesOrderLineKey,int64,0,0.0,121253,"[75123003, 43659001, 43659002, 43659003, 43659..."
1,ResellerKey,int64,0,0.0,636,"[-1, 678, 54, 514, 118, 496, 175]"
2,CustomerKey,int64,0,0.0,18485,"[-1, 11185, 11300, 11277, 11262, 11287, 11176]"
3,ProductKey,int64,0,0.0,350,"[477, 480, 528, 225, 214, 222, 217]"
4,OrderDateKey,int64,0,0.0,1081,"[20200606, 20200615, 20190905, 20200602, 20200..."
5,DueDateKey,int64,0,0.0,1081,"[20200616, 20200625, 20190915, 20200612, 20200..."
6,ShipDateKey,float64,2113,1.742637,1074,"[20200613.0, 20190912.0, 20200609.0, 20200521...."
7,SalesTerritoryKey,int64,0,0.0,10,"[4, 6, 1, 9, 10, 7, 8]"
8,Order Quantity,int64,0,0.0,41,"[1, 2, 3, 4, 5, 6, 7]"
9,Unit Price,float64,0,0.0,275,"[4.99, 34.99, 8.99, 2.29, 469.794, 3.99, 419.4..."


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Analyzing DataFrame for sheet: Reseller_data


Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,ResellerKey,int64,0,0.0,702,"[701, -1, 1, 2, 3, 4, 5]"
1,Reseller ID,object,0,0.0,702,"[AW00000701, [Not Applicable], AW00000001, AW0..."
2,Business Type,object,0,0.0,4,"[Value Added Reseller, Warehouse, Specialty Bi..."
3,Reseller,object,0,0.0,700,"[Friendly Bike Shop, Sports Products Store, Pr..."
4,City,object,0,0.0,452,"[Toronto, London, Paris, Montreal, Calgary, Mi..."
5,State-Province,object,0,0.0,66,"[California, Ontario, Washington, England, Tex..."
6,Country-Region,object,0,0.0,7,"[United States, Canada, France, Australia, Ger..."
7,Postal Code,object,0,0.0,502,"[T2P 2G8, 33127, M4B 1V7, M4B 1V5, 78204, H1Y ..."


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Analyzing DataFrame for sheet: Date_data


Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,DateKey,int64,0,0.0,1461,"[20210630, 20170701, 20170702, 20170703, 20170..."
1,Date,datetime64[ns],0,0.0,1461,"[2021-06-30 00:00:00, 2017-07-01 00:00:00, 201..."
2,Fiscal Year,object,0,0.0,4,"[FY2020, FY2018, FY2019, FY2021]"
3,Fiscal Quarter,object,0,0.0,16,"[FY2018 Q1, FY2018 Q2, FY2019 Q2, FY2019 Q1, F..."
4,Month,object,0,0.0,48,"[2017 Jul, 2017 Aug, 2017 Oct, 2017 Dec, 2018 ..."
5,Full Date,object,0,0.0,1461,"[2021 Jun, 30, 2017 Jul, 01, 2017 Jul, 02, 201..."
6,MonthKey,int64,0,0.0,48,"[201707, 201708, 201710, 201712, 201801, 20180..."


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Analyzing DataFrame for sheet: Product_data


Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,ProductKey,int64,0,0.0,397,"[606, 210, 211, 212, 213, 214, 215]"
1,SKU,object,0,0.0,295,"[FR-R38B-44, FR-R38B-58, FR-R38B-62, FR-R38B-6..."
2,Product,object,0,0.0,295,"[LL Road Frame - Black, 44, LL Road Frame - Bl..."
3,Standard Cost,float64,0,0.0,134,"[413.1463, 486.7066, 868.6342, 461.4448, 199.8..."
4,Color,object,56,14.105793,9,"[Black, Red, Yellow, Silver, Blue, Multi, Silv..."
5,List Price,float64,0,0.0,120,"[699.0982, 782.99, 337.22, 306.5636, 1431.5, 5..."
6,Model,object,0,0.0,119,"[LL Road Frame, HL Road Frame, Road-650, HL Mo..."
7,Subcategory,object,0,0.0,37,"[Road Frames, Road Bikes, Mountain Frames, Mou..."
8,Category,object,0,0.0,4,"[Components, Bikes, Clothing, Accessories]"


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Analyzing DataFrame for sheet: Customer_data


Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,CustomerKey,int64,0,0.0,18485,"[29483, -1, 11000, 11001, 11002, 29467, 29466]"
1,Customer ID,object,0,0.0,18485,"[AW00029483, [Not Applicable], AW00011000, AW0..."
2,Customer,object,0,0.0,18401,"[Mohamed Pal, Alejandro Xu, Janet Torres, Kait..."
3,City,object,0,0.0,270,"[London, Paris, Burien, Concord, Bellingham, B..."
4,State-Province,object,0,0.0,54,"[California, Washington, England, British Colu..."
5,Country-Region,object,0,0.0,7,"[United States, Australia, United Kingdom, Fra..."
6,Postal Code,object,0,0.0,324,"[94519, 98168, 97005, 98225, 91910, 91950, V9]"


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


In [12]:
import pandas as pd

# 1. CSV 파일 로드
sales = pd.read_csv("/content/csv_sheets/Sales_data.csv")
sales_order = pd.read_csv("/content/csv_sheets/Sales Order_data.csv")
reseller = pd.read_csv("/content/csv_sheets/Reseller_data.csv")
product = pd.read_csv("/content/csv_sheets/Product_data.csv")
date = pd.read_csv("/content/csv_sheets/Date_data.csv")
customer = pd.read_csv("/content/csv_sheets/Customer_data.csv")
territory = pd.read_csv("/content/csv_sheets/Sales Territory_data.csv")

# 2. 병합 수행
merged = pd.merge(sales, sales_order, on='SalesOrderLineKey', how='left')
merged = pd.merge(merged, reseller, on='ResellerKey', how='left')
merged = pd.merge(merged, product, on='ProductKey', how='left')
merged = pd.merge(merged, date, left_on='OrderDateKey', right_on='DateKey', how='left')
merged = pd.merge(merged, customer, on='CustomerKey', how='left')
merged = pd.merge(merged, territory, on='SalesTerritoryKey', how='left')  # 중요 병합

# 3. 최종 결과 저장
# merged.to_csv("/content/csv_sheets/Merged_All_Data.csv", index=False)

merged

Unnamed: 0,SalesOrderLineKey,ResellerKey,CustomerKey,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,SalesTerritoryKey,Order Quantity,Unit Price,...,MonthKey,Customer ID,Customer,City_y,State-Province_y,Country-Region_y,Postal Code_y,Region,Country,Group
0,43659001,676,-1,349,20170702,20170712,20170709.0,5,1,2024.994,...,201707,[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],Southeast,United States,North America
1,43659002,676,-1,350,20170702,20170712,20170709.0,5,3,2024.994,...,201707,[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],Southeast,United States,North America
2,43659003,676,-1,351,20170702,20170712,20170709.0,5,1,2024.994,...,201707,[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],Southeast,United States,North America
3,43659004,676,-1,344,20170702,20170712,20170709.0,5,1,2039.994,...,201707,[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],Southeast,United States,North America
4,43659005,676,-1,345,20170702,20170712,20170709.0,5,1,2039.994,...,201707,[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],[Not Applicable],Southeast,United States,North America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121248,75122001,-1,15868,485,20200615,20200625,,6,1,21.980,...,202006,AW00015868,Caleb Lal,Sooke,British Columbia,Canada,V0,Canada,Canada,North America
121249,75122002,-1,15868,225,20200615,20200625,,6,1,8.990,...,202006,AW00015868,Caleb Lal,Sooke,British Columbia,Canada,V0,Canada,Canada,North America
121250,75123001,-1,18759,485,20200615,20200625,,6,1,21.980,...,202006,AW00018759,Devin Phillips,Sooke,British Columbia,Canada,V0,Canada,Canada,North America
121251,75123002,-1,18759,486,20200615,20200625,,6,1,159.000,...,202006,AW00018759,Devin Phillips,Sooke,British Columbia,Canada,V0,Canada,Canada,North America


In [13]:
merged.columns

Index(['SalesOrderLineKey', 'ResellerKey', 'CustomerKey', 'ProductKey',
       'OrderDateKey', 'DueDateKey', 'ShipDateKey', 'SalesTerritoryKey',
       'Order Quantity', 'Unit Price', 'Extended Amount',
       'Unit Price Discount Pct', 'Product Standard Cost',
       'Total Product Cost', 'Sales Amount', 'Channel', 'Sales Order',
       'Sales Order Line', 'Reseller ID', 'Business Type', 'Reseller',
       'City_x', 'State-Province_x', 'Country-Region_x', 'Postal Code_x',
       'SKU', 'Product', 'Standard Cost', 'Color', 'List Price', 'Model',
       'Subcategory', 'Category', 'DateKey', 'Date', 'Fiscal Year',
       'Fiscal Quarter', 'Month', 'Full Date', 'MonthKey', 'Customer ID',
       'Customer', 'City_y', 'State-Province_y', 'Country-Region_y',
       'Postal Code_y', 'Region', 'Country', 'Group'],
      dtype='object')

In [15]:
# prompt: merged_df의 Extended Amount, Unit Price Discount Pct 를 drop하라

merged_df = merged.drop(columns=['Extended Amount', 'Unit Price Discount Pct'])

In [18]:
# prompt: merged_df의 컬럼명이 "_y"로 끝나는 컬럼은 drop하고 "_x"로 끝나는 컬럼명은  "_x" 글자를 제거하라

# 컬럼 정리 (suffix 제거)
cols_to_drop = [col for col in merged_df.columns if col.endswith('_y')]
cols_to_rename = {col: col.replace('_x', '') for col in merged_df.columns if col.endswith('_x')}

merged_df = merged_df.drop(columns=cols_to_drop)
merged_df = merged_df.rename(columns=cols_to_rename)

summary(merged_df).sort_values(by='Column', ascending=False)

Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
9,Unit Price,float64,0,0.0,275,"[4.99, 34.99, 8.99, 2.29, 469.794, 3.99, 419.4..."
11,Total Product Cost,float64,0,0.0,1453,"[1.8663, 13.0863, 0.8565, 1.4923, 3.3623, 6.92..."
29,Subcategory,object,0,0.0,35,"[Road Bikes, Tires and Tubes, Mountain Bikes, ..."
20,State-Province,object,0,0.0,65,"[[Not Applicable], California, Ontario, Texas,..."
25,Standard Cost,float64,0,0.0,125,"[1.8663, 13.0863, 486.7066, 0.8565, 1481.9379,..."
6,ShipDateKey,float64,2113,1.742637,1074,"[20200613.0, 20190912.0, 20200609.0, 20200521...."
7,SalesTerritoryKey,int64,0,0.0,10,"[4, 6, 1, 9, 10, 7, 8]"
0,SalesOrderLineKey,int64,0,0.0,121253,"[75123003, 43659001, 43659002, 43659003, 43659..."
15,Sales Order Line,object,0,0.0,121253,"[SO75123 - 3, SO43659 - 1, SO43659 - 2, SO4365..."
14,Sales Order,object,0,0.0,31455,"[SO51721, SO51739, SO51160, SO53465, SO47355, ..."


In [23]:
# prompt: merged_df를 11개 컬럼으로 순회하면서 보여줘

# Iterate through columns in chunks of 15
num_cols = len(merged_df.columns)
for i in range(0, num_cols, 11):
    cols_to_show = merged_df.columns[i:i+11]
    display(merged_df[cols_to_show].head())
    print("-" * 100) # Separator for better readability

Unnamed: 0,SalesOrderLineKey,ResellerKey,CustomerKey,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,SalesTerritoryKey,Order Quantity,Unit Price,Product Standard Cost
0,43659001,676,-1,349,20170702,20170712,20170709.0,5,1,2024.994,1898.0944
1,43659002,676,-1,350,20170702,20170712,20170709.0,5,3,2024.994,1898.0944
2,43659003,676,-1,351,20170702,20170712,20170709.0,5,1,2024.994,1898.0944
3,43659004,676,-1,344,20170702,20170712,20170709.0,5,1,2039.994,1912.1544
4,43659005,676,-1,345,20170702,20170712,20170709.0,5,1,2039.994,1912.1544


----------------------------------------------------------------------------------------------------


Unnamed: 0,Total Product Cost,Sales Amount,Channel,Sales Order,Sales Order Line,Reseller ID,Business Type,Reseller,City,State-Province,Country-Region
0,1898.0944,2024.994,Reseller,SO43659,SO43659 - 1,AW00000676,Value Added Reseller,Better Bike Shop,Austell,Georgia,United States
1,5694.2832,6074.982,Reseller,SO43659,SO43659 - 2,AW00000676,Value Added Reseller,Better Bike Shop,Austell,Georgia,United States
2,1898.0944,2024.994,Reseller,SO43659,SO43659 - 3,AW00000676,Value Added Reseller,Better Bike Shop,Austell,Georgia,United States
3,1912.1544,2039.994,Reseller,SO43659,SO43659 - 4,AW00000676,Value Added Reseller,Better Bike Shop,Austell,Georgia,United States
4,1912.1544,2039.994,Reseller,SO43659,SO43659 - 5,AW00000676,Value Added Reseller,Better Bike Shop,Austell,Georgia,United States


----------------------------------------------------------------------------------------------------


Unnamed: 0,Postal Code,SKU,Product,Standard Cost,Color,List Price,Model,Subcategory,Category,DateKey,Date
0,30106,BK-M82B-42,"Mountain-100 Black, 42",1898.0944,Black,3374.99,Mountain-100,Mountain Bikes,Bikes,20170702,2017-07-02
1,30106,BK-M82B-44,"Mountain-100 Black, 44",1898.0944,Black,3374.99,Mountain-100,Mountain Bikes,Bikes,20170702,2017-07-02
2,30106,BK-M82B-48,"Mountain-100 Black, 48",1898.0944,Black,3374.99,Mountain-100,Mountain Bikes,Bikes,20170702,2017-07-02
3,30106,BK-M82S-38,"Mountain-100 Silver, 38",1912.1544,Silver,3399.99,Mountain-100,Mountain Bikes,Bikes,20170702,2017-07-02
4,30106,BK-M82S-42,"Mountain-100 Silver, 42",1912.1544,Silver,3399.99,Mountain-100,Mountain Bikes,Bikes,20170702,2017-07-02


----------------------------------------------------------------------------------------------------


Unnamed: 0,Fiscal Year,Fiscal Quarter,Month,Full Date,MonthKey,Customer ID,Customer,Region,Country,Group
0,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",201707,[Not Applicable],[Not Applicable],Southeast,United States,North America
1,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",201707,[Not Applicable],[Not Applicable],Southeast,United States,North America
2,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",201707,[Not Applicable],[Not Applicable],Southeast,United States,North America
3,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",201707,[Not Applicable],[Not Applicable],Southeast,United States,North America
4,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",201707,[Not Applicable],[Not Applicable],Southeast,United States,North America


----------------------------------------------------------------------------------------------------


In [24]:
# prompt: merged_df의 컬럼명에 Key 포함된 컬럼을 리스트로

key_columns = [col for col in merged_df.columns if 'Key' in col]
key_columns

['SalesOrderLineKey',
 'ResellerKey',
 'CustomerKey',
 'ProductKey',
 'OrderDateKey',
 'DueDateKey',
 'ShipDateKey',
 'SalesTerritoryKey',
 'DateKey',
 'MonthKey']

In [26]:
merged_df.columns

Index(['SalesOrderLineKey', 'ResellerKey', 'CustomerKey', 'ProductKey',
       'OrderDateKey', 'DueDateKey', 'ShipDateKey', 'SalesTerritoryKey',
       'Order Quantity', 'Unit Price', 'Product Standard Cost',
       'Total Product Cost', 'Sales Amount', 'Channel', 'Sales Order',
       'Sales Order Line', 'Reseller ID', 'Business Type', 'Reseller', 'City',
       'State-Province', 'Country-Region', 'Postal Code', 'SKU', 'Product',
       'Standard Cost', 'Color', 'List Price', 'Model', 'Subcategory',
       'Category', 'DateKey', 'Date', 'Fiscal Year', 'Fiscal Quarter', 'Month',
       'Full Date', 'MonthKey', 'Customer ID', 'Customer', 'Region', 'Country',
       'Group'],
      dtype='object')

In [29]:
matched_groups = [
    ['SalesOrderLineKey', 'Sales Order Line'],
    ['ResellerKey', 'Reseller ID'],
    ['CustomerKey', 'Customer ID'],
    ['ProductKey', 'SKU'],  # SKU = Stock Keeping Unit (제품 식별자)
    ['OrderDateKey'],
    ['DueDateKey'],
    ['ShipDateKey'],
    ['SalesTerritoryKey', 'Region'],  # 같은 맥락에서 지역 코드와 명칭일 수 있음
    ['DateKey', 'Date'],
    ['MonthKey', 'Month']
]


In [30]:
# prompt: matched_groups의 원소 리스트 별로 데이터프레임을 확ㅇ니하고 동일한 값을 서로 가졌는지 체크해줘

import pandas as pd
def check_group_consistency(df, group):
    """
    Checks if columns within a group in a DataFrame have consistent values.
    Assumes the first column in the group is the 'key' column and other columns
    should have the same value for each unique key.

    Args:
        df (pd.DataFrame): The input DataFrame.
        group (list): A list of column names representing the group.

    Returns:
        dict: A dictionary where keys are the key column values and values
              are dictionaries containing inconsistent columns and their
              unique values for that key. Returns an empty dictionary if all
              columns within the group are consistent for each key.
    """
    if not group or len(group) < 2:
        print(f"Group {group} has less than 2 columns. Skipping consistency check.")
        return {}

    key_col = group[0]
    other_cols = group[1:]

    if key_col not in df.columns:
        print(f"Key column '{key_col}' not found in the DataFrame. Skipping group {group}.")
        return {}

    inconsistent_data = {}

    # Group by the key column and check uniqueness of other columns
    grouped = df.groupby(key_col)
    for key_value, group_df in grouped:
        key_inconsistencies = {}
        for col in other_cols:
            if col not in group_df.columns:
                 print(f"Column '{col}' not found in the DataFrame for group {group}.")
                 continue

            unique_values = group_df[col].unique()
            if len(unique_values) > 1:
                key_inconsistencies[col] = unique_values.tolist()

        if key_inconsistencies:
            inconsistent_data[key_value] = key_inconsistencies

    return inconsistent_data

# Iterate through matched_groups and check consistency for each group in the merged_df
for group in matched_groups:
    print(f"\nChecking consistency for group: {group}")
    inconsistencies = check_group_consistency(merged_df, group)

    if not inconsistencies:
        print(f"All columns in group {group} are consistent for each unique {group[0]}.")
    else:
        print(f"Inconsistencies found in group {group}:")
        for key_value, cols_data in inconsistencies.items():
            print(f"  For {group[0]} = {key_value}:")
            for col, unique_vals in cols_data.items():
                print(f"    Column '{col}' has unique values: {unique_vals}")


Checking consistency for group: ['SalesOrderLineKey', 'Sales Order Line']
All columns in group ['SalesOrderLineKey', 'Sales Order Line'] are consistent for each unique SalesOrderLineKey.

Checking consistency for group: ['ResellerKey', 'Reseller ID']
All columns in group ['ResellerKey', 'Reseller ID'] are consistent for each unique ResellerKey.

Checking consistency for group: ['CustomerKey', 'Customer ID']
All columns in group ['CustomerKey', 'Customer ID'] are consistent for each unique CustomerKey.

Checking consistency for group: ['ProductKey', 'SKU']
All columns in group ['ProductKey', 'SKU'] are consistent for each unique ProductKey.

Checking consistency for group: ['OrderDateKey']
Group ['OrderDateKey'] has less than 2 columns. Skipping consistency check.
All columns in group ['OrderDateKey'] are consistent for each unique OrderDateKey.

Checking consistency for group: ['DueDateKey']
Group ['DueDateKey'] has less than 2 columns. Skipping consistency check.
All columns in group

In [31]:
# prompt: merged_df에서 key_columns를 제외한 컬럼만 df로 만들어줘

df = merged_df.drop(columns=key_columns)
df.head()


Unnamed: 0,Order Quantity,Unit Price,Product Standard Cost,Total Product Cost,Sales Amount,Channel,Sales Order,Sales Order Line,Reseller ID,Business Type,...,Date,Fiscal Year,Fiscal Quarter,Month,Full Date,Customer ID,Customer,Region,Country,Group
0,1,2024.994,1898.0944,1898.0944,2024.994,Reseller,SO43659,SO43659 - 1,AW00000676,Value Added Reseller,...,2017-07-02,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",[Not Applicable],[Not Applicable],Southeast,United States,North America
1,3,2024.994,1898.0944,5694.2832,6074.982,Reseller,SO43659,SO43659 - 2,AW00000676,Value Added Reseller,...,2017-07-02,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",[Not Applicable],[Not Applicable],Southeast,United States,North America
2,1,2024.994,1898.0944,1898.0944,2024.994,Reseller,SO43659,SO43659 - 3,AW00000676,Value Added Reseller,...,2017-07-02,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",[Not Applicable],[Not Applicable],Southeast,United States,North America
3,1,2039.994,1912.1544,1912.1544,2039.994,Reseller,SO43659,SO43659 - 4,AW00000676,Value Added Reseller,...,2017-07-02,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",[Not Applicable],[Not Applicable],Southeast,United States,North America
4,1,2039.994,1912.1544,1912.1544,2039.994,Reseller,SO43659,SO43659 - 5,AW00000676,Value Added Reseller,...,2017-07-02,FY2018,FY2018 Q1,2017 Jul,"2017 Jul, 02",[Not Applicable],[Not Applicable],Southeast,United States,North America


In [32]:
summary(df)

Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,Order Quantity,int64,0,0.0,41,"[1, 2, 3, 4, 5, 6, 7]"
1,Unit Price,float64,0,0.0,275,"[4.99, 34.99, 8.99, 2.29, 469.794, 3.99, 419.4..."
2,Product Standard Cost,float64,0,0.0,125,"[1.8663, 13.0863, 486.7066, 0.8565, 1481.9379,..."
3,Total Product Cost,float64,0,0.0,1453,"[1.8663, 13.0863, 0.8565, 1.4923, 3.3623, 6.92..."
4,Sales Amount,float64,0,0.0,1464,"[4.99, 34.99, 8.99, 2.29, 3.99, 21.98, 9.99]"
5,Channel,object,0,0.0,2,"[Reseller, Internet]"
6,Sales Order,object,0,0.0,31455,"[SO51721, SO51739, SO51160, SO53465, SO47355, ..."
7,Sales Order Line,object,0,0.0,121253,"[SO75123 - 3, SO43659 - 1, SO43659 - 2, SO4365..."
8,Reseller ID,object,0,0.0,636,"[[Not Applicable], AW00000678, AW00000054, AW0..."
9,Business Type,object,0,0.0,4,"[[Not Applicable], Warehouse, Value Added Rese..."


In [33]:
# prompt: Fiscal Year, Fiscal Quarter, Month, Customer ID, Reseller ID 는 모두 drop하라

df = df.drop(columns=['Fiscal Year', 'Fiscal Quarter', 'Month', 'Customer ID', 'Reseller ID'])
df.head()

Unnamed: 0,Order Quantity,Unit Price,Product Standard Cost,Total Product Cost,Sales Amount,Channel,Sales Order,Sales Order Line,Business Type,Reseller,...,List Price,Model,Subcategory,Category,Date,Full Date,Customer,Region,Country,Group
0,1,2024.994,1898.0944,1898.0944,2024.994,Reseller,SO43659,SO43659 - 1,Value Added Reseller,Better Bike Shop,...,3374.99,Mountain-100,Mountain Bikes,Bikes,2017-07-02,"2017 Jul, 02",[Not Applicable],Southeast,United States,North America
1,3,2024.994,1898.0944,5694.2832,6074.982,Reseller,SO43659,SO43659 - 2,Value Added Reseller,Better Bike Shop,...,3374.99,Mountain-100,Mountain Bikes,Bikes,2017-07-02,"2017 Jul, 02",[Not Applicable],Southeast,United States,North America
2,1,2024.994,1898.0944,1898.0944,2024.994,Reseller,SO43659,SO43659 - 3,Value Added Reseller,Better Bike Shop,...,3374.99,Mountain-100,Mountain Bikes,Bikes,2017-07-02,"2017 Jul, 02",[Not Applicable],Southeast,United States,North America
3,1,2039.994,1912.1544,1912.1544,2039.994,Reseller,SO43659,SO43659 - 4,Value Added Reseller,Better Bike Shop,...,3399.99,Mountain-100,Mountain Bikes,Bikes,2017-07-02,"2017 Jul, 02",[Not Applicable],Southeast,United States,North America
4,1,2039.994,1912.1544,1912.1544,2039.994,Reseller,SO43659,SO43659 - 5,Value Added Reseller,Better Bike Shop,...,3399.99,Mountain-100,Mountain Bikes,Bikes,2017-07-02,"2017 Jul, 02",[Not Applicable],Southeast,United States,North America


In [34]:
summary(df)

Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,Order Quantity,int64,0,0.0,41,"[1, 2, 3, 4, 5, 6, 7]"
1,Unit Price,float64,0,0.0,275,"[4.99, 34.99, 8.99, 2.29, 469.794, 3.99, 419.4..."
2,Product Standard Cost,float64,0,0.0,125,"[1.8663, 13.0863, 486.7066, 0.8565, 1481.9379,..."
3,Total Product Cost,float64,0,0.0,1453,"[1.8663, 13.0863, 0.8565, 1.4923, 3.3623, 6.92..."
4,Sales Amount,float64,0,0.0,1464,"[4.99, 34.99, 8.99, 2.29, 3.99, 21.98, 9.99]"
5,Channel,object,0,0.0,2,"[Reseller, Internet]"
6,Sales Order,object,0,0.0,31455,"[SO51721, SO51739, SO51160, SO53465, SO47355, ..."
7,Sales Order Line,object,0,0.0,121253,"[SO75123 - 3, SO43659 - 1, SO43659 - 2, SO4365..."
8,Business Type,object,0,0.0,4,"[[Not Applicable], Warehouse, Value Added Rese..."
9,Reseller,object,0,0.0,634,"[[Not Applicable], Vigorous Exercise Company, ..."


In [37]:
# prompt: df['Date']를 날짜로 변경하라

df['Date'] = pd.to_datetime(df['Date'])


In [38]:
# prompt: object 형태 자료는 category로 변환

# Select columns with 'object' dtype
object_cols = df.select_dtypes(include='object').columns

# Convert selected columns to 'category' dtype
for col in object_cols:
    df[col] = df[col].astype('category')

summary(df)

Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,Order Quantity,int64,0,0.0,41,"[1, 2, 3, 4, 5, 6, 7]"
1,Unit Price,float64,0,0.0,275,"[4.99, 34.99, 8.99, 2.29, 469.794, 3.99, 419.4..."
2,Product Standard Cost,float64,0,0.0,125,"[1.8663, 13.0863, 486.7066, 0.8565, 1481.9379,..."
3,Total Product Cost,float64,0,0.0,1453,"[1.8663, 13.0863, 0.8565, 1.4923, 3.3623, 6.92..."
4,Sales Amount,float64,0,0.0,1464,"[4.99, 34.99, 8.99, 2.29, 3.99, 21.98, 9.99]"
5,Channel,category,0,0.0,2,"[Reseller, Internet]"
6,Sales Order,category,0,0.0,31455,"[SO51739, SO51721, SO51160, SO53465, SO47355, ..."
7,Sales Order Line,category,0,0.0,121253,"[SO75123 - 3, SO43659 - 1, SO43659 - 10, SO436..."
8,Business Type,category,0,0.0,4,"[[Not Applicable], Warehouse, Value Added Rese..."
9,Reseller,category,0,0.0,634,"[[Not Applicable], Vigorous Exercise Company, ..."


In [40]:
# prompt: [Not Applicable] 이 포함된 값은 모두 'missing'으로 변환하라

df.replace('[Not Applicable]', 'Missing', inplace=True)
summary(df)

  df.replace('[Not Applicable]', 'Missing', inplace=True)


Unnamed: 0,Column,DataType,MissingCount,missing_percentage,UniqueCount,Top7Values
0,Order Quantity,int64,0,0.0,41,"[1, 2, 3, 4, 5, 6, 7]"
1,Unit Price,float64,0,0.0,275,"[4.99, 34.99, 8.99, 2.29, 469.794, 3.99, 419.4..."
2,Product Standard Cost,float64,0,0.0,125,"[1.8663, 13.0863, 486.7066, 0.8565, 1481.9379,..."
3,Total Product Cost,float64,0,0.0,1453,"[1.8663, 13.0863, 0.8565, 1.4923, 3.3623, 6.92..."
4,Sales Amount,float64,0,0.0,1464,"[4.99, 34.99, 8.99, 2.29, 3.99, 21.98, 9.99]"
5,Channel,category,0,0.0,2,"[Reseller, Internet]"
6,Sales Order,category,0,0.0,31455,"[SO51739, SO51721, SO51160, SO53465, SO47355, ..."
7,Sales Order Line,category,0,0.0,121253,"[SO75123 - 3, SO43659 - 1, SO43659 - 10, SO436..."
8,Business Type,category,0,0.0,4,"[Missing, Warehouse, Value Added Reseller, Spe..."
9,Reseller,category,0,0.0,634,"[Missing, Vigorous Exercise Company, Larger Cy..."


In [41]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [42]:
# prompt: /content/drive/MyDrive/01_남양주미래기술학교/자료  에 df를 저장해줘

output_drive_path = '/content/drive/MyDrive/01_남양주미래기술학교/자료'
os.makedirs(output_drive_path, exist_ok=True)

# Save the final processed DataFrame to the specified drive path
df.to_csv(os.path.join(output_drive_path, "processed_adventureworks_sales_data.csv"), index=False)
print(f"DataFrame saved to: {os.path.join(output_drive_path, 'processed_adventureworks_sales_data.csv')}")


DataFrame saved to: /content/drive/MyDrive/01_남양주미래기술학교/자료/processed_adventureworks_sales_data.csv


## **EDA, 시각화, RFM 분석**