# **[Tutorial: From Excel workbook to a Power BI report in Microsoft Teams](https://learn.microsoft.com/en-us/power-bi/create-reports/service-from-excel-to-stunning-report)**

<img src='https://learn.microsoft.com/en-us/power-bi/create-reports/media/service-from-excel-to-stunning-report/power-bi-financial-report-service.png'>

### **[Sample data? Download here](https://github.com/microsoft/powerbi-desktop-samples/blob/main/AdventureWorks%20Sales%20Sample/AdventureWorks%20Sales.xlsx)**

>> #### 3. Windows 환경에서 Python으로 다운로드하는 방법

In [1]:
## How to download the sample workbook from Python (intended for non-Linux,
## i.e. Windows, environments).
## NOTE: this shells out to `wget`, so a `wget` executable must be on PATH.
import subprocess

# URL of the file to download
url = "https://github.com/microsoft/powerbi-desktop-samples/raw/main/AdventureWorks%20Sales%20Sample/AdventureWorks%20Sales.xlsx"

# Run the download command. check=True makes a non-zero wget exit status raise
# CalledProcessError here instead of being silently ignored and only showing
# up later as a missing-file error when the workbook is read.
subprocess.run(["wget", url], check=True)

CompletedProcess(args=['wget', 'https://github.com/microsoft/powerbi-desktop-samples/raw/main/AdventureWorks%20Sales%20Sample/AdventureWorks%20Sales.xlsx'], returncode=0)

In [3]:
import pandas as pd

# sheet_name=None asks pandas for every worksheet at once; the result is a
# dict keyed by sheet name with one DataFrame per sheet.
dfs = pd.read_excel(
    io='/content/AdventureWorks Sales.xlsx',
    sheet_name=None,
)

In [4]:
# Show the keys of `dfs`, i.e. the worksheet names loaded from the workbook.
dfs.keys()

dict_keys(['Sales Order_data', 'Sales Territory_data', 'Sales_data', 'Reseller_data', 'Date_data', 'Product_data', 'Customer_data'])

<img src='https://miro.medium.com/v2/resize:fit:1100/format:webp/0*9UNgxNIu8-HgaxLa.png'>

## **데이터전처리**

In [5]:
def clean_data(dfs):
    """Return a cleaned copy of every DataFrame in ``dfs``.

    Each frame is processed in three steps: duplicate rows are removed,
    the placeholder string '[Not Applicable]' is turned into a missing
    value, and every row that then contains a missing value is dropped.

    Args:
        dfs: Mapping of sheet name -> pandas DataFrame.

    Returns:
        A new dict with the same keys (in the same order) mapping to the
        cleaned DataFrames. The input mapping itself is not modified.
    """
    def _clean(frame):
        deduplicated = frame.drop_duplicates()
        # Treat the placeholder as missing so dropna() removes those rows too.
        with_missing = deduplicated.replace('[Not Applicable]', pd.NA)
        return with_missing.dropna()

    return {name: _clean(frame) for name, frame in dfs.items()}

In [9]:
def eda(df):
    """Build a per-column summary table for ``df``.

    Args:
        df: The pandas DataFrame to profile.

    Returns:
        A DataFrame indexed by the column names of ``df`` with the columns:

        * '자료 내용(contents)': the array returned by ``Series.unique()``.
        * '데이터형태(dtypes)': the column dtype.
        * '고유값 수(nunique)': the length of that unique-value array.
        * '결측치 비율(%)': share of missing cells as a percentage string,
          rounded to two decimals (e.g. ``'25.0%'``).
        * 'nan 비율': integer percentage (truncated) of cells equal to the
          literal string ``'nan'``.
    """
    total = len(df)

    def _percent(count):
        # An empty frame has no rows to divide by; report 0 instead of
        # raising ZeroDivisionError.
        return count / total * 100 if total else 0.0

    # Compute the unique values once and reuse them for the count column.
    uniques = {col: df[col].unique() for col in df}

    return pd.DataFrame({
        '자료 내용(contents)': uniques,
        '데이터형태(dtypes)': {col: df[col].dtype for col in df},
        '고유값 수(nunique)': {col: len(values) for col, values in uniques.items()},
        # The column is labelled "%", so scale the fraction by 100 before
        # formatting; previously a raw fraction was shown with a '%' sign.
        '결측치 비율(%)': {
            col: str(round(_percent(int(df[col].isna().sum())), 2)) + '%'
            for col in df
        },
        'nan 비율': {
            col: int(_percent(int((df[col] == 'nan').sum())))
            for col in df
        },
    })

In [10]:
# Clean every sheet and keep the results in `dfs1`, leaving `dfs` untouched.
dfs1 = clean_data(dfs)

In [13]:
# Render the EDA summary table of each cleaned sheet, one after another.
for sheet_df in dfs1.values():
    display(eda(sheet_df))

Unnamed: 0,자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율
Channel,"[Reseller, Internet]",object,2,0.0%,0
SalesOrderLineKey,"[43659001, 43659002, 43659003, 43659004, 43659...",int64,121253,0.0%,0
Sales Order,"[SO43659, SO43660, SO43661, SO43662, SO43663, ...",object,31455,0.0%,0
Sales Order Line,"[SO43659 - 1, SO43659 - 2, SO43659 - 3, SO4365...",object,121253,0.0%,0


Unnamed: 0,자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율
SalesTerritoryKey,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",int64,11,0.0%,0
Region,"[Northwest, Northeast, Central, Southwest, Sou...",object,11,0.0%,0
Country,"[United States, Canada, France, Germany, Austr...",object,7,0.0%,0
Group,"[North America, Europe, Pacific, Corporate HQ]",object,4,0.0%,0


Unnamed: 0,자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율
SalesOrderLineKey,"[43659001, 43659002, 43659003, 43659004, 43659...",int64,119140,0.0%,0
ResellerKey,"[676, 117, 442, 227, 510, 397, 146, 511, 646, ...",int64,633,0.0%,0
CustomerKey,"[-1, 21768, 28389, 25863, 14501, 11003, 27645,...",int64,18227,0.0%,0
ProductKey,"[349, 350, 351, 344, 345, 346, 347, 229, 235, ...",int64,350,0.0%,0
OrderDateKey,"[20170702, 20170705, 20170707, 20170709, 20170...",int64,1074,0.0%,0
DueDateKey,"[20170712, 20170715, 20170717, 20170719, 20170...",int64,1074,0.0%,0
ShipDateKey,"[20170709.0, 20170712.0, 20170714.0, 20170716....",float64,1074,0.0%,0
SalesTerritoryKey,"[5, 6, 4, 1, 3, 2, 10, 7, 9, 8]",int64,10,0.0%,0
Order Quantity,"[1, 3, 2, 6, 4, 5, 7, 9, 8, 10, 12, 13, 21, 14...",int64,41,0.0%,0
Unit Price,"[2024.994, 2039.994, 28.8404, 5.7, 5.1865, 20....",float64,275,0.0%,0


Unnamed: 0,자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율
ResellerKey,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",int64,701,0.0%,0
Reseller ID,"[AW00000001, AW00000002, AW00000003, AW0000000...",object,701,0.0%,0
Business Type,"[Value Added Reseller, Specialty Bike Shop, Wa...",object,3,0.0%,0
Reseller,"[A Bike Store, Progressive Sports, Advanced Bi...",object,699,0.0%,0
City,"[Seattle, Renton, Irving, Austin, Fremont, Cam...",object,451,0.0%,0
State-Province,"[Washington, Texas, California, Utah, Florida,...",object,65,0.0%,0
Country-Region,"[United States, Canada, France, Germany, Austr...",object,6,0.0%,0
Postal Code,"[98104, 98055, 75061, 78701, 94536, 93010, 841...",object,501,0.0%,0


Unnamed: 0,자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율
DateKey,"[20170701, 20170702, 20170703, 20170704, 20170...",int64,1461,0.0%,0
Date,"[2017-07-01 00:00:00, 2017-07-02 00:00:00, 201...",datetime64[ns],1461,0.0%,0
Fiscal Year,"[FY2018, FY2019, FY2020, FY2021]",object,4,0.0%,0
Fiscal Quarter,"[FY2018 Q1, FY2018 Q2, FY2018 Q3, FY2018 Q4, F...",object,16,0.0%,0
Month,"[2017 Jul, 2017 Aug, 2017 Sep, 2017 Oct, 2017 ...",object,48,0.0%,0
Full Date,"[2017 Jul, 01, 2017 Jul, 02, 2017 Jul, 03, 201...",object,1461,0.0%,0
MonthKey,"[201707, 201708, 201709, 201710, 201711, 20171...",int64,48,0.0%,0


Unnamed: 0,자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율
ProductKey,"[210, 211, 212, 213, 214, 215, 216, 217, 218, ...",int64,341,0.0%,0
SKU,"[FR-R92B-58, FR-R92R-58, HL-U509-R, HL-U509, S...",object,245,0.0%,0
Product,"[HL Road Frame - Black, 58, HL Road Frame - Re...",object,245,0.0%,0
Standard Cost,"[868.6342, 12.0278, 13.8782, 13.0863, 3.3963, ...",float64,95,0.0%,0
Color,"[Black, Red, White, Blue, Multi, Silver, Yello...",object,9,0.0%,0
List Price,"[1431.5, 33.6442, 34.99, 9.5, 8.6442, 8.99, 48...",float64,84,0.0%,0
Model,"[HL Road Frame, Sport-100, Mountain Bike Socks...",object,69,0.0%,0
Subcategory,"[Road Frames, Helmets, Socks, Caps, Jerseys, M...",object,23,0.0%,0
Category,"[Components, Accessories, Clothing, Bikes]",object,4,0.0%,0


Unnamed: 0,자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율
CustomerKey,"[11000, 11001, 11002, 11003, 11004, 11005, 110...",int64,18484,0.0%,0
Customer ID,"[AW00011000, AW00011001, AW00011002, AW0001100...",object,18484,0.0%,0
Customer,"[Jon Yang, Eugene Huang, Ruben Torres, Christy...",object,18400,0.0%,0
City,"[Rockhampton, Seaford, Hobart, North Ryde, Wol...",object,269,0.0%,0
State-Province,"[Queensland, Victoria, Tasmania, New South Wal...",object,53,0.0%,0
Country-Region,"[Australia, United States, Canada, Germany, Un...",object,6,0.0%,0
Postal Code,"[4700, 3198, 7001, 2113, 2500, 4169, 2036, 328...",object,323,0.0%,0


In [56]:
# Stack the eda() summary of every sheet into one table, tagging each summary
# with the sheet it came from in a trailing 'filename' column.
dfs1 = clean_data(dfs)
dfs1_eda = pd.concat(
    [
        eda(sheet_df).assign(filename=sheet_name)
        for sheet_name, sheet_df in dfs1.items()
    ]
)

# Show the combined table with the field name as a regular column, sorted by it.
(
    dfs1_eda
    .reset_index()
    .rename(columns={'index': '컬럼명(field)'})
    .sort_values('컬럼명(field)')
)

Unnamed: 0,컬럼명(field),자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율,filename
25,Business Type,"[Value Added Reseller, Specialty Bike Shop, Wa...",object,3,0.0%,0,Reseller_data
46,Category,"[Components, Accessories, Clothing, Bikes]",object,4,0.0%,0,Product_data
0,Channel,"[Reseller, Internet]",object,2,0.0%,0,Sales Order_data
27,City,"[Seattle, Renton, Irving, Austin, Fremont, Cam...",object,451,0.0%,0,Reseller_data
50,City,"[Rockhampton, Seaford, Hobart, North Ryde, Wol...",object,269,0.0%,0,Customer_data
42,Color,"[Black, Red, White, Blue, Multi, Silver, Yello...",object,9,0.0%,0,Product_data
6,Country,"[United States, Canada, France, Germany, Austr...",object,7,0.0%,0,Sales Territory_data
29,Country-Region,"[United States, Canada, France, Germany, Austr...",object,6,0.0%,0,Reseller_data
52,Country-Region,"[Australia, United States, Canada, Germany, Un...",object,6,0.0%,0,Customer_data
49,Customer,"[Jon Yang, Eugene Huang, Ruben Torres, Christy...",object,18400,0.0%,0,Customer_data


In [51]:
# To inspect a single field in the combined table, filter on its name, e.g.:
# data[data['컬럼명(field)'] == 'State-Province']

# Find the field names that appear in more than one table.
data = (
    dfs1_eda
    .reset_index()
    .rename(columns={'index': '컬럼명(field)'})
    .sort_values(by='컬럼명(field)')
)
# keep=False flags every occurrence of a repeated field name, not just the later ones.
data.loc[data['컬럼명(field)'].duplicated(keep=False)]

Unnamed: 0,컬럼명(field),자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율,filename
27,City,"[Seattle, Renton, Irving, Austin, Fremont, Cam...",object,451,0.0%,0,Reseller_data
50,City,"[Rockhampton, Seaford, Hobart, North Ryde, Wol...",object,269,0.0%,0,Customer_data
29,Country-Region,"[United States, Canada, France, Germany, Austr...",object,6,0.0%,0,Reseller_data
52,Country-Region,"[Australia, United States, Canada, Germany, Un...",object,6,0.0%,0,Customer_data
47,CustomerKey,"[11000, 11001, 11002, 11003, 11004, 11005, 110...",int64,18484,0.0%,0,Customer_data
10,CustomerKey,"[-1, 21768, 28389, 25863, 14501, 11003, 27645,...",int64,18227,0.0%,0,Sales_data
30,Postal Code,"[98104, 98055, 75061, 78701, 94536, 93010, 841...",object,501,0.0%,0,Reseller_data
53,Postal Code,"[4700, 3198, 7001, 2113, 2500, 4169, 2036, 328...",object,323,0.0%,0,Customer_data
11,ProductKey,"[349, 350, 351, 344, 345, 346, 347, 229, 235, ...",int64,350,0.0%,0,Sales_data
38,ProductKey,"[210, 211, 212, 213, 214, 215, 216, 217, 218, ...",int64,341,0.0%,0,Product_data


In [75]:
# Check "CustomerKey", the column shared by the Customer_data and Sales_data
# tables: print how many distinct keys Customer_data has, then how many of
# them also occur in Sales_data.
print(len(dfs1['Customer_data']['CustomerKey'].unique()))
CustomerKey = set(dfs1['Customer_data']['CustomerKey'].unique()).intersection(
    dfs1['Sales_data']['CustomerKey'].unique()
)
print(len(CustomerKey))

18484
18226


In [78]:
# Display the combined EDA table (still indexed by field name).
dfs1_eda

Unnamed: 0,자료 내용(contents),데이터형태(dtypes),고유값 수(nunique),결측치 비율(%),nan 비율,filename
Channel,"[Reseller, Internet]",object,2,0.0%,0,Sales Order_data
SalesOrderLineKey,"[43659001, 43659002, 43659003, 43659004, 43659...",int64,121253,0.0%,0,Sales Order_data
Sales Order,"[SO43659, SO43660, SO43661, SO43662, SO43663, ...",object,31455,0.0%,0,Sales Order_data
Sales Order Line,"[SO43659 - 1, SO43659 - 2, SO43659 - 3, SO4365...",object,121253,0.0%,0,Sales Order_data
SalesTerritoryKey,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",int64,11,0.0%,0,Sales Territory_data
Region,"[Northwest, Northeast, Central, Southwest, Sou...",object,11,0.0%,0,Sales Territory_data
Country,"[United States, Canada, France, Germany, Austr...",object,7,0.0%,0,Sales Territory_data
Group,"[North America, Europe, Pacific, Corporate HQ]",object,4,0.0%,0,Sales Territory_data
SalesOrderLineKey,"[43659001, 43659002, 43659003, 43659004, 43659...",int64,119140,0.0%,0,Sales_data
ResellerKey,"[676, 117, 442, 227, 510, 397, 146, 511, 646, ...",int64,633,0.0%,0,Sales_data


## **Practice**

### **[Adventure Works 2022 CSVs](https://www.kaggle.com/datasets/algorismus/adventure-works-in-excel-tables?select=Product.csv)**

> #### ANALYSIS OF ADVENTURE WORKS SALES PERFORMANCE: POWER BI

## <font color='blue'> **1. Sales Overview** </font>
<img src='https://miro.medium.com/v2/resize:fit:1100/format:webp/0*jJu_KSKCezRX9Of6.png'>

## <font color='blue'> **2. Customer Details** </font>
<img src='https://miro.medium.com/v2/resize:fit:1100/format:webp/0*78zgpVNLKw3EGobX.png'>

## <font color='blue'> **3. Product Details** </font>
<img src='https://miro.medium.com/v2/resize:fit:1400/format:webp/0*Ljg3YxEaKGAgfFV6.png'>

## <font color='blue'> **4. Sales Map** </font>
<img src='https://miro.medium.com/v2/resize:fit:1100/format:webp/0*eXO485qjTIBD9qRx.png'>