![image-7.png](attachment:image-7.png)![image-9.png](attachment:image-9.png)![image-10.png](attachment:image-10.png)

In [0]:
import os
import pandas as pd

# --- Volume configuration (keep in sync with CELL 1/2) ---
CATALOG_NAME = "workspace"
SCHEMA_NAME = "default"
VOLUME_NAME = "course_data"

# Folder inside the Volume that holds the data files.
VOLUME_DATA_FOLDER = "BDA2_Data"

# 1. Build the full Volume path.
volume_base_path = f"/Volumes/{CATALOG_NAME}/{SCHEMA_NAME}/{VOLUME_NAME}/{VOLUME_DATA_FOLDER}"

# 2. File to read; replace the name if the deployed file is called something else.
file_name = "ab_data.csv"
file_path = os.path.join(volume_base_path, file_name)

print(f"尝试读取路径: {file_path}")

# 3. Read the CSV file and preview it.
try:
    df = pd.read_csv(file_path)
    print("✅ 数据读取成功！")
    print(df.head())
except FileNotFoundError:
    # Report the folder that was actually searched; the message used to name a
    # different folder ("BDA1") than the one the path is built from.
    print(f"❌ 错误：文件未找到。请检查 {VOLUME_DATA_FOLDER} 文件夹中是否存在该文件，或重新运行 CELL 1 确保数据已部署。")
except Exception as e:
    # Best-effort cell: any other read problem is reported instead of raised.
    print(f"❌ 读取文件时发生其他错误: {e}")

# 1 - Input

## csv

In [0]:
import pandas as pd

# Load the customer table from the course Volume and show it as the cell output.
df = pd.read_csv(filepath_or_buffer=r"/Volumes/workspace/default/course_data/BDA1_Data/Customer.csv")
df

In [0]:
#python

# Location of the tips dataset inside the course Volume.
csv_file = r"/Volumes/workspace/default/course_data/BDA1_Data/tips.csv"

# Parse the CSV into a DataFrame and preview its first five rows.
df_from_csv = pd.read_csv(filepath_or_buffer=csv_file)
df_from_csv.head(5)

## excel

In [0]:
# Note: The cell below installs the openpyxl package, which is required for pandas to read .xlsx Excel files. 
# This fixes the ImportError you encountered.

!pip install openpyxl

In [0]:
#python
# Workbook path; no sheet is named, so pandas reads its default (first) sheet.
excel_file = r"/Volumes/workspace/default/course_data/BDA1_Data/AdventureWorks Sales.xlsx"
df_from_excel = pd.read_excel(io=excel_file)
df_from_excel.head(5)

## database

In [0]:
# Disabled example: build a SQLAlchemy engine for a local SQL Server instance
# and read the 'tips' table into a DataFrame. Every line is commented out, so
# `engine_datahub` is NOT defined when the notebook runs.
# NOTE(review): the connection string embeds a user name and password in plain
# text; if this is re-enabled, read them from environment variables or a secret
# store instead.
# NOTE(review): "localhost\SQLEXPRESS" sits in a normal (non-raw) string and
# "\S" is an invalid escape sequence; use a raw string or "\\" when re-enabling.

# import pyodbc
# import urllib
# import sqlalchemy

# params_datahub = urllib.parse.quote_plus("DRIVER={SQL Server Native Client 11.0};"
#                                  "SERVER=localhost\SQLEXPRESS;"
#                                  "DATABASE=datahub;"
#                                  "UID=sa;"
#                                  "PWD=user1")

# engine_datahub = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect={}".format(params_datahub))


# df_sales_p=pd.read_sql_table('tips',engine_datahub)


# df_sales_p.head()

## internet

In [0]:
import requests

# Fetch the list of people currently in space from the Open Notify API.
# The timeout keeps the cell from hanging indefinitely if the service is unreachable.
response = requests.get("http://api.open-notify.org/astros.json", timeout=10)
print(response.status_code)
# Fail with a clear HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()

# Decode the body once and reuse it for the printout and the DataFrame.
payload = response.json()
print(payload)
res = pd.DataFrame(payload["people"])
res.head()

# 2 - Processing

In [0]:
# Reload the tips dataset that the processing examples below operate on.
csv_file = r"/Volumes/workspace/default/course_data/BDA1_Data/tips.csv"
tips = pd.read_csv(filepath_or_buffer=csv_file)



In [0]:
# Display the tips DataFrame as the cell output.
tips

## Select

In [0]:
#sql
# Column projection in SQL; the statement is a bare string shown for reference and is not executed.

'''SELECT total_bill, tip, smoker, time FROM tips;'''

In [0]:
#python
# Column projection spelled with .loc: all rows, the four listed columns.
tips.loc[:, ["total_bill", "tip", "smoker", "time"]]

In [0]:
#SAS
/* Rewrite tips in place, keeping only the four listed variables. */
data tips;
    set tips;
    keep total_bill tip smoker time;
run;


### Rename Column

In [0]:
#sql
# Kept as a string like the other SQL examples: raw SQL is not valid in a Python cell.
# The query adds total_bill_2 as a copy of total_bill alongside the original column.
'''select *,total_bill as total_bill_2
from tips;'''

In [0]:
#python
# Returns a renamed copy for display; tips itself is left unchanged.
tips.rename(columns=dict(total_bill="total_bill_2"))

In [0]:
#sas
/* Rewrite tips with total_bill renamed to total_bill_2. */
data tips;
    set tips;
    rename total_bill=total_bill_2;
run;

### Add Column

In [0]:
#sql
# Derived column in SQL; the statement is a bare string shown for reference and is not executed.
'''SELECT *, tip/total_bill as tip_rate
FROM tips;'''

In [0]:
#python
# Element-wise tip / total_bill, stored on tips as a new column.
tips["tip_rate"] = tips["tip"].div(tips["total_bill"])
# tips.assign(tip_rate=tips["tip"] / tips["total_bill"])

In [0]:
#SAS
/* Rewrite tips with a computed tip_rate variable (tip divided by total_bill). */
data tips;
    set tips;
    tip_rate = tip/total_bill;
run;

### Delete Column

In [0]:
#sql
# Drop-column counterpart of the Python and SAS cells in this section; kept as a
# string like the other SQL examples, so it is shown for reference and not executed.
'''ALTER TABLE tips DROP COLUMN tip_rate;'''

In [0]:
#python
# Remove the derived column again and rebind tips to the result.
tips = tips.drop(columns="tip_rate")
tips

In [0]:
#sas
/* Rewrite tips without the tip_rate variable. */
data tips;
    set tips;
    drop tip_rate;
run;

## Sort

In [0]:
#sql
# Kept as a string like the other SQL examples: raw SQL is not valid in a Python cell.
'''select * from tips
order by tip;'''


In [0]:
#python
# Rows ordered by tip, smallest first; returns a sorted copy for display.
tips.sort_values("tip", ascending=True)

In [0]:
#SAS
/* Sort tips in place by tip, matching the SQL and Python examples in this section. */
proc sort data=tips;
    by tip;
run;

## Where

In [0]:
#sql
# Row filter in SQL; the statement is a bare string shown for reference and is not executed.
'''SELECT * FROM tips WHERE time = 'Dinner';'''

In [0]:
#python
# Boolean-mask row filter: keep only the rows whose time is "Dinner".
tips.loc[tips["time"].eq("Dinner")]
# tips.query('time=="Dinner"')

In [0]:
#sas
/* Variant 1: subsetting IF - only rows with time="Dinner" are written back to tips. */
data tips;
    set tips;
    if time="Dinner";
run;

/* Variant 2: the same filter expressed with a WHERE statement. */
data tips;
    set tips;
    where time="Dinner";
    /* equivalent in this case - where happens before the
       DATA step begins and can also be used in PROC statements */
run;


### i - AND / OR

In [0]:
#sql
# Two conditions combined with AND; the statement is a bare string shown for reference and is not executed.
'''SELECT *
FROM tips
WHERE time = 'Dinner' AND tip > 5.00;'''

In [0]:
#python
# AND of two boolean masks: dinner rows whose tip exceeds 5.
tips.loc[tips["time"].eq("Dinner") & tips["tip"].gt(5.00)]

In [0]:
#python
# The same AND filter written as a query expression.
tips.query(expr='time=="Dinner" and tip>5')

In [0]:
#sql
# Two conditions combined with OR; the statement is a bare string shown for reference and is not executed.
'''SELECT *
FROM tips
WHERE size >= 5 OR total_bill > 45;'''

In [0]:
#python
# OR of two boolean masks: parties of at least 5, or bills above 45.
tips.loc[tips["size"].ge(5) | tips["total_bill"].gt(45)]

In [0]:
#python
# The same OR filter written as a query expression.
tips.query(expr='size>=5 or total_bill>45')

In [0]:
#sas
/* NOTE(review): this cell sits in the AND/OR section but filters on a single
   condition (total_bill > 10); it does not mirror the SQL/Python examples
   above - confirm whether a combined condition was intended. */
/* Variant 1: subsetting IF - only rows with total_bill > 10 are written back to tips. */
data tips;
    set tips;
    if total_bill > 10;
run;

/* Variant 2: the same filter expressed with a WHERE statement. */
data tips;
    set tips;
    where total_bill > 10;
    /* equivalent in this case - where happens before the
       DATA step begins and can also be used in PROC statements */
run;

### ii - NULL

In [0]:
import numpy as np

# Small frame with one missing value per column, used by the NULL examples.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
frame = pd.DataFrame(
    {"col1": ["A", "B", np.nan, "C", "D"], "col2": ["F", np.nan, "G", "H", "I"]}
)
frame

In [0]:
#sql
# NULL test in SQL; the statement is a bare string shown for reference and is not executed.
'''SELECT *
FROM frame
WHERE col2 IS NULL;'''

In [0]:
#python
# Rows where col2 is missing.
frame.loc[frame["col2"].isna()]

In [0]:
#sql
# NOT NULL test in SQL; the statement is a bare string shown for reference and is not executed.
'''SELECT *
FROM frame
WHERE col1 IS NOT NULL;'''

In [0]:
#python
# Rows where col1 is present.
frame.loc[frame["col1"].notna()]

In [0]:
#sas

/* Two small keyed data sets used to demonstrate joins and missing values. */
data df1;

input key $1 value 12.6;

datalines;
A  0.469112
B -0.282863
C -1.509059
D -1.135632
;

data df2;

input key $1 value 12.6;

datalines;
B  1.212112
D -0.173215
D  0.119209
E -1.044236
;

/* Match-merging requires both inputs to be sorted by the BY variable. */
proc sort data=df1;
    by key;
run;

proc sort data=df2;
    by key;
run;

/* Both inputs carry a variable named value, so each side is renamed at read
   time (value_x from df1, value_y from df2); otherwise df2 would overwrite
   df1's value and value_x would not exist for the steps below.
   BY key makes this a match-merge; without it MERGE pairs rows by position. */
data left_join inner_join right_join outer_join;
    merge df1(in=a rename=(value=value_x)) df2(in=b rename=(value=value_y));
    by key;

    if a and b then output inner_join;
    if a then output left_join;
    if b then output right_join;
    if a or b then output outer_join;
run;


/* Outer-join rows with no value from df1 (value_x is missing). */
data outer_join_nulls;
    set outer_join;
    if value_x = .;
run;

/* Outer-join rows that do have a value from df1. */
data outer_join_no_nulls;
    set outer_join;
    if value_x ^= .;
run;

## Case when 

In [0]:
#sql
# Kept as a string like the other SQL examples: raw SQL is not valid in a Python cell.
'''select *,
case when total_bill<10 then 'low' else 'high' end as bucket
from tips;'''


In [0]:
#python
# Vectorised if/else: "low" where total_bill is below 10, "high" everywhere else.
tips["bucket"] = np.where(tips["total_bill"].lt(10), "low", "high")
tips

In [0]:
#sas
/* Add a bucket variable: 'low' when total_bill < 10, otherwise 'high'. */
data tips;
    set tips;
    /* Declared before the first assignment; presumably so bucket is 4 characters
       wide and 'high' is not truncated to the length of 'low' - confirm. */
    format bucket $4.;

    if total_bill < 10 then bucket = 'low';
    else bucket = 'high';
run;

## Date

In [0]:
#sql
# Kept as a string like the other SQL examples: raw SQL is not valid in a Python cell.
# The trailing comma that used to precede FROM was removed; it is a SQL syntax error.

'''select '2013-01-15' as date1, '2015-02-15' as date2 , year('2013-01-15') as date1_year, month('2015-02-15') as date2_month
from tips;'''

In [0]:
#python

# Two constant timestamp columns to work with.
tips["date1"] = pd.Timestamp("2013-01-15")
tips["date2"] = pd.Timestamp("2015-02-15")

# Extract calendar components through the .dt accessor.
tips["date1_year"] = tips["date1"].dt.year
tips["date2_month"] = tips["date2"].dt.month

# Shift date1 forward by one MonthBegin offset.
tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin(1)

# Difference between the two dates at monthly period resolution.
tips["months_between"] = tips["date2"].dt.to_period(freq="M") - tips["date1"].dt.to_period(freq="M")

# Show only the date-related columns.
tips.loc[
    :, ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"]
]

In [0]:
#sas
/* Add two constant dates to tips, then derive year, month, a shifted date and a month count. */
data tips;
    set tips;
    format date1 date2 date1_next mmddyy10.;
    date1 = mdy(1, 15, 2013);
    date2 = mdy(2, 15, 2015);
    date1_year = year(date1);
    date2_month = month(date2);
    * shift date to beginning of next interval;
    date1_next = intnx('MONTH', date1, 1);
    * count intervals between dates;
    months_between = intck('MONTH', date1, date2);
run;


## String processing

In [0]:
#PYTHON
# Only the last expression of a cell is rendered, so the first result is shown
# explicitly with display(); previously it was computed and discarded.
# Length of each value, trailing whitespace included.
display(tips["time"].str.len())

# Length of each value after stripping trailing whitespace.
tips["time"].str.rstrip().str.len()

In [0]:
#SAS

/* Write two length measures of time to the log for every row of tips;
   _null_ means no output data set is created. */
data _null_;
set tips;
a=LENGTHN(time);
b=LENGTHC(time);
put a=;
put b=;
run;

In [0]:
#python
# Position of the first "ale" in each sex value (-1 where it does not occur).
tips.loc[:, "sex"].str.find("ale")

In [0]:
#sas
/* Write the result of FIND(sex,'ale') to the log for every row of tips;
   _null_ means no output data set is created. */
data _null_;
set tips;
a=FIND(sex,'ale');
put a=;
run;

In [0]:
#python
# First character of each sex value.
tips["sex"].str.slice(0, 1)

In [0]:
#SAS
/* Write the first character of sex to the log for every row of tips.
   PUT takes variables and literals rather than function calls, so the
   substring is stored in a variable first. */
data _null_;
set tips;
a=substr(sex,1,1);
put a;
run;

In [0]:
#python

# Two sample full names in a single column.
firstlast = pd.DataFrame({"String": ["John Smith", "Jane Cook"]})

# Column 0 of the left-to-right split is the first name ...
firstlast["First_Name"] = firstlast["String"].str.split(pat=" ", expand=True).iloc[:, 0]
# ... and column 1 of the right-to-left split is the last name.
firstlast["Last_Name"] = firstlast["String"].str.rsplit(pat=" ", expand=True).iloc[:, 1]

firstlast

In [0]:
#sas
/* Read two names from in-line data and take the first word and the last word
   of each with SCAN. */
data firstlast;
input String $60.;
First_Name = scan(string, 1);
Last_Name = scan(string, -1);*count from right;
datalines;
John Smith
Jane Cook
;
run;

In [0]:
#python
# Start from the two sample names, then add upper-, lower- and title-cased
# versions of the column in a single assign call (columns are added in this order).
firstlast = pd.DataFrame({"string": ["John Smith", "Jane Cook"]})

firstlast = firstlast.assign(
    upper=lambda names: names["string"].str.upper(),
    lower=lambda names: names["string"].str.lower(),
    title=lambda names: names["string"].str.title(),
)

firstlast

In [0]:
#sas
/* Read two names from in-line data and add upper-, lower- and proper-cased copies.
   DATALINES with a single ";" terminator is used, as in the name-splitting
   example; "datalines2" is not a SAS statement. */
data firstlast;
input String $60.;
string_up = UPCASE(string);
string_low = LOWCASE(string);
string_prop = PROPCASE(string);
datalines;
John Smith
Jane Cook
;
run;

## Group

In [0]:
#sql
# Row count per group in SQL; the statement is a bare string shown for reference and is not executed.
'''SELECT sex, count(*)
FROM tips
GROUP BY sex;'''

In [0]:
#python
# Number of rows in each sex group.
tips.groupby(by="sex").size()

In [0]:
#python
# Number of non-missing total_bill values in each sex group.
tips.groupby(by="sex")["total_bill"].count()

In [0]:
#sql
# Average and count per day in SQL; the statement is a bare string shown for reference and is not executed.
'''SELECT day, AVG(tip), COUNT(*)
FROM tips
GROUP BY day;'''

In [0]:
#python
# Mean tip and row count per day. Aggregations are named by string: passing
# np.mean / np.size as callables triggers a FutureWarning in recent pandas.
tips.groupby("day").agg({"tip": "mean", "day": "size"})

In [0]:
#sql
# Grouping by two columns in SQL; the statement is a bare string shown for reference and is not executed.
'''SELECT smoker, day, COUNT(*), AVG(tip)
FROM tips
GROUP BY smoker, day;'''

In [0]:
#python
# Row count and mean tip per (smoker, day). Aggregations are named by string:
# passing np.size / np.mean as callables triggers a FutureWarning in recent pandas.
tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})

In [0]:
#SAS

/* Sum total_bill and tip for each sex/smoker combination and store the result
   in tips_summed. */
proc summary data=tips nway;
    class sex smoker;
    var total_bill tip;
    output out=tips_summed sum=;
run;

/* Alternative: run the grouped count/average query directly with PROC SQL.
   (The bare word "or" that used to separate the two steps was not valid SAS.) */
PROC SQL;
SELECT smoker, day, COUNT(*), AVG(tip)
FROM tips
GROUP BY smoker, day;
QUIT;


In [0]:
#sas
/* BY-group processing needs the data sorted by the BY variables first. */
proc sort data=tips;
   by sex smoker;
run;

/* Keep the first row of each sex/smoker BY group in tips_first. */
data tips_first;
    set tips;
    by sex smoker;
    /* NOTE(review): FIRST.smoker should already be set whenever FIRST.sex is,
       so the FIRST.sex test looks redundant - confirm. */
    if FIRST.sex or FIRST.smoker then output;
run;

## Join

In [0]:
# Two small keyed frames for the join examples. A seeded generator makes the
# random values, and therefore every join output below, reproducible on re-run.
rng = np.random.default_rng(42)

df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value1": rng.standard_normal(4)})

df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value2": rng.standard_normal(4)})

In [0]:
#sql
# Inner join in SQL; the statement is a bare string shown for reference and is not executed.

'''SELECT *
FROM df1
INNER JOIN df2
  ON df1.key = df2.key;'''

In [0]:
#python
# Inner join on key: only keys present in both frames survive.
df1.merge(df2, how="inner", on="key")

In [0]:
#sql
# Left outer join in SQL; the statement is a bare string shown for reference and is not executed.
'''SELECT *
FROM df1
LEFT OUTER JOIN df2
  ON df1.key = df2.key;'''

In [0]:
#python
# Left join on key: every row of df1 is kept.
df1.merge(df2, how="left", on="key")

In [0]:
#sql
# Right outer join in SQL; the statement is a bare string shown for reference and is not executed.
'''SELECT *
FROM df1
RIGHT OUTER JOIN df2
  ON df1.key = df2.key;'''

In [0]:
#python

# Right join on key: every row of df2 is kept.
df1.merge(df2, how="right", on="key")

In [0]:
#SAS
/* Match-merging requires both inputs to be sorted by the BY variable. */
proc sort data=df1;
    by key;
run;

proc sort data=df2;
    by key;
run;

/* One pass produces all four join flavours; the IN= flags record which input
   contributed to the current row.
   BY key makes this a match-merge; without it MERGE pairs rows by position.
   The SAS df1/df2 both carry a variable named value, so each side is renamed
   at read time to stop df2 from overwriting df1's value. */
data left_join inner_join right_join outer_join;
    merge df1(in=a rename=(value=value1)) df2(in=b rename=(value=value2));
    by key;

    if a and b then output inner_join;
    if a then output left_join;
    if b then output right_join;
    if a or b then output outer_join;
run;

## Union

In [0]:
# Two small city rankings that overlap on Chicago, used by the UNION examples.
df1 = pd.DataFrame(dict(city=["Chicago", "San Francisco", "New York City"], rank=range(1, 4)))

df2 = pd.DataFrame(dict(city=["Chicago", "Boston", "Los Angeles"], rank=[1, 4, 5]))

# Render both frames, one after the other.
display(df1, df2)

In [0]:
#SQL
# The expected result stays inside the string as a SQL block comment; placed
# after the closing quotes, "/* ... */" is a Python syntax error.
'''SELECT city, rank
FROM df1
UNION ALL
SELECT city, rank
FROM df2;
/*
         city  rank
      Chicago     1
San Francisco     2
New York City     3
      Chicago     1
       Boston     4
  Los Angeles     5
*/'''

In [0]:
#PYTHON
# Stack the two frames row-wise; duplicate rows are kept (UNION ALL).
pd.concat(objs=[df1, df2], axis=0)

In [0]:
#SQL
# The note and expected result stay inside the string as SQL comments; placed
# after the closing quotes, "--" and "/* ... */" are Python syntax errors.

'''SELECT city, rank
FROM df1
UNION
SELECT city, rank
FROM df2;
-- notice that there is only one Chicago record this time
/*
         city  rank
      Chicago     1
San Francisco     2
New York City     3
       Boston     4
  Los Angeles     5
*/'''

In [0]:
#PYTHON
# Stack the two frames, then keep the first occurrence of each duplicated row (UNION).
pd.concat(objs=[df1, df2]).drop_duplicates(keep="first")

In [0]:
#SAS

/* Concatenate data sets row-wise (the UNION ALL step).
   NOTE(review): the SET statement lists TIPS twice, so tips is stacked on
   itself rather than two different tables - confirm this is intended. */
DATA TIPS_UNION;

SET TIPS TIPS;

RUN;


/* Sort on all variables and drop duplicate rows (the UNION step). */
PROC SORT DATA = TIPS_UNION NODUP;
BY _all_;
RUN;


# 3 - Output

## to csv

In [0]:
# Display the tips DataFrame as the cell output before exporting it.
tips

In [0]:
#python
# Write tips to tips2.csv in the current working directory (the index is written too).
tips.to_csv(path_or_buf="tips2.csv")

In [0]:
#SAS
/* Export the tips data set to tips2.csv in CSV format. */
proc export data=tips outfile='tips2.csv' dbms=csv;
run;

## to database

In [0]:
#python
# Write tips to the 'tips_output_3' table, replacing it if it already exists.
# The engine is only created in the database-input example, which is commented
# out, so the export is skipped with a message instead of failing with NameError.
if "engine_datahub" in globals():
    tips.to_sql('tips_output_3', engine_datahub, if_exists='replace', index=False)
else:
    print("engine_datahub is not defined - skipping the database export.")