In [1]:
import numpy as np

# Small 1-D integer array holding the values 1 through 5; later cells reuse it.
arr = np.arange(1, 6)

# NumPy ufuncs operate element-wise on the whole array in a single call.
print("square:", np.square(arr))
print("log:", np.log(arr))

square: [ 1  4  9 16 25]
log: [0.         0.69314718 1.09861229 1.38629436 1.60943791]


In [2]:
import time

# Time the vectorized NumPy expression.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can be too coarse (or
# even jump) at the microsecond scale measured here.
# NOTE(review): `arr` has only 5 elements, so a single run mostly measures
# call overhead; use %timeit on a much larger array for a meaningful comparison.
start = time.perf_counter()
result_vec = arr * 2 + 5
end = time.perf_counter()
print(f"Vectorized: {end - start:.6f}s")

# Time the equivalent element-by-element Python loop.
start = time.perf_counter()
result_loop = [x * 2 + 5 for x in arr]
end = time.perf_counter()
print(f"Loop: {end - start:.6f}s")

Vectorized: 0.000035s
Loop: 0.000034s


In [3]:
import pandas as pd
# Load the starter dataset, using its 'date' column as the row index.
# NOTE(review): no date parsing is requested, so the index holds plain strings;
# pass parse_dates=True if datetime operations are needed later.
df = pd.read_csv('../data/starter_data.csv', index_col='date')

print("=== data preview ===")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df.info()

# head() returns the first five ROWS, so label the section accordingly.
print("\n=== first 5 rows ===")
print(df.head())

=== data preview ===
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 2025-08-01 to 2025-08-10
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 240.0+ bytes
None

=== first 5 cols ===
           category  value
date                      
2025-08-01        A     10
2025-08-02        B     15
2025-08-03        A     12
2025-08-04        B     18
2025-08-05        C     25


In [4]:

# Descriptive statistics (count, mean, std, quartiles, ...) of the frame.
numeric_summary = df.describe()
print(numeric_summary)

# Column totals within each distinct value of 'category'.
grouped = df.groupby(by='category').sum()
print(grouped)

           value
count  10.000000
mean   17.600000
std     7.381659
min    10.000000
25%    12.250000
50%    14.500000
75%    23.250000
max    30.000000
          value
category       
A            46
B            47
C            83


In [5]:
# Persist both summary tables as CSV files under ../data/processed/.
from pathlib import Path

# DataFrame.to_csv() does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
PROCESSED_DIR = Path('../data/processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

numeric_summary.to_csv(PROCESSED_DIR / 'summary.csv')
grouped.to_csv(PROCESSED_DIR / 'grouped_summary.csv')


In [6]:
def get_summary_stats(dataframe, group_col=None) -> dict:
    """Collect descriptive statistics of a DataFrame as nested dictionaries.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    group_col : hashable, optional
        Label of a column to group by. When it is ``None`` or is not one of
        ``dataframe.columns``, no grouped statistics are produced.

    Returns
    -------
    dict
        ``'overall'``: ``dataframe.describe()`` converted with ``to_dict()``.
        ``'grouped'`` (present only when grouping was applied): per-group
        mean, median and std converted with ``to_dict()``. The aggregated
        frame has two-level columns, so its keys are ``(column, statistic)``
        tuples, each mapping to a ``{group: value}`` dictionary.
    """
    results = {'overall': dataframe.describe().to_dict()}

    # Compare against None instead of relying on truthiness, so that valid but
    # falsy column labels (e.g. the integer label 0) are still grouped on.
    if group_col is not None and group_col in dataframe.columns:
        per_group = dataframe.groupby(group_col).agg(['mean', 'median', 'std'])
        results['grouped'] = per_group.to_dict()

    return results


# Summarise the whole frame and, additionally, each 'category' group.
stats = get_summary_stats(df, group_col='category')
print(stats)

{'overall': {'value': {'count': 10.0, 'mean': 17.6, 'std': 7.381658952355418, 'min': 10.0, '25%': 12.25, '50%': 14.5, '75%': 23.25, 'max': 30.0}}, 'grouped': {('value', 'mean'): {'A': 11.5, 'B': 15.666666666666666, 'C': 27.666666666666668}, ('value', 'median'): {'A': 11.5, 'B': 15.0, 'C': 28.0}, ('value', 'std'): {'A': 1.2909944487358056, 'B': 2.0816659994661326, 'C': 2.516611478423583}}}
