In [4]:


# Step 1: NumPy Operations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

# Build a small integer array and derive two new arrays from it using
# elementwise arithmetic (no explicit Python loop).
arr = np.array([1, 2, 3, 4, 5])
arr_squared = arr * arr
arr_plus_ten = 10 + arr

# Show the input next to each derived array.
for label, values in (
    ("Original:", arr),
    ("Squared:", arr_squared),
    ("Plus Ten:", arr_plus_ten),
):
    print(label, values)

# Compare loop vs vectorized execution
import time

# time.perf_counter() is used rather than time.time(): it is a monotonic
# clock with the highest available resolution, intended for measuring short
# intervals, whereas time.time() is wall-clock time and can be too coarse
# (or adjusted by the system) for timings this small.
# NOTE: each variant is timed once, so the figures are indicative only.

# Loop: square each element in a Python-level list comprehension
start = time.perf_counter()
loop_squared = [x**2 for x in arr]
end = time.perf_counter()
print(f"Loop time: {end - start:.6f} sec")

# Vectorized: square the whole array in a single NumPy expression
start = time.perf_counter()
vectorized_squared = arr ** 2
end = time.perf_counter()
print(f"Vectorized time: {end - start:.6f} sec")

# Step 2: Dataset Loading
df = pd.read_csv("data/starter_data.csv")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print(df.head())

# Step 3: Summary Statistics
summary_stats = df.describe()
print(summary_stats)

# Group rows by the "category" column and average the numeric columns
# within each group (non-numeric columns are skipped via numeric_only=True).
grouped_stats = df.groupby("category").mean(numeric_only=True)
print(grouped_stats)

# Step 4: Save Outputs
import os

# The writers below do not create missing parent directories, so make sure
# the output folder exists before saving anything into it.
os.makedirs("data/processed", exist_ok=True)

# Save summary stats to CSV. The index is kept on purpose: for a describe()
# result it holds the statistic names (count, mean, std, ...); writing with
# index=False would leave rows of unlabeled numbers.
summary_stats.to_csv("data/processed/summary.csv")

# Save summary stats to JSON
summary_stats.to_json("data/processed/summary.json")

# Bonus: Create and save a basic plot of the "value" column
df["value"].hist()
plt.title("Histogram of value")
plt.savefig("data/processed/histogram.png")
# Close the figure so later plots do not draw on top of this one.
plt.close()

# Step 5: Reusable Functions
def get_summary_stats(df):
    """Compute descriptive statistics for ``df``.

    Delegates to ``df.describe()`` with its default arguments and returns
    the result unchanged.
    """
    stats = df.describe()
    return stats

# Bonus: You could move the function to src/utils.py
# and then import it:
# from src.utils import get_summary_stats


Original: [1 2 3 4 5]
Squared: [ 1  4  9 16 25]
Plus Ten: [11 12 13 14 15]
Loop time: 0.000152 sec
Vectorized time: 0.000082 sec
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 372.0+ bytes
None
  category  value        date
0        A     10  2025-08-01
1        B     15  2025-08-02
2        A     12  2025-08-03
3        B     18  2025-08-04
4        C     25  2025-08-05
           value
count  10.000000
mean   17.600000
std     7.381659
min    10.000000
25%    12.250000
50%    14.500000
75%    23.250000
max    30.000000
              value
category           
A         11.500000
B         15.666667
C         27.666667
