# Stage-03: Python Fundamentals

The goal of this notebook is to practice some of the Python fundamentals we covered in class.

In [3]:
# Some initial import statements
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Read the starter dataset from disk into a DataFrame and show it
df = pd.read_csv(filepath_or_buffer='data/starter_data.csv')
df

Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05
5,C,30,2025-08-06
6,A,11,2025-08-07
7,B,14,2025-08-08
8,C,28,2025-08-09
9,A,13,2025-08-10


In [10]:
# Use .info() and .head() to understand the dataset.
# df.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() would emit a stray "None" after the report.
df.info()
# Preview the first five rows as the cell's displayed result.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 372.0+ bytes
None


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


In [11]:
# Overall descriptive statistics of the dataset, printed as plain text
overall_stats = df.describe()
print(overall_stats)
# Mean of 'value' within each 'category'; selecting with [['value']]
# keeps the aggregated result a DataFrame rather than a Series
grouped_values = df.groupby('category')[['value']]
average_by_category = grouped_values.mean()
average_by_category

           value
count  10.000000
mean   17.600000
std     7.381659
min    10.000000
25%    12.250000
50%    14.500000
75%    23.250000
max    30.000000


Unnamed: 0_level_0,value
category,Unnamed: 1_level_1
A,11.5
B,15.666667
C,27.666667


In [13]:
# Write the summary statistics of the dataset out to a CSV file.
summary_stats = df.describe()
# Make sure the output directory exists first. exist_ok=True creates it (and
# any missing parent directories) only when absent, avoiding a separate
# existence check that could race with something else creating it.
os.makedirs('data/processed', exist_ok=True)
summary_stats.to_csv('data/processed/summary.csv')

In [14]:
# The final step is to create a reusable utility function.
def get_summary_statistics(df):
    """
    Return the descriptive statistics of a DataFrame.

    Parameters:
    df (DataFrame): The DataFrame to summarise.

    Returns:
    DataFrame: The result of calling ``df.describe()`` on the input.
    """
    return df.describe()

In [16]:
# Call the helper on the loaded dataset; as the cell's last expression,
# the returned summary table is displayed as the cell output.
get_summary_statistics(df)

Unnamed: 0,value
count,10.0
mean,17.6
std,7.381659
min,10.0
25%,12.25
50%,14.5
75%,23.25
max,30.0
