# Summary statistics

In [1]:
import pandas as pd
# Load the sales records from CSV into the `sales` DataFrame
sales = pd.read_csv(filepath_or_buffer='datasets/sales.csv')

## Mean and median

In [2]:
# Print the head of the sales DataFrame
print(sales.head())

# Show the info about the sales DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would add a stray "None" line
# after the report.
sales.info()

# Print the mean of weekly_sales
print(sales['weekly_sales'].mean())

# Print the median of weekly_sales
print(sales['weekly_sales'].median())

   store type  department        date  weekly_sales  is_holiday  \
0      1    A           1  2010-02-05      24924.50       False   
1      1    A           1  2010-03-05      21827.90       False   
2      1    A           1  2010-04-02      57258.43       False   
3      1    A           1  2010-05-07      17413.94       False   
4      1    A           1  2010-06-04      17558.09       False   

   temperature_c  fuel_price_usd_per_l  unemployment  
0       5.727778              0.679451         8.106  
1       8.055556              0.693452         8.106  
2      16.816667              0.718284         7.808  
3      22.527778              0.748928         7.808  
4      27.050000              0.714586         7.808  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10774 entries, 0 to 10773
Data columns (total 9 columns):
store                   10774 non-null int64
type                    10774 non-null object
department              10774 non-null int64
date                   

## Summarizing dates


In [3]:
# Print the maximum of the date column, then the minimum, one value per line
print(sales['date'].max(), sales['date'].min(), sep='\n')

2012-10-26
2010-02-05


## Efficient summaries


In [4]:
# A custom IQR function
def iqr(column):
    """Return the interquartile range of ``column``.

    Parameters
    ----------
    column
        Any object exposing a ``quantile(q)`` method; in this notebook it is
        a pandas Series handed over by ``.agg``.

    Returns
    -------
    The 0.75 quantile of ``column`` minus its 0.25 quantile.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile
    
# Summarize temperature_c with the custom iqr function via .agg and print it
print(sales['temperature_c'].agg(func=iqr))

16.58333333333334


## Cumulative statistics

In [7]:
# Build sales_1_1 in one chain: load the CSV, order the rows by date, then
# append the running total and the running maximum of weekly_sales.
sales_1_1 = (
    pd.read_csv('datasets/sales_1_1.csv')
    .sort_values('date')
    # cumulative sum of weekly_sales, in date order
    .assign(cum_weekly_sales=lambda frame: frame['weekly_sales'].cumsum())
    # cumulative max of weekly_sales, in date order
    .assign(cum_max_sales=lambda frame: frame['weekly_sales'].cummax())
)

# Show the source column next to the two derived columns
print(sales_1_1[["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]])

          date  weekly_sales  cum_weekly_sales  cum_max_sales
1   2010-02-05      24924.50          24924.50       24924.50
8   2010-03-05      21827.90          46752.40       24924.50
11  2010-04-02      57258.43         104010.83       57258.43
7   2010-05-07      17413.94         121424.77       57258.43
2   2010-06-04      17558.09         138982.86       57258.43
0   2010-07-02      16333.14         155316.00       57258.43
9   2010-08-06      17508.41         172824.41       57258.43
5   2010-09-03      16241.78         189066.19       57258.43
4   2010-10-01      20094.19         209160.38       57258.43
3   2010-11-05      34238.88         243399.26       57258.43
6   2010-12-03      22517.56         265916.82       57258.43
10  2011-01-07      15984.24         281901.06       57258.43


# Counting

## Dropping duplicates

In [8]:
# Read the sales CSV again so `sales` is defined for the counting section
sales = pd.read_csv(filepath_or_buffer='datasets/sales.csv')

In [9]:
# Keep the first row seen for every distinct (store, type) pair
store_types = sales.drop_duplicates(subset=['store', 'type'], keep='first')
print(store_types.head(n=5))

# Keep the first row seen for every distinct (store, department) pair
store_depts = sales.drop_duplicates(subset=['store', 'department'], keep='first')
print(store_depts.head(n=5))

# Restrict to rows flagged is_holiday, then keep the first row per date
holiday_dates = sales[sales['is_holiday']].drop_duplicates(subset=['date'], keep='first')

# Print only the date column of the de-duplicated holiday rows
print(holiday_dates['date'])

      store type  department        date  weekly_sales  is_holiday  \
0         1    A           1  2010-02-05      24924.50       False   
901       2    A           1  2010-02-05      35034.06       False   
1798      4    A           1  2010-02-05      38724.42       False   
2699      6    A           1  2010-02-05      25619.00       False   
3593     10    B           1  2010-02-05      40212.84       False   

      temperature_c  fuel_price_usd_per_l  unemployment  
0          5.727778              0.679451         8.106  
901        4.550000              0.679451         8.324  
1798       6.533333              0.686319         8.623  
2699       4.683333              0.679451         7.259  
3593      12.411111              0.782478         9.765  
    store type  department        date  weekly_sales  is_holiday  \
0       1    A           1  2010-02-05      24924.50       False   
12      1    A           2  2010-02-05      50605.27       False   
24      1    A           3 

## Counting categorical variables

In [24]:
# Load the two lookup tables used by the counting examples
stores = pd.read_csv(filepath_or_buffer='datasets/stores.csv')
departments = pd.read_csv(filepath_or_buffer='datasets/departments.csv')

In [25]:
# Tally the stores of each type: absolute counts first, then the share of the
# total that each type represents
store_counts = stores["store_type"].value_counts(normalize=False)
store_props = stores["store_type"].value_counts(normalize=True)
print(store_counts)
print(store_props)

# Same pair of tallies for department numbers, with sorting by frequency
# requested explicitly
dept_counts_sorted = departments["department_num"].value_counts(sort=True, normalize=False)
dept_props_sorted = departments["department_num"].value_counts(sort=True, normalize=True)
print(dept_counts_sorted)
print(dept_props_sorted)

A    11
B     1
Name: store_type, dtype: int64
A    0.916667
B    0.083333
Name: store_type, dtype: float64
41    12
30    12
23    12
24    12
25    12
26    12
27    12
28    12
29    12
31    12
21    12
32    12
33    12
34    12
35    12
36    12
38    12
40    12
22    12
20    12
42    12
9     12
2     12
3     12
4     12
5     12
6     12
7     12
8     12
10    12
      ..
82    12
83    12
85    12
87    12
90    12
91    12
67    12
60    12
59    12
58    12
56    12
55    12
54    12
52    12
51    12
94    12
49    12
95    12
47    12
46    12
96    12
45    12
97    12
92    12
99    11
37    10
48     8
50     6
39     4
43     2
Name: department_num, Length: 80, dtype: int64
41    0.012917
30    0.012917
23    0.012917
24    0.012917
25    0.012917
26    0.012917
27    0.012917
28    0.012917
29    0.012917
31    0.012917
21    0.012917
32    0.012917
33    0.012917
34    0.012917
35    0.012917
36    0.012917
38    0.012917
40    0.012917
22    0.012917
20    0.012