In [1]:
# import packages/modules
import pandas as pd
import os

In [2]:
# List the entries of the data directory, keeping only CSV files so that
# stray entries (hidden OS files, sub-directories, ...) are never handed
# to pd.read_csv by the import step. The directory order is left untouched.
file_list = [
    entry for entry in os.listdir("./data") if entry.lower().endswith(".csv")
]
file_list

['Sales_December_2019.csv',
 'Sales_April_2019.csv',
 'Sales_February_2019.csv',
 'Sales_March_2019.csv',
 'Sales_August_2019.csv',
 'Sales_May_2019.csv',
 'Sales_November_2019.csv',
 'Sales_October_2019.csv',
 'Sales_January_2019.csv',
 'Sales_September_2019.csv',
 'Sales_July_2019.csv',
 'Sales_June_2019.csv']

----
## Import Data

In [3]:
# Read every monthly file into its own frame first ...
monthly_frames = []
for file in file_list:
    temp_data = pd.read_csv(os.path.join("data", file))
    monthly_frames.append(temp_data)

# ... then join them with a single concat. Concatenating inside the loop
# would re-copy the accumulated frame on every iteration.
# Original row labels of each file are kept (no ignore_index), and an empty
# file list still yields an empty DataFrame instead of raising.
yearly_sales_data_2019 = (
    pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()
)

----
## Cleaning Data

In [4]:
# Order the rows by the "Order Date" column.
# NOTE(review): the column has not been parsed to datetime at this point, so
# this is presumably a plain string sort — confirm before relying on it for
# chronological order.
yearly_sales_data_2019 = yearly_sales_data_2019.sort_values(by="Order Date")

In [5]:
# Show every row that has at least one missing value.
has_missing_value = yearly_sales_data_2019.isna().any(axis=1)
yearly_sales_data_2019[has_missing_value]

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
264,,,,,,
648,,,,,,
680,,,,,,
1385,,,,,,
1495,,,,,,
...,...,...,...,...,...,...
12567,,,,,,
12640,,,,,,
12659,,,,,,
12732,,,,,,


In [6]:
# Discard rows in which every column is NaN (completely blank records),
# then display the remaining frame.
yearly_sales_data_2019 = yearly_sales_data_2019.dropna(axis=0, how="all")
yearly_sales_data_2019

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
6344,147268,Wired Headphones,1,11.99,01/01/19 03:07,"9 Lake St, New York City, NY 10001"
7154,148041,USB-C Charging Cable,1,11.95,01/01/19 03:40,"760 Church St, San Francisco, CA 94016"
8507,149343,Apple Airpods Headphones,1,150,01/01/19 04:56,"735 5th St, New York City, NY 10001"
9161,149964,AAA Batteries (4-pack),1,2.99,01/01/19 05:53,"75 Jackson St, Dallas, TX 75001"
8514,149350,USB-C Charging Cable,2,11.95,01/01/19 06:03,"943 2nd St, Atlanta, GA 30301"
...,...,...,...,...,...,...
11960,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
12332,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
11574,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
4022,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address


In [7]:
# Inspect the rows that repeat the CSV header inside the data: in those rows
# the "Order ID" cell holds the literal text "Order ID".
is_header_row = yearly_sales_data_2019["Order ID"].eq("Order ID")
yearly_sales_data_2019[is_header_row]

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
21148,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
4466,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
8799,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
14233,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
5074,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
...,...,...,...,...,...,...
11960,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
12332,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
11574,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
4022,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address


In [8]:
# Confirm that the repeated header rows all sit at the end of the frame.
# The count is derived from the data instead of being hard-coded, and one
# extra row is shown so the last real order before them is visible.
header_row_count = int((yearly_sales_data_2019["Order ID"] == "Order ID").sum())
yearly_sales_data_2019.tail(header_row_count + 1)

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
2117,297674,AAA Batteries (4-pack),1,2.99,12/31/19 23:53,"425 Lake St, Portland, OR 97035"
21148,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
4466,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
8799,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
14233,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
...,...,...,...,...,...,...
11960,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
12332,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
11574,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
4022,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address


In [9]:
# Drop the repeated header rows by value rather than by position.
# A positional slice depends on a hand-counted number of rows and on the
# current sort order, and removes real orders if the cell is run twice;
# the mask below is safe to re-run and keeps the remaining rows in order.
yearly_sales_data_2019 = yearly_sales_data_2019[
    yearly_sales_data_2019["Order ID"] != "Order ID"
]

In [10]:
# Swap the existing row labels for a fresh 0..n-1 index; drop=True discards
# the old labels instead of keeping them as a column. Then display the frame.
yearly_sales_data_2019 = (
    yearly_sales_data_2019
    .reset_index(drop=True)
)
yearly_sales_data_2019

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,147268,Wired Headphones,1,11.99,01/01/19 03:07,"9 Lake St, New York City, NY 10001"
1,148041,USB-C Charging Cable,1,11.95,01/01/19 03:40,"760 Church St, San Francisco, CA 94016"
2,149343,Apple Airpods Headphones,1,150,01/01/19 04:56,"735 5th St, New York City, NY 10001"
3,149964,AAA Batteries (4-pack),1,2.99,01/01/19 05:53,"75 Jackson St, Dallas, TX 75001"
4,149350,USB-C Charging Cable,2,11.95,01/01/19 06:03,"943 2nd St, Atlanta, GA 30301"
...,...,...,...,...,...,...
185945,297481,AA Batteries (4-pack),1,3.84,12/31/19 23:42,"82 Hill St, Dallas, TX 75001"
185946,298406,AAA Batteries (4-pack),2,2.99,12/31/19 23:42,"30 Elm St, San Francisco, CA 94016"
185947,317048,AAA Batteries (4-pack),2,2.99,12/31/19 23:45,"743 Adams St, San Francisco, CA 94016"
185948,309773,AAA Batteries (4-pack),1,2.99,12/31/19 23:52,"60 Hickory St, Los Angeles, CA 90001"


----
## Exploring the data

In [15]:
# First five rows of the frame.
yearly_sales_data_2019.head(n=5)

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,147268,Wired Headphones,1,11.99,01/01/19 03:07,"9 Lake St, New York City, NY 10001"
1,148041,USB-C Charging Cable,1,11.95,01/01/19 03:40,"760 Church St, San Francisco, CA 94016"
2,149343,Apple Airpods Headphones,1,150.0,01/01/19 04:56,"735 5th St, New York City, NY 10001"
3,149964,AAA Batteries (4-pack),1,2.99,01/01/19 05:53,"75 Jackson St, Dallas, TX 75001"
4,149350,USB-C Charging Cable,2,11.95,01/01/19 06:03,"943 2nd St, Atlanta, GA 30301"


In [16]:
# Last five rows of the frame.
yearly_sales_data_2019.tail(n=5)

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
185945,297481,AA Batteries (4-pack),1,3.84,12/31/19 23:42,"82 Hill St, Dallas, TX 75001"
185946,298406,AAA Batteries (4-pack),2,2.99,12/31/19 23:42,"30 Elm St, San Francisco, CA 94016"
185947,317048,AAA Batteries (4-pack),2,2.99,12/31/19 23:45,"743 Adams St, San Francisco, CA 94016"
185948,309773,AAA Batteries (4-pack),1,2.99,12/31/19 23:52,"60 Hickory St, Los Angeles, CA 90001"
185949,297674,AAA Batteries (4-pack),1,2.99,12/31/19 23:53,"425 Lake St, Portland, OR 97035"


In [18]:
# Dimensions of the frame as a (row count, column count) tuple.
yearly_sales_data_2019.shape

(185950, 6)

In [11]:
# Column labels of the frame.
yearly_sales_data_2019.columns

Index(['Order ID', 'Product', 'Quantity Ordered', 'Price Each', 'Order Date',
       'Purchase Address'],
      dtype='object')

In [12]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
yearly_sales_data_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Order ID          185950 non-null  object
 1   Product           185950 non-null  object
 2   Quantity Ordered  185950 non-null  object
 3   Price Each        185950 non-null  object
 4   Order Date        185950 non-null  object
 5   Purchase Address  185950 non-null  object
dtypes: object(6)
memory usage: 8.5+ MB


In [21]:
# Summary statistics for each column of the frame.
yearly_sales_data_2019.describe()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
count,185950,185950,185950,185950.0,185950,185950
unique,178437,19,9,23.0,142395,140787
top,160873,USB-C Charging Cable,1,11.95,12/15/19 20:16,"193 Forest St, San Francisco, CA 94016"
freq,5,21903,168552,21903.0,8,9
