In [1]:
#Data loading and imports
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Directory holding the pharma sales CSV exports. It defaults to the original
# author's local folder, but can be redirected by setting the
# PHARMA_SALES_DATA_DIR environment variable so the notebook runs on other
# machines without editing any code.
DATA_DIR = Path(os.environ.get(
    'PHARMA_SALES_DATA_DIR',
    '/Users/arka_bagchi/Desktop/Springboard/pharma_sales_data',
))

# Load each aggregation level (daily, hourly, monthly, weekly) into its own DataFrame
sales_daily = pd.read_csv(DATA_DIR / 'salesdaily.csv')
sales_hourly = pd.read_csv(DATA_DIR / 'saleshourly.csv')
sales_monthly = pd.read_csv(DATA_DIR / 'salesmonthly.csv')
sales_weekly = pd.read_csv(DATA_DIR / 'salesweekly.csv')


In [2]:
# Dimensions of the daily sales table as (rows, columns)
sales_daily.shape

(2106, 13)

In [3]:
# Summarise the frame's structure: column names, non-null counts, dtypes and memory usage
sales_daily.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2106 entries, 0 to 2105
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   datum         2106 non-null   object 
 1   M01AB         2106 non-null   float64
 2   M01AE         2106 non-null   float64
 3   N02BA         2106 non-null   float64
 4   N02BE         2106 non-null   float64
 5   N05B          2106 non-null   float64
 6   N05C          2106 non-null   float64
 7   R03           2106 non-null   float64
 8   R06           2106 non-null   float64
 9   Year          2106 non-null   int64  
 10  Month         2106 non-null   int64  
 11  Hour          2106 non-null   int64  
 12  Weekday Name  2106 non-null   object 
dtypes: float64(8), int64(3), object(2)
memory usage: 214.0+ KB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
# NOTE(review): 'Hour' ranges from 190 to 276 in this summary, so it is not an
# hour-of-day value -- confirm what it measures before using it as a feature.
sales_daily.describe()

Unnamed: 0,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06,Year,Month,Hour
count,2106.0,2106.0,2106.0,2106.0,2106.0,2106.0,2106.0,2106.0,2106.0,2106.0,2106.0
mean,5.033683,3.89583,3.880441,29.917095,8.853627,0.593522,5.512262,2.900198,2016.401235,6.344255,275.945869
std,2.737579,2.133337,2.38401,15.590966,5.605605,1.092988,6.428736,2.415816,1.66506,3.386954,1.970547
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014.0,1.0,190.0
25%,3.0,2.34,2.0,19.0,5.0,0.0,1.0,1.0,2015.0,3.0,276.0
50%,4.99,3.67,3.5,26.9,8.0,0.0,4.0,2.0,2016.0,6.0,276.0
75%,6.67,5.138,5.2,38.3,12.0,1.0,8.0,4.0,2018.0,9.0,276.0
max,17.34,14.463,16.0,161.0,54.833333,9.0,45.0,15.0,2019.0,12.0,276.0


In [5]:
# Parse 'datum' (read from the CSV as object dtype) into pandas datetimes so that
# date operations such as min()/max() work on real dates instead of strings
sales_daily['datum'] = pd.to_datetime(sales_daily['datum'])

In [6]:
# Preview the first rows to eyeball the parsed 'datum' values alongside the sales columns
sales_daily.head()

Unnamed: 0,datum,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06,Year,Month,Hour,Weekday Name
0,2014-01-02,0.0,3.67,3.4,32.4,7.0,0.0,0.0,2.0,2014,1,248,Thursday
1,2014-01-03,8.0,4.0,4.4,50.6,16.0,0.0,20.0,4.0,2014,1,276,Friday
2,2014-01-04,2.0,1.0,6.5,61.85,10.0,0.0,9.0,1.0,2014,1,276,Saturday
3,2014-01-05,4.0,3.0,7.0,41.1,8.0,0.0,3.0,0.0,2014,1,276,Sunday
4,2014-01-06,5.0,1.0,4.5,21.7,16.0,2.0,6.0,2.0,2014,1,276,Monday


In [7]:
# Count the null entries in every column of sales_daily
missing_values = sales_daily.isna().sum()

missing_values

datum           0
M01AB           0
M01AE           0
N02BA           0
N02BE           0
N05B            0
N05C            0
R03             0
R06             0
Year            0
Month           0
Hour            0
Weekday Name    0
dtype: int64

In [8]:
# List every column's dtype to confirm that 'datum' is now a datetime column
# rather than object
sales_daily.dtypes

datum           datetime64[ns]
M01AB                  float64
M01AE                  float64
N02BA                  float64
N02BE                  float64
N05B                   float64
N05C                   float64
R03                    float64
R06                    float64
Year                     int64
Month                    int64
Hour                     int64
Weekday Name            object
dtype: object

In [9]:
# Median of the M01AB column across all days (same figure as the 50% row of describe())
sales_daily['M01AB'].median()

4.99

In [11]:
# Mean of the M01AE column across all days, at full floating-point precision
sales_daily['M01AE'].mean()

3.895830316160029

In [12]:
# Median of the N02BE column across all days (same figure as the 50% row of describe())
sales_daily['N02BE'].median()

26.9

In [14]:
# Now that 'datum' has been converted to datetime, find the earliest date
# covered by the daily data

sales_daily['datum'].min()

Timestamp('2014-01-02 00:00:00')

In [15]:
# Latest date covered by the daily data
sales_daily['datum'].max()

Timestamp('2019-10-08 00:00:00')

In [16]:
# Define a function to detect outliers using the IQR method
def detect_outliers(data: pd.DataFrame, column: str, multiplier: float = 1.5) -> pd.Series:
    """Flag the values of ``data[column]`` that are outliers under the IQR rule.

    A value is flagged when it lies strictly below ``Q1 - multiplier * IQR`` or
    strictly above ``Q3 + multiplier * IQR``, where Q1 and Q3 are the 25th and
    75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the column to inspect.
    column : str
        Label of a numeric column in ``data``.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 * IQR rule; larger values flag only more extreme points.

    Returns
    -------
    pandas.Series
        Boolean series aligned with ``data[column]``; True marks an outlier.
    """
    # Look the column up once and reuse it for the quantiles and the comparison
    values = data[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # Strict inequalities: values sitting exactly on a fence are not outliers
    return (values < lower_bound) | (values > upper_bound)

# Columns holding the sales figures, one per drug class
drug_columns = [
    'M01AB', 'M01AE', 'N02BA', 'N02BE',
    'N05B', 'N05C', 'R03', 'R06',
]

# Tally the IQR-rule outliers per drug class; summing the boolean mask
# counts its True entries
outliers_dict = {
    drug: detect_outliers(sales_daily, drug).sum()
    for drug in drug_columns
}

outliers_dict

{'M01AB': 26,
 'M01AE': 40,
 'N02BA': 31,
 'N02BE': 48,
 'N05B': 48,
 'N05C': 160,
 'R03': 124,
 'R06': 81}

In [17]:
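# N05C (psycholeptics: hypnotics and sedatives) and R03 (drugs for obstructive
# airway diseases) have a noticeably higher number of IQR outliers than the other
# drug classes (160 and 124, respectively).
# These outliers could be due to actual spikes in sales or potential errors in the data.
# For N05C the count is also inflated by the method itself: its quartiles are 0 and 1,
# so the upper fence is only 2.5 and every day with sales above that is flagged.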


In [20]:
# Spread (maximum minus minimum) of every drug class column
drug_sales = sales_daily[drug_columns]
range_values = drug_sales.max() - drug_sales.min()

range_values

M01AB     17.340000
M01AE     14.463000
N02BA     16.000000
N02BE    161.000000
N05B      54.833333
N05C       9.000000
R03       45.000000
R06       15.000000
dtype: float64