In [3]:
print('setup working')

setup working


In [20]:

# use Python's subprocess when you need to capture or display output programmatically
import subprocess

commands = [
    ["git", "rev-parse", "--is-inside-work-tree"],
    ["git", "status"],
    ["git", "remote", "-v"]
]

for cmd in commands:
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
        print("$", " ".join(cmd))
        if proc.stdout:
            print(proc.stdout.strip())
        if proc.stderr:
            print(proc.stderr.strip())
    except FileNotFoundError:
        print('git is not installed or not found in PATH. Please install Git (https://git-scm.com) and restart the notebook kernel.')

git is not installed or not found in PATH. Please install Git (https://git-scm.com) and restart the notebook kernel.
git is not installed or not found in PATH. Please install Git (https://git-scm.com) and restart the notebook kernel.
git is not installed or not found in PATH. Please install Git (https://git-scm.com) and restart the notebook kernel.


In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [6]:
#read csv 
df = pd.read_csv('orders.csv', delimiter=";")
print(df.head())

         Order ID   Segment    Ship Mode   Value
0  CA-2011-103366  Consumer  First Class  149,95
1  CA-2011-109043  Consumer  First Class   243,6
2  CA-2011-113166  Consumer  First Class   9,568
3  CA-2011-124023  Consumer  First Class    8,96
4  CA-2011-130155  Consumer  First Class    34,2


In [7]:
#remove white spaces
df.columns = df.columns.str.strip()

In [8]:
df.head()

Unnamed: 0,Order ID,Segment,Ship Mode,Value
0,CA-2011-103366,Consumer,First Class,14995
1,CA-2011-109043,Consumer,First Class,2436
2,CA-2011-113166,Consumer,First Class,9568
3,CA-2011-124023,Consumer,First Class,896
4,CA-2011-130155,Consumer,First Class,342


In [9]:
#check shape and info
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 834 entries, 0 to 833
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Order ID   834 non-null    object
 1   Segment    127 non-null    object
 2   Ship Mode  834 non-null    object
 3   Value      834 non-null    object
dtypes: object(4)
memory usage: 26.2+ KB


In [10]:
#check stats
df.describe()

Unnamed: 0,Order ID,Segment,Ship Mode,Value
count,834,127,834,834
unique,823,3,4,810
top,Grand Total,Consumer,Standard Class,15552
freq,12,73,498,4


In [12]:
#data cleaning
#check for missing values
missing = df.isnull().sum().sort_values(ascending=False)
missing_perc = (df.isnull().sum()/len(df)*100).sort_values(ascending=False)
pd.concat([missing, missing_perc], axis=1, keys=['Missing Values', '% of Total Values']).transpose()        


Unnamed: 0,Segment,Order ID,Ship Mode,Value
Missing Values,707.0,0.0,0.0,0.0
% of Total Values,84.772182,0.0,0.0,0.0


In [14]:
#use mode to fill missing values in Segment column
df['Segment'].fillna(df['Segment'].mode()[0], inplace=True)

In [15]:
#check missing value again
df.isnull().sum()

Order ID     0
Segment      0
Ship Mode    0
Value        0
dtype: int64

In [16]:
#check duplicates 
duplicates= df.duplicated().sum()
print(f'The number of duplicate rows is: {duplicates}')


The number of duplicate rows is: 0


In [17]:
#change data types
df['Value'] = pd.to_numeric(df['Value'], errors='coerce')
df['Segment'] = df['Segment'].astype('category')
df['Ship Mode'] = df['Ship Mode'].astype('category')    