In [2]:
import pandas as pd

### Dataset Description
This dataset contains 1000 sales records with the following columns:

- product: Product name (with intentional variations in naming/capitalization)
- country: Country of sale (with some variations in country name formats)
- status: Order status
- quantity: Number of items ordered

### Data Characteristics

- Inconsistent product names (e.g., "Laptop" vs "laptop")
- Country names with variations (e.g., "USA", "U.S.A.", "United States")
- Order statuses with a realistic distribution ("Delivered" is the most common, "Cancelled" the rarest)
- Quantities ranging from 1 to 9 items per order

In [6]:
# Read the sales records from the CSV file into a DataFrame; later cells
# refer to it by the name `df`.
df = pd.read_csv('sales_data.csv')

# Frequency table of the raw `product` strings. value_counts() treats every
# distinct spelling/capitalization as its own entry, so naming
# inconsistencies show up as separate rows.
print("Product value counts:")
product_counts = df['product'].value_counts()
print(product_counts)

Product value counts:
product
Laptop        166
Tablet        112
Headphones    107
laptop        103
SmartPhone    102
TV             86
Smartphone     82
smartphone     56
LAPTOP         50
headphones     46
television     46
Television     44
Name: count, dtype: int64
RangeIndex(start=0, stop=1000, step=1)


In [4]:
# Relative frequency of each raw `country` string.
# NOTE(review): the printed label says "percentages", but normalize=True
# yields fractions that sum to 1 (e.g. 0.203), not values scaled to 100.
# Multiply by 100 or reword the label if true percentages are wanted.
print("\nCountry distribution (in percentages):")
country_shares = df['country'].value_counts(normalize=True)
print(country_shares)


Country distribution (in percentages):
country
USA               0.203
UK                0.145
United Kingdom    0.123
Canada            0.103
Germany           0.093
U.S.A.            0.093
United States     0.089
France            0.059
CANADA            0.052
U.K.              0.040
Name: proportion, dtype: float64


In [5]:
# Show the status tally in two different orderings.
print("\nStatuses sorted alphabetically:")
status_column = df['status']

# value_counts() only orders by frequency; to order by the status labels
# themselves, chain .sort_index() onto the result.
print(status_column.value_counts().sort_index())

# ascending=True keeps the frequency ordering but lists the rarest status
# first and the most common one last.
print(status_column.value_counts(ascending=True))


Statuses sorted alphabetically:
status
Cancelled      53
Delivered     499
Pending       156
Processing    101
Shipped       191
Name: count, dtype: int64
status
Cancelled      53
Processing    101
Pending       156
Shipped       191
Delivered     499
Name: count, dtype: int64


In [6]:
# List every distinct raw `product` string once (no counts), which makes
# spelling and capitalization variants easy to spot side by side.
print("\nAll unique product names:")
distinct_products = df['product'].unique()
print(distinct_products)


All unique product names:
['SmartPhone' 'headphones' 'TV' 'Tablet' 'laptop' 'Laptop' 'Headphones'
 'television' 'smartphone' 'Smartphone' 'LAPTOP' 'Television']
