# Reading in the data

In [2]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Locate data/raw/data.csv from the working directory or the nearest parent folder
# that contains it, so the cell works whether the kernel starts in the project root
# or in a sub-folder (the plain relative path raised FileNotFoundError here).
raw_data_path = 'data/raw/data.csv'  # original path; used as-is if the search finds nothing
search_dir = os.getcwd()
while True:
    candidate = os.path.join(search_dir, 'data', 'raw', 'data.csv')
    if os.path.isfile(candidate):
        raw_data_path = candidate
        break
    parent_dir = os.path.dirname(search_dir)
    if parent_dir == search_dir:  # reached the filesystem root without a match
        break
    search_dir = parent_dir

# NOTE(review): encoding kept from the original cell — presumably the file is not
# valid UTF-8; confirm against the data source.
df = pd.read_csv(raw_data_path, encoding='ISO-8859-1')

df

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/data.csv'

# Basic Information

In [None]:
# Print pandas' concise summary of the frame (columns, non-null counts, dtypes).
df.info()

## Numerical columns

In [None]:
# Summary statistics with describe()'s default column selection. Left as the cell's
# last expression so Jupyter renders it as a table instead of printed plain text.
df.describe()

## Categorical columns

In [None]:
# Summary statistics restricted to the object-dtype columns. Left as the cell's last
# expression so Jupyter renders it as a table instead of printed plain text.
df.describe(include='O')

# Data Types Overview

In [None]:
# dtype of every column, shown as the cell's output (same form as the later
# `df.dtypes` cells in this notebook).
df.dtypes

# Unique Values

In [None]:
# Report how many distinct values each object-dtype column holds.
text_columns = df.select_dtypes(include='O').columns
for col in text_columns:
    distinct_count = df[col].nunique()
    print(f"Unique values for {col}: {distinct_count}")

# Check for Missing Values

## Visualizing Missing Data

In [None]:
# Heatmap of the null mask: one cell per (row, column) entry of the frame, coloured
# by whether the value is missing. Row labels and the colour bar are suppressed.
# `plt` and `sns` come from the import cell at the top of the notebook.
fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False, ax=ax)
ax.set_title('Missing Data Visualization')
plt.show()

## Identify missing values

In [None]:
# Count the null entries in every column.
missing_values = df.isna().sum()
print(missing_values)

# Express the same counts as a percentage of the total number of rows.
missing_percentage = missing_values.div(len(df)).mul(100)
print(missing_percentage)

## Handle missing values

In [None]:
# Filling missing values in 'Description' with NO DESCRIPTION AVAILABLE since less than 0.5% of values in 'Description' are missing.
# fillna returns a new Series, so the result must be assigned back to take effect.
df['Description'] = df['Description'].fillna('NO DESCRIPTION AVAILABLE')

# Dropping CustomerID as it will not be needed for this analysis.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['CustomerID'], errors='ignore')

df

# Date-time Conversion

In [None]:
# Column dtypes before the 'InvoiceDate' conversion in the next cell.
df.dtypes

In [None]:
# Parse the object-dtype 'InvoiceDate' column into datetimes, then store the parsed
# values back under the same column name.
parsed_invoice_dates = pd.to_datetime(df['InvoiceDate'])
df['InvoiceDate'] = parsed_invoice_dates

In [None]:
# Column dtypes again, to confirm the 'InvoiceDate' conversion from the previous cell.
df.dtypes

## Extracting date features

In [None]:
# Derive calendar features from the invoice timestamp: one new column per datetime
# component, added in the order Year, Month, Day, Weekday.
invoice_parts = df['InvoiceDate'].dt
date_features = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'Weekday': 'weekday',
}
for column_name, component in date_features.items():
    df[column_name] = getattr(invoice_parts, component)

# Save Cleaned Data

In [None]:
# Original absolute target; only usable where the project sits at the filesystem root.
output_dir = '/ECommerce-Sales-Analysis/data/processed'
if not os.path.isdir(output_dir):
    # Otherwise write to data/processed under the nearest folder (working directory
    # or one of its parents) that has a data/ directory, matching the relative
    # 'data/raw/...' layout the input is read from.
    project_root = os.getcwd()
    while not os.path.isdir(os.path.join(project_root, 'data')):
        parent_dir = os.path.dirname(project_root)
        if parent_dir == project_root:  # reached the filesystem root: use the cwd
            project_root = os.getcwd()
            break
        project_root = parent_dir
    output_dir = os.path.join(project_root, 'data', 'processed')
    # to_csv does not create missing parent folders.
    os.makedirs(output_dir, exist_ok=True)

# index=False: the row index is not written as a column.
df.to_csv(os.path.join(output_dir, 'cleaned_data.csv'), index=False)