In [None]:
from pathlib import Path

import pandas as pd

# Directory holding the input CSVs. Kept in one constant so the notebook can be
# pointed at a different data location by editing a single line.
DATA_DIR = Path('/mnt/data')

# Load CSVs
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
store = pd.read_csv(DATA_DIR / 'store.csv')

# Preview shapes and data
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Store shape:", store.shape)

# Last expression of the cell, so the notebook renders the first rows of train.
train.head()

In [None]:
# Merge store info into train and test.
# how='left' keeps every row of the left frame, even when its Store value has
# no match in `store`. validate='many_to_one' makes pandas raise a MergeError
# if `store` contains duplicate Store keys, instead of silently duplicating
# rows in the merged result.
train_merged = pd.merge(train, store, on='Store', how='left', validate='many_to_one')
test_merged = pd.merge(test, store, on='Store', how='left', validate='many_to_one')

# Convert Date to datetime so it can be grouped and plotted as a time axis.
train_merged['Date'] = pd.to_datetime(train_merged['Date'])
test_merged['Date'] = pd.to_datetime(test_merged['Date'])

# Check merged shapes (a many-to-one left merge leaves the row count unchanged).
print("Merged train shape:", train_merged.shape)
print("Merged test shape:", test_merged.shape)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count nulls per column, then keep only the columns that actually have
# missing values, ordered from most to fewest.
null_counts = train_merged.isnull().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)

# Horizontal bar chart: one bar per column with missing data.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=missing.values, y=missing.index, ax=ax)
ax.set_title("Missing Values by Feature")
ax.set_xlabel("Count")
ax.set_ylabel("Feature")
plt.show()

In [None]:
# Histogram of the Sales column (100 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(train_merged['Sales'], bins=100, kde=True, ax=ax)
ax.set(
    title="Distribution of Daily Sales",
    xlabel="Sales",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Sum Sales across all rows sharing the same Date.
daily_sales = train_merged.groupby('Date')['Sales'].sum()

# Line plot of the per-date totals on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 5))
daily_sales.plot(ax=ax)
ax.set_title("Total Sales Over Time")
ax.set_ylabel("Sales")
ax.set_xlabel("Date")
ax.grid(True)
plt.show()

In [None]:
# Box plot of Sales, one box per StoreType value.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=train_merged, x='StoreType', y='Sales', ax=ax)
ax.set_title("Sales Distribution by Store Type")
plt.show()

In [None]:
# Box plot of Sales, one box per value of the Promo column.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=train_merged, x='Promo', y='Sales', ax=ax)
ax.set_title("Sales During Promo vs Non-Promo Periods")
plt.show()