In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [None]:
# 1. Load data
# Read the raw CSV, decoding its bytes as Latin-1 instead of pandas' default UTF-8.
data = pd.read_csv(
    "data.csv",
    encoding="latin1",
)


In [None]:
# Quick structural overview of the freshly loaded frame.
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called bare; wrapping it in print() would emit a stray "None" line after the summary.
data.info()
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
print(data.describe())

In [None]:
# 2. Handle missing values
# - Rows without an InvoiceNo or Description are dropped.
# - Missing UnitPrice values are imputed with the median of the rows that remain
#   after the drop (the lambda sees the already-filtered frame).
# The column is reassigned through .assign() instead of calling
# data["UnitPrice"].fillna(..., inplace=True): that chained in-place form operates on
# a temporary Series, is deprecated in pandas 2.x, and does not update the frame
# when copy-on-write is enabled.
data = (
    data
    .dropna(subset=["InvoiceNo", "Description"])
    .assign(UnitPrice=lambda df: df["UnitPrice"].fillna(df["UnitPrice"].median()))
)


In [None]:
# 3. Remove duplicate rows
# Rows that match an earlier row in every column are discarded; the first
# occurrence is the one that stays. The frame is modified in place.
data.drop_duplicates(keep="first", inplace=True)


In [None]:
# 4. Convert data types
# Parse InvoiceDate into pandas datetimes so the .dt accessor can be used on it later.
data["InvoiceDate"] = data["InvoiceDate"].pipe(pd.to_datetime)

In [None]:
# 5. Feature engineering
# Calendar features taken from the parsed invoice timestamp, plus per-row revenue.
data["Month"] = data["InvoiceDate"].dt.month
data["Hour"] = data["InvoiceDate"].dt.hour
data["Revenue"] = data["Quantity"] * data["UnitPrice"]

# 6. Handle outliers
# Keep only rows whose UnitPrice lies within 1.5 * IQR of the quartiles (bounds inclusive).
q1, q3 = np.percentile(data["UnitPrice"], [25, 75])
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignments in steps 7 and 8 write to a real frame and avoid pandas'
# SettingWithCopyWarning.
data = data[data["UnitPrice"].between(lower_bound, upper_bound)].copy()

# 7. Encode categorical variables
# LabelEncoder replaces each distinct Country value with an integer code. The fitted
# `encoder` is kept so codes can be mapped back with encoder.inverse_transform().
encoder = LabelEncoder()
data["Country"] = encoder.fit_transform(data["Country"])

# 8. Scale numerical features
# Standardize UnitPrice and Revenue to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows here, before the notebook splits the
# data into train and test, so test rows influence the fitted mean/std. If these
# features feed a model, fit the scaler on the training rows only and transform both.
scaler = StandardScaler()
data[["UnitPrice", "Revenue"]] = scaler.fit_transform(data[["UnitPrice", "Revenue"]])

In [None]:
# Reproducible 80/20 split: draw 80% of the rows at random (fixed seed) for training
# and keep every row whose index label was not drawn as the test set.
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.loc[~data.index.isin(train_data.index)]

# Report the (rows, columns) shape of each partition.
print(f"Train size: {train_data.shape}, Test size: {test_data.shape}")

# 2. Univariate Analysis:


     Analyzing individual columns one at a time: a histogram of `UnitPrice` and a bar chart of the number of rows per `Country`.

In [None]:
# Histogram of UnitPrice (20 bins). The column was standardized in the scaling step,
# so the x axis is in standard deviations rather than currency units.
price_ax = data['UnitPrice'].hist(bins=20)
price_ax.set(title='UnitPrice distribution', xlabel='UnitPrice (standardized)', ylabel='Rows')

# Number of rows per country. Country holds LabelEncoder integer codes at this point,
# so map the codes back to the original labels to get readable tick labels.
country_counts = data['Country'].value_counts()
country_counts.index = encoder.inverse_transform(country_counts.index.to_numpy())

# Plot through a one-column DataFrame: DataFrame.plot opens a new figure, whereas
# Series.plot would reuse the current axes and draw the bars on top of the histogram.
country_ax = country_counts.to_frame().plot(kind='bar', legend=False)
country_ax.set(title='Rows per country', xlabel='Country', ylabel='Rows');

     Plotting the distribution of a numerical column: a box plot of `UnitPrice` to show its spread and any remaining outliers.
     

In [None]:
# Box plot of UnitPrice to show its median, quartiles and any points beyond the whiskers.
# seaborn (`sns`) is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place. UnitPrice was standardized in the scaling
# step, hence the axis label.
price_box_ax = sns.boxplot(x=data['UnitPrice'])
price_box_ax.set(title='UnitPrice box plot', xlabel='UnitPrice (standardized)');
