In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('transactions.csv')

# 1. Handle missing values
# Drop rows where both quantity and price are NaN; a row missing only one of
# the two is kept and imputed below.
df = df.dropna(subset=['quantity', 'price'], how='all')

# Fill the remaining gaps: median for quantity, mean for price.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['col']: that form is chained assignment on an
# intermediate Series, which warns on recent pandas and does not modify df
# at all when Copy-on-Write is enabled.
df['quantity'] = df['quantity'].fillna(df['quantity'].median())
df['price'] = df['price'].fillna(df['price'].mean())

# 2. Remove duplicate transactions (rows identical across every column)
df = df.drop_duplicates()

# 3. Create a 'day_of_week' column
# 'date' is parsed to datetime first so the .dt accessor is available here
# and in the later cells that derive month and recency from it.
df['date'] = pd.to_datetime(df['date'])
df['day_of_week'] = df['date'].dt.day_name()

# Output after cleaning
df.head()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Distribution of total sales per store
# Revenue is summed per store and sorted ascending, so bars rise left to right.
store_sales = df.groupby('store_id')['total_amount'].sum().sort_values()
fig, ax = plt.subplots(figsize=(10, 6))
store_sales.plot.bar(ax=ax)
ax.set_title('Total Sales per Store')
ax.set_xlabel('Store ID')
ax.set_ylabel('Total Sales')
plt.show()

# 2. Relationship between quantity and total_amount
# One point per transaction row.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='quantity', y='total_amount', ax=ax)
ax.set_title('Relationship between Quantity and Total Amount')
ax.set_xlabel('Quantity')
ax.set_ylabel('Total Amount')
plt.show()

# 3. Monthly sales trends
# 'month' is stored on df as a monthly Period (year + month), so the same
# calendar month in different years is plotted as separate points.
df['month'] = df['date'].dt.to_period('M')
monthly_sales = df.groupby('month')['total_amount'].sum()
fig, ax = plt.subplots(figsize=(10, 6))
monthly_sales.plot.line(ax=ax)
ax.set_title('Monthly Sales Trend')
ax.set_xlabel('Month')
ax.set_ylabel('Total Sales')
plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# RFM analysis
# Build one row per customer with a single aggregation rather than
# broadcasting customer-level values onto every transaction row of df and
# de-duplicating afterwards. groupby skips rows whose customer_id is missing,
# so no all-NaN customer row reaches the scaler / K-Means, and sort=False
# keeps customers in order of first appearance in df.
rfm_features = ['recency', 'frequency', 'monetary']
snapshot_date = df['date'].max()  # reference point: latest transaction in the data

rfm_df = (
    df.groupby('customer_id', sort=False)
      .agg(
          last_purchase=('date', 'max'),         # most recent transaction date
          frequency=('transaction_id', 'count'),  # number of non-null transaction ids
          monetary=('total_amount', 'sum'),       # total spend
      )
      # recency = whole days between the snapshot date and the last purchase
      .assign(recency=lambda g: (snapshot_date - g['last_purchase']).dt.days)
      .reset_index()
      .loc[:, ['customer_id'] + rfm_features]
)

# Scaling
# K-Means is distance based, so the three features are standardised first.
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_df[rfm_features])

# K-Means clustering
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the segmentation stable across environments.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
rfm_df = rfm_df.assign(segment=kmeans.fit_predict(rfm_scaled))

# Interpretation
# Average only the RFM features; customer_id is an identifier and averaging
# it is meaningless (and fails outright when the ids are strings).
rfm_df.groupby('segment')[rfm_features].mean()


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# RFM analysis
# NOTE(review): this cell repeats the preceding segmentation cell and
# recomputes the same rfm_df / scaler / kmeans objects; one of the two cells
# is a candidate for removal.
#
# Build one row per customer with a single aggregation rather than
# broadcasting customer-level values onto every transaction row of df and
# de-duplicating afterwards. groupby skips rows whose customer_id is missing,
# so no all-NaN customer row reaches the scaler / K-Means, and sort=False
# keeps customers in order of first appearance in df.
rfm_features = ['recency', 'frequency', 'monetary']
snapshot_date = df['date'].max()  # reference point: latest transaction in the data

rfm_df = (
    df.groupby('customer_id', sort=False)
      .agg(
          last_purchase=('date', 'max'),         # most recent transaction date
          frequency=('transaction_id', 'count'),  # number of non-null transaction ids
          monetary=('total_amount', 'sum'),       # total spend
      )
      # recency = whole days between the snapshot date and the last purchase
      .assign(recency=lambda g: (snapshot_date - g['last_purchase']).dt.days)
      .reset_index()
      .loc[:, ['customer_id'] + rfm_features]
)

# Scaling
# K-Means is distance based, so the three features are standardised first.
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_df[rfm_features])

# K-Means clustering
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the segmentation stable across environments.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
rfm_df = rfm_df.assign(segment=kmeans.fit_predict(rfm_scaled))

# Interpretation
# Average only the RFM features; customer_id is an identifier and averaging
# it is meaningless (and fails outright when the ids are strings).
rfm_df.groupby('segment')[rfm_features].mean()


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Prepare features and target variable
# 'month' on df is overwritten here with the integer calendar month taken
# from the transaction date.
df['month'] = df['date'].dt.month
feature_columns = ['store_id', 'month', 'quantity', 'price']
X, y = df[feature_columns], df['total_amount']

# Split data: 80% for training / cross-validation, 20% held out for the
# final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _mean_cv_score(estimator):
    """Return the mean 5-fold negated RMSE of `estimator` on the training split."""
    fold_scores = cross_val_score(
        estimator, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error'
    )
    return fold_scores.mean()


# Linear Regression Model
lr = LinearRegression().fit(X_train, y_train)
lr_cv_score = _mean_cv_score(lr)

# Gradient Boosting Model
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
gb_cv_score = _mean_cv_score(gb)

# Choose the best model and evaluate on the test set
# Scores are negated RMSE, so the larger (closer to zero) value is the better
# one. Gradient boosting is chosen only when it is strictly better; ties fall
# back to linear regression.
if gb_cv_score > lr_cv_score:
    model = gb
else:
    model = lr

y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

rmse, model


In [None]:
def allocate_stock(store_demand, current_stock, stock_limit):
    """
    Allocate a limited pool of stock across stores, highest demand first.

    Stores are visited in descending order of expected demand (ties keep their
    input order). Each store is granted its shortfall, i.e. expected demand
    minus the stock it already holds, floored at zero, capped by whatever is
    left in the pool. The grants therefore never sum to more than
    ``stock_limit``.

    Parameters:
        store_demand (DataFrame): A dataframe with 'store_id' and 'expected_demand' columns.
            It is not modified.
        current_stock (Series): Current stock available per store (indexed by store_id).
            Stores missing from this Series are treated as holding zero stock.
        stock_limit (int): Total stock available for allocation. Zero or a
            negative value results in an all-zero allocation.

    Returns:
        allocation (Series): Units granted to each store from the pool, indexed
            by store_id in descending-demand order.
    """
    # Rank stores by demand; a stable sort makes the order of ties deterministic.
    ranked = store_demand.sort_values(
        by='expected_demand', ascending=False, kind='stable'
    )
    store_ids = ranked['store_id'].tolist()
    demands = ranked['expected_demand'].tolist()

    # Align stock on hand to the ranked stores by store_id. The demand frame's
    # own row labels are unrelated to store ids, so lookups must not go
    # through them.
    on_hand = current_stock.reindex(store_ids, fill_value=0).tolist()

    remaining = max(stock_limit, 0)
    granted = []
    for expected, held in zip(demands, on_hand):
        shortfall = max(expected - held, 0)
        grant = min(shortfall, remaining)
        granted.append(grant)
        # Deduct exactly what was handed out so the pool is never overspent.
        remaining -= grant

    return pd.Series(granted, index=pd.Index(store_ids, name='store_id'))

# Example use of allocate_stock function
# A seeded generator keeps the demo inputs, and therefore the displayed
# allocation, identical on every run of the notebook.
rng = np.random.default_rng(42)
store_demand = pd.DataFrame({
    'store_id': range(1, 11),
    'expected_demand': rng.integers(50, 200, 10),  # upper bound is exclusive
})
current_stock = pd.Series(rng.integers(10, 100, 10), index=store_demand['store_id'])
stock_limit = 500

allocation = allocate_stock(store_demand, current_stock, stock_limit)
allocation
