In [20]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import quote_plus
from jdatetime import datetime
from scipy.stats import ttest_ind
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.ensemble import RandomForestRegressor
import re
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
file_dir = "sales_data.csv"
file_dir2 = "dollar_price.csv"
file_dir3 = "country-cities-data.csv"
pd.set_option("display.max_colwidth", None)

In [None]:
# li = df["Order_Date"].str.split("-").str[0]
# li.unique()

# test = df[df["Model_Name"] == "Vostro 3568"]
# test = test[
#     [
#         # "Manufacturer",
#         "Model_Name",
#         # "Category",
#         "Screen_Size",
#         "Screen",
#         "CPU",
#         "GPU",
#         # "RAM",
#         "Storage",
#         # "OS",
#         # "OS_Version",
#         "Weight",
#     ]
# ]
# test = test.drop_duplicates()

# df_products[df_products["Model_Name"] == "Vostro 3568"]

## Create DataFrame

In [7]:
df = pd.read_csv(file_dir, low_memory=False)
df["Screen_Size"] = df["Screen_Size"].str.replace('"', "").astype(float)
df["RAM"] = df["RAM"].str.replace("GB", "").astype(int)
df["Weight"] = df["Weight"].replace(["kgs", "kg"], "", regex=True).astype(float)

df["Date_Shamsi"] = df["Order_Date"].copy()
df["Order_Date"] = df["Order_Date"].apply(
    lambda x: datetime.strptime(x, "%Y-%m-%d").togregorian()
)

df_dollar_price = pd.read_csv(file_dir2, index_col=[0])
df_dollar_price["miladi"] = pd.to_datetime(df_dollar_price["miladi"])

df = pd.merge(
    df,
    df_dollar_price,
    how="left",
    left_on="Order_Date",
    right_on="miladi",
)
df.drop(columns=["miladi", "shamsi"], inplace=True)
df.rename(columns={"close_price": "Dollar_Price"}, inplace=True)

## Req 1

In [3]:
df_temp = df.copy()
df_temp["Total_Price"] = df_temp["Total_Price"] / df_temp["Dollar_Price"]
df_temp["Profit"] = df_temp["Profit"] / df_temp["Dollar_Price"]

sales_by_city = df_temp.groupby("Branch")["Total_Price"].sum()

profit_by_city = df_temp.groupby("Branch")["Profit"].mean()

# محاسبه نسبت تخفیف به تعداد فروخته‌شده بر اساس شهر
discount_to_sales_ratio = df_temp.groupby("Branch").apply(
    lambda x: x["Discount"].sum() / x["Quantity"].sum()
)

# محاسبه نسبت قیمت به سود بر اساس شهر
price_to_profit_ratio = df_temp.groupby("Branch").apply(
    lambda x: x["Total_Price"].sum() / x["Profit"].sum()
)

# محاسبه نسبت سفارشات فوری بر اساس شهر
priority_orders_ratio = (
    df_temp[df_temp["Order_Priority"] == "H"]
    .groupby("Branch")["Order_Priority"]
    .count()
    / df_temp.groupby("Branch")["Order_Priority"].count()
)

city_metrics = pd.DataFrame(
    {
        "Sales": sales_by_city,
        "Discount_to_Sales_Ratio": discount_to_sales_ratio,
        "Price_to_Profit_Ratio": price_to_profit_ratio,
        "Priority_Orders_Ratio": priority_orders_ratio,
        "Profitability": profit_by_city,
    }
)

city_metrics = city_metrics.sort_values(by=["Profitability"], ascending=False)
city_metrics

Unnamed: 0_level_0,Sales,Discount_to_Sales_Ratio,Price_to_Profit_Ratio,Priority_Orders_Ratio,Profitability
Branch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kermanshah,57099460.0,0.539443,10.084548,0.150008,111.230445
Rasht,56964590.0,0.544192,10.085701,0.151639,111.070674
Tehran,170659700.0,0.543006,10.083088,0.149961,110.961857
Shiraz,57027530.0,0.551514,10.099013,0.152712,110.854983
Mashhad,113722500.0,0.544973,10.089404,0.151853,110.848092
Zanjan,56668480.0,0.539405,10.091168,0.149893,110.814812
Isfahan,113601600.0,0.54691,10.090982,0.150534,110.697659
Arak,56909780.0,0.546443,10.098198,0.151743,110.687169
Hamedan,56993700.0,0.544877,10.1004,0.147144,110.676225
Yazd,56651020.0,0.546776,10.084814,0.150546,110.662668


In [None]:
df_pupulation = pd.read_csv(file_dir3)
df_pupulation.loc[15, "city"] = "Hamedan"

df_branchs = pd.DataFrame(columns=["Branch", "Sum_Sell", "Total_Price", "Profit"])
df_temp = df.copy()

df_temp["Total_Price"] = df_temp["Total_Price"] / df_temp["Dollar_Price"]
df_temp["Profit"] = df_temp["Profit"] / df_temp["Dollar_Price"]

for branch in df_temp["Branch"].unique():
    temp = df_temp[df_temp["Branch"] == branch]
    df_branchs.loc[len(df_branchs.index) + 1] = {
        "Branch": branch,
        # "Sum_Sell": len(temp),
        "Sum_Sell": temp["Quantity"].sum(),
        "Total_Price": temp["Total_Price"].sum().round(2),
        "Profit": temp["Profit"].sum().round(2),
    }

df_branchs["Profit_key"] = df_branchs["Profit"] / df_branchs["Total_Price"]

df_branchs = pd.merge(
    df_branchs,
    df_pupulation,
    how="left",
    left_on="Branch",
    right_on="city",
)
df_branchs = df_branchs.drop(columns=["city", "country", "latitude", "longitude"])

weight_profit = 0.4
weight_margin = 0.3
weight_sales = 0.2
weight_population = 0.1

df_branchs["Weighted_Score"] = (
    df_branchs["Profit"] * weight_profit
    + df_branchs["Profit_key"] * weight_margin
    + df_branchs["Sum_Sell"] * weight_sales
    + df_branchs["pop2023"] * weight_population
)

df_branchs = df_branchs.sort_values(by=["Weighted_Score"], ascending=False)
df_branchs

## Req 2

In [32]:
df_temp = df.copy()
df_temp["Total_Price"] = df_temp["Total_Price"] / df_temp["Dollar_Price"]

df_with_discount = df_temp[df_temp["Discount"] > 0]["Total_Price"]  # ['Quantity']
df_without_discount = df_temp[df_temp["Discount"] == 0]["Total_Price"]  # ['Quantity']

t_stat, p_value = ttest_ind(df_with_discount, df_without_discount, equal_var=False)

alpha = 0.05
if p_value < alpha:
    print("تخفیف بر میزان فروش تأثیر دارد")
else:
    print("تخفیف بر میزان فروش تأثیر ندارد")

تخفیف بر میزان فروش تأثیر دارد (میانگین میزان فروش با تخفیف نسبت به بدون تخفیف متفاوت است)


In [43]:
# pd.Grouper(key="Order_Date", freq="Y")
df_temp = df[df["Discount"] != 0].copy()
df_temp["Discount"] = df_temp["Discount"] * df_temp["Quantity"]
result = (
    df_temp.groupby(df_temp["Order_Date"].dt.year)
    .agg({"Discount": "mean", "Quantity": "sum"})
    .reset_index()
)
result = result.sort_values(by=["Quantity"], ascending=False)
result

Unnamed: 0,Order_Date,Discount,Quantity
2,2013-12-31,4.475702,14468
3,2014-12-31,4.466426,14425
4,2015-12-31,4.421999,14405
6,2017-12-31,4.400785,14344
7,2018-12-31,4.413529,14251
9,2020-12-31,4.434722,14248
1,2012-12-31,4.399534,14192
10,2021-12-31,4.43371,14178
8,2019-12-31,4.398627,14166
11,2022-12-31,4.453935,14146


## Req 3

In [42]:
df_temp = df.copy()
df_temp["Profit"] = df_temp["Profit"] / df_temp["Dollar_Price"]

X = sm.add_constant(df_temp["Discount"])
y = df_temp["Profit"]
model = sm.OLS(y, X).fit()
# print(model.summary())

print("Degrees of Freedom: ", model.df_resid)
print("t_stat: ", model.tvalues["Discount"])
print("p-value: ", model.pvalues["Discount"])

alpha = 0.05
if model.pvalues["Discount"] < alpha:
    print("تخفیف بر میزان سود تأثیر دارد")
else:
    print("تخفیف بر میزان سود تأثیر ندارد")

Degrees of Freedom:  1017079.0
t_stat:  -30.75367833795924
p-value:  1.3608305742414363e-207
تخفیف بر میزان سود تأثیر دارد


In [45]:
df_temp = df[df["Discount"] != 0].copy()
df_temp["Discount"] = df_temp["Discount"] * df_temp["Quantity"]
result = (
    df_temp.groupby(df_temp["Order_Date"].dt.year)
    .agg({"Discount": "mean"})
    .reset_index()
)
df_temp = df.copy()
df_temp["Profit"] = (df_temp["Profit"] / df_temp["Dollar_Price"]).round(2)
result2 = (
    df_temp.groupby(df_temp["Order_Date"].dt.year).agg({"Profit": "sum"}).reset_index()
)
df_temp = pd.merge(
    result,
    result2,
    how="left",
    left_on="Order_Date",
    right_on="Order_Date",
)
df_temp = df_temp.sort_values(by=["Profit"], ascending=False)
df_temp

Unnamed: 0,Order_Date,Discount,Profit
10,2021,4.43371,9613412.88
9,2020,4.434722,9607354.08
8,2019,4.398627,9575305.23
7,2018,4.413529,9522492.03
11,2022,4.453935,9470163.59
4,2015,4.421999,9469229.91
6,2017,4.400785,9463916.91
5,2016,4.39906,9462892.6
3,2014,4.466426,9446382.98
2,2013,4.475702,9405244.16


## Req 4

In [75]:
df_temp = df.copy()
# df_temp["Total_Price"] = df_temp["Total_Price"] / df_temp["Dollar_Price"]
df_temp["Profit"] = df_temp["Profit"] / df_temp["Dollar_Price"]
df_temp["Price"] = df_temp["Price"] / df_temp["Dollar_Price"]

brands = df_temp["Manufacturer"].unique()
categories = df_temp["Category"].unique()
capital_allocation = pd.DataFrame(index=brands, columns=categories)
# model = LinearRegression()

for brand in brands:
    for category in categories:
        filtered_data = df_temp[
            (df_temp["Manufacturer"] == brand) & (df_temp["Category"] == category)
        ]

        features, target = 0, 0
        if len(filtered_data) > 0:
            # features = filtered_data[['Price']].values.reshape(-1, 1)
            target = filtered_data["Profit"].values

        # model.fit(features, target)
        # پیش‌بینی سود بر اساس قیمت
        # predicted_profits = model.predict(features)
        # تخصیص سرمایه بهینه بر اساس میانگین سود پیش‌بینی شده
        optimal_capital = np.max(target)
        capital_allocation.at[brand, category] = optimal_capital

capital_allocation["sum_profits_per_manufacturer"] = capital_allocation.sum(axis=1)
capital_allocation.loc["sum_profits_per_category"] = capital_allocation.sum(axis=0)
capital_allocation

Unnamed: 0,Notebook,2 in 1 Convertible,Ultrabook,Gaming,Workstation,Netbook,sum_profits_per_manufacturer
Dell,897.914194,960.905055,878.099174,1251.364927,1039.576366,314.729504,5342.589219
Lenovo,1748.971193,1041.912633,1029.802302,1236.59757,1136.842105,623.076923,6817.202726
Acer,548.211769,476.006491,473.47133,709.068234,0.0,227.823369,2434.581193
HP,800.5997,908.693434,980.276715,754.658863,1526.136065,799.256506,5769.621283
Fujitsu,362.053162,0.0,0.0,0.0,0.0,0.0,362.053162
Asus,741.802988,657.915058,741.176471,1358.096344,0.0,76.097956,3575.088816
MSI,0.0,0.0,0.0,1277.283528,0.0,0.0,1277.283528
Toshiba,792.109256,0.0,1141.566265,0.0,0.0,0.0,1933.675522
Apple,0.0,0.0,907.696241,0.0,0.0,0.0,907.696241
Samsung,923.694779,649.132948,885.90604,0.0,0.0,93.617021,2552.350789


In [None]:
result_df = (
    df.assign(Profit_Per_Dollar=(df["Profit"] / df["Dollar_Price"]).round(2))
    .groupby("Manufacturer")
    .agg(Profit_Sum=("Profit_Per_Dollar", "sum"))
    .sort_values(by=["Profit_Sum"], ascending=False)
    .reset_index()
)
result_df

In [None]:
result_df = (
    df.assign(Profit_Per_Dollar=(df["Profit"] / df["Dollar_Price"]).round(2))
    .groupby("Category")
    .agg(Profit_Sum=("Profit_Per_Dollar", "sum"))
    .sort_values(by=["Profit_Sum"], ascending=False)
    .reset_index()
)
result_df

## Req 5
TODO

In [9]:
df_temp = df.copy()

cpu_regex = r'(\d+(\.\d+)?)GHz'
storage_regex = r'(\d+)(?=[GB|TB])'

df_encoded = pd.get_dummies(df_temp, columns=['OS', 'CPU', 'GPU'], prefix=['OS', 'CPU', 'GPU'])
df_encoded['Storage_Size'] = df_temp['Storage'].str.extract(storage_regex).astype(float)
selected_columns = df_encoded.filter(regex='^CPU_|^GPU_|^OS_|^storage_', axis=1)
selected_columns = selected_columns.loc[:, selected_columns.dtypes == bool]
selected_columns = selected_columns.astype(int)
target = df_temp["Price"] / df_temp["Dollar_Price"]
X_train, X_test, y_train, y_test = train_test_split(selected_columns, target, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
score = model.score(X_test, y_test)
print("Model R-squared score:", score)

Model R-squared score: 0.6292543668661598


## Req 6

In [28]:
# miladi :
df_temp = df.copy()
df_temp['Month'] = df_temp['Order_Date'].dt.month
monthly_sales = df_temp.groupby('Month')['Total_Price'].sum()
statistic, p_value = f_oneway(*[group['Total_Price'] for name, group in df_temp.groupby('Month')])
alpha = 0.05
yearly_budget = 1
allocation = [1.0] * 12
if p_value < alpha:
    print("There is a statistically significant difference in sales among different months.")
    tukey_results = pairwise_tukeyhsd(df_temp['Total_Price'], df_temp['Month'])
    if tukey_results.reject.any():
        significant_pairs = tukey_results.summary().data[1:]
        print("Significant pairwise comparisons:")
        for pair in significant_pairs:
            month1, month2 = int(pair[0]), int(pair[1])
            mean_diff = pair[3]
            allocation[month1 - 1] += mean_diff / 2
            allocation[month2 - 1] -= mean_diff / 2
    allocation = [budget * yearly_budget / sum(allocation) for budget in allocation]
    month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
    print("Monthly Budget Allocation:")
    for month, budget in zip(month_names, allocation):
        print(f"{month}: {budget:.2f}")
else:
    print("There is no statistically significant difference in sales among different months.")




There is a statistically significant difference in sales among different months.
Significant pairwise comparisons:
Monthly Budget Allocation:
January: 0.08
February: 0.12
March: 0.08
April: 0.16
May: 0.08
June: 0.00
July: 0.12
August: 0.08
September: 0.08
October: 0.04
November: 0.04
December: 0.08


In [33]:
# shamsi :
# not complete
df_temp = df.copy()
df_temp['Date_Shamsi'].astype(str)
start_pos = 5
end_pos = 7
pattern = r"-" * (start_pos - 1) + r"(\d+)" + r"-" * (end_pos - start_pos - 1)
df_temp['Month'] = re.search(pattern, df_temp['Date_Shamsi'])
monthly_sales = df_temp.groupby('Month')['Total_Price'].sum()
statistic, p_value = f_oneway(*[group['Total_Price'] for name, group in df_temp.groupby('Month')])
alpha = 0.05
yearly_budget = 1
allocation = [1.0] * 12
if p_value < alpha:
    print("There is a statistically significant difference in sales among different months.")
    tukey_results = pairwise_tukeyhsd(df_temp['Total_Price'], df_temp['Month'])
    if tukey_results.reject.any():
        significant_pairs = tukey_results.summary().data[1:]
        print("Significant pairwise comparisons:")
        for pair in significant_pairs:
            month1, month2 = int(pair[0]), int(pair[1])
            mean_diff = pair[3]
            allocation[month1 - 1] += mean_diff / 2
            allocation[month2 - 1] -= mean_diff / 2
    allocation = [budget * yearly_budget / sum(allocation) for budget in allocation]
    month_names = ['فروردین', 'اردیبهشت', 'خرداد', 'تیر', 'مرداد', 'شهریور',
               'مهر', 'آبان', 'آذر', 'دی', 'بهمن', 'اسفند']
    print("Monthly Budget Allocation:")
    for month, budget in zip(month_names, allocation):
        print(f"{month}: {budget:.2f}")
else:
    print("There is no statistically significant difference in sales among different months.")

TypeError: expected string or bytes-like object, got 'Series'

## DataBase

In [None]:
# Enter the following values to connect to the database
user = "root"
password = "khb!1mes2@K-pAsS3#zorie$"
host = "localhost"
port = 3306
database = "project2"

engine = create_engine(
    url="mysql+pymysql://{0}:%s@{1}/{2}".format(user, host, database)
    % quote_plus(password)
)

In [None]:
query = """
SELECT
    M.Name AS Manufacturer,
    P.Name,
    C.Name AS Category,
    R.Size AS RAM,
    S.Model AS Storage,
    Sp.Weight,
    CPU_M.Name AS CPU_Manufacturer,
    CPU.Model AS CPU_Model,
    CPU.Frequency AS CPU_Freq,
    GPU_M.Name AS GPU_Manufacturer,
    GPU.Model AS GPU_Model,
    OS.Name AS OS_Name,
    OS.Version AS OS_Version,
    Sc.Size AS Screen_Size,
    Sc.Resolution AS Screen_Resolution,
    Sc.Type AS Screen_Type,
#     Prc.Price,
#     Prc.Dollar_Price,
#     Prc.Price / Prc.Dollar_Price AS Real_Price,
    AVG(Prc.Price / Prc.Dollar_Price) OVER (PARTITION BY P.ID) AS Average_Real_Price
FROM
    Prices Prc
JOIN
    Products P ON Prc.Product_ID = P.ID
LEFT JOIN
    Manufacturers M ON P.Manufacturer_ID = M.ID
LEFT JOIN
    Categories C ON P.Category_ID = C.ID
LEFT JOIN
    Specs Sp ON P.Spec_ID = Sp.ID
LEFT JOIN
    RAMs R ON Sp.RAM_ID = R.ID
LEFT JOIN
    Storages S ON Sp.Storage_ID = S.ID
LEFT JOIN
    CPUs CPU ON Sp.CPU_ID = CPU.ID
LEFT JOIN
    Manufacturers CPU_M ON CPU.Manufacturer_ID = CPU_M.ID
LEFT JOIN
    GPUs GPU ON Sp.GPU_ID = GPU.ID
LEFT JOIN
    Manufacturers GPU_M ON GPU.Manufacturer_ID = GPU_M.ID
LEFT JOIN
    OSs OS ON Sp.OS_ID = OS.ID
LEFT JOIN
    Screens Sc ON Sp.Screen_ID = Sc.ID
group by P.ID;
"""

df = pd.read_sql_query(query, engine)
df

In [None]:
query = """
SELECT
    P.Discount,
    P.Quantity,
    O.Date AS Order_Date,
    P.Profit,
    P.Dollar_Price,
    M.Name AS Manufacturer,
    C.Name AS Category
FROM
    Orders O
JOIN
    Prices P ON O.Price_ID = P.ID
JOIN
    Products Pr ON P.Product_ID = Pr.ID
JOIN
    Manufacturers M ON Pr.Manufacturer_ID = M.ID
JOIN
    Categories C ON Pr.Category_ID = C.ID;
"""

df = pd.read_sql_query(query, engine)
df