In [1]:
import os
import re
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import statsmodels.api as sm
from jdatetime import datetime
from scipy.stats import ttest_ind
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sqlalchemy import create_engine
# Input CSV paths, relative to the notebook's working directory.
# file_dir  -> sales records, loaded into `df`
# file_dir2 -> dollar price table, joined onto the sales records by date
# file_dir3 -> city table, joined onto the per-branch summary by city name
file_dir = "sales_data.csv"
file_dir2 = "dollar_price.csv"
file_dir3 = "country-cities-data.csv"
# Do not truncate long cell values when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [2]:
# li = df["Order_Date"].str.split("-").str[0]
# li.unique()

# test = df[df["Model_Name"] == "Vostro 3568"]
# test = test[
#     [
#         # "Manufacturer",
#         "Model_Name",
#         # "Category",
#         "Screen_Size",
#         "Screen",
#         "CPU",
#         "GPU",
#         # "RAM",
#         "Storage",
#         # "OS",
#         # "OS_Version",
#         "Weight",
#     ]
# ]
# test = test.drop_duplicates()

# df_products[df_products["Model_Name"] == "Vostro 3568"]

In [4]:
def _jalali_to_gregorian(jalali_text):
    """Parse a 'YYYY-MM-DD' jdatetime string and return its Gregorian equivalent."""
    return datetime.strptime(jalali_text, "%Y-%m-%d").togregorian()


# Load the sales records, strip the unit suffixes so the spec columns become
# numeric, keep the original date string in `Date_Shamsi`, and replace
# `Order_Date` with its Gregorian conversion.
df = pd.read_csv(file_dir, low_memory=False)
df = df.assign(
    Screen_Size=df["Screen_Size"].str.replace('"', "").astype(float),
    RAM=df["RAM"].str.replace("GB", "").astype(int),
    Weight=df["Weight"].replace(["kgs", "kg"], "", regex=True).astype(float),
    Date_Shamsi=df["Order_Date"].copy(),
    Order_Date=df["Order_Date"].apply(_jalali_to_gregorian),
)

# Dollar price table; `miladi` is parsed to datetimes so it can be used as the join key.
df_dollar_price = pd.read_csv(file_dir2, index_col=[0]).assign(
    miladi=lambda prices: pd.to_datetime(prices["miladi"])
)

# Left join keeps every sales row; the join-only date columns are dropped
# afterwards and `close_price` is exposed as `Dollar_Price`.
df = (
    df.merge(
        df_dollar_price,
        how="left",
        left_on="Order_Date",
        right_on="miladi",
    )
    .drop(columns=["miladi", "shamsi"])
    .rename(columns={"close_price": "Dollar_Price"})
)

FileNotFoundError: [Errno 2] No such file or directory: 'sales_data.csv'

In [None]:
# Per-branch KPI table.
# Monetary columns are divided by the order day's `Dollar_Price`
# (presumably converting local-currency amounts to dollars - confirm units).
df_temp = df.copy()
df_temp["Total_Price"] = df_temp["Total_Price"] / df_temp["Dollar_Price"]
df_temp["Profit"] = df_temp["Profit"] / df_temp["Dollar_Price"]

by_branch = df_temp.groupby("Branch")

sales_by_city = by_branch["Total_Price"].sum()

profit_by_city = by_branch["Profit"].mean()

# Ratio of total discount to total quantity sold, per branch.
# Plain grouped sums give the same numbers as a per-group `apply`, without
# running a Python lambda for every group.
discount_to_sales_ratio = by_branch["Discount"].sum() / by_branch["Quantity"].sum()

# Ratio of total price to total profit, per branch.
price_to_profit_ratio = by_branch["Total_Price"].sum() / by_branch["Profit"].sum()

# Share of high-priority ("H") orders, per branch.
# Counting matches with a boolean sum keeps every branch in the numerator, so a
# branch with no "H" orders gets 0 instead of NaN. The denominator still counts
# only non-null priorities.
high_priority_orders = (
    df_temp["Order_Priority"].eq("H").groupby(df_temp["Branch"]).sum()
)
priority_orders_ratio = high_priority_orders / by_branch["Order_Priority"].count()

city_metrics = pd.DataFrame(
    {
        "Sales": sales_by_city,
        "Discount_to_Sales_Ratio": discount_to_sales_ratio,
        "Price_to_Profit_Ratio": price_to_profit_ratio,
        "Priority_Orders_Ratio": priority_orders_ratio,
        "Profitability": profit_by_city,
    }
)

# Most profitable branches (by mean profit per order row) first.
city_metrics = city_metrics.sort_values(by=["Profitability"], ascending=False)
city_metrics

In [None]:
def _min_max_scale(values):
    """Linearly rescale a numeric Series onto [0, 1].

    A constant Series maps to 0 everywhere; NaN entries stay NaN.
    """
    values = values.astype(float)
    low, high = values.min(), values.max()
    if not high > low:
        return values - low
    return (values - low) / (high - low)


df_pupulation = pd.read_csv(file_dir3)
# NOTE(review): positional patch - presumably row 15 holds the city whose
# spelling differs from the `Branch` value; confirm against the CSV, since a
# reordered file would silently rename the wrong city.
df_pupulation.loc[15, "city"] = "Hamedan"

df_temp = df.copy()

df_temp["Total_Price"] = df_temp["Total_Price"] / df_temp["Dollar_Price"]
df_temp["Profit"] = df_temp["Profit"] / df_temp["Dollar_Price"]

# One row per branch, built in a single grouped aggregation instead of
# appending rows to an empty frame (which left object-dtype columns).
# `sort=False` keeps branches in first-appearance order.
df_branchs = (
    df_temp.groupby("Branch", sort=False)
    .agg(
        Sum_Sell=("Quantity", "sum"),
        Total_Price=("Total_Price", "sum"),
        Profit=("Profit", "sum"),
    )
    .round({"Total_Price": 2, "Profit": 2})
    .reset_index()
)

# Profit margin: profit per unit of revenue.
df_branchs["Profit_key"] = df_branchs["Profit"] / df_branchs["Total_Price"]

# Attach city data; branches without a matching city get NaN in `pop2023`.
df_branchs = pd.merge(
    df_branchs,
    df_pupulation,
    how="left",
    left_on="Branch",
    right_on="city",
)
df_branchs = df_branchs.drop(columns=["city", "country", "latitude", "longitude"])

weight_profit = 0.4
weight_margin = 0.3
weight_sales = 0.2
weight_population = 0.1

# Each component is rescaled to [0, 1] before weighting. The raw columns are on
# very different scales (a margin ratio versus a population count), so an
# unscaled sum is dominated by the largest-magnitude column and the weights
# stop expressing relative importance.
df_branchs["Weighted_Score"] = (
    _min_max_scale(df_branchs["Profit"]) * weight_profit
    + _min_max_scale(df_branchs["Profit_key"]) * weight_margin
    + _min_max_scale(df_branchs["Sum_Sell"]) * weight_sales
    + _min_max_scale(df_branchs["pop2023"]) * weight_population
)

df_branchs = df_branchs.sort_values(by=["Weighted_Score"], ascending=False)
df_branchs

In [None]:
# Welch t-test: does `Total_Price` (divided by the day's dollar price) differ
# between order rows with a discount and rows without one?
df_temp = df.copy()
df_temp["Total_Price"] = df_temp["Total_Price"] / df_temp["Dollar_Price"]

# Rows without a dollar price (no match in the left join) yield NaN here. They
# are dropped because a single NaN makes `ttest_ind` return a NaN p-value,
# which would silently fall through to the "no effect" branch below.
df_with_discount = df_temp.loc[df_temp["Discount"] > 0, "Total_Price"].dropna()
df_without_discount = df_temp.loc[df_temp["Discount"] == 0, "Total_Price"].dropna()

# equal_var=False selects Welch's test (no equal-variance assumption).
t_stat, p_value = ttest_ind(df_with_discount, df_without_discount, equal_var=False)

alpha = 0.05
if p_value < alpha:
    print("تخفیف بر میزان فروش تأثیر دارد")
else:
    print("تخفیف بر میزان فروش تأثیر ندارد")

In [None]:
# Yearly summary over discounted order rows only: mean of (Discount * Quantity)
# and total quantity, largest quantity first.
# (A pd.Grouper on "Order_Date" with yearly frequency was considered as the grouper.)
has_discount = df["Discount"].ne(0)
df_temp = df.loc[has_discount].copy()
df_temp["Discount"] = df_temp["Discount"].mul(df_temp["Quantity"])

order_year = df_temp["Order_Date"].dt.year
result = (
    df_temp.groupby(order_year)
    .agg(Discount=("Discount", "mean"), Quantity=("Quantity", "sum"))
    .reset_index()
    .sort_values(by=["Quantity"], ascending=False)
)
result

In [None]:
# Simple linear regression of profit (divided by the day's dollar price) on
# discount, used to test whether the discount coefficient differs from zero.
df_temp = df.copy()
df_temp["Profit"] = df_temp["Profit"] / df_temp["Dollar_Price"]

X = sm.add_constant(df_temp["Discount"])
y = df_temp["Profit"]
# missing="drop" discards rows with NaN in y or X (e.g. orders whose date had
# no dollar price); with the default, any NaN makes the fit fail.
model = sm.OLS(y, X, missing="drop").fit()
# print(model.summary())

print("Degrees of Freedom: ", model.df_resid)
print("t_stat: ", model.tvalues["Discount"])
print("p-value: ", model.pvalues["Discount"])

alpha = 0.05
if model.pvalues["Discount"] < alpha:
    print("تخفیف بر میزان سود تأثیر دارد")
else:
    print("تخفیف بر میزان سود تأثیر ندارد")

# Yearly mean of (Discount * Quantity), computed over discounted rows only.
discounted_orders = df.loc[df["Discount"].ne(0)]
discount_amount = discounted_orders["Discount"].mul(discounted_orders["Quantity"])
result = (
    discount_amount.groupby(discounted_orders["Order_Date"].dt.year)
    .mean()
    .to_frame("Discount")
    .reset_index()
)

# Yearly total profit over all rows, each row divided by the day's dollar
# price and rounded to 2 decimals before summing.
profit_per_dollar = df["Profit"].div(df["Dollar_Price"]).round(2)
result2 = (
    profit_per_dollar.groupby(df["Order_Date"].dt.year)
    .sum()
    .to_frame("Profit")
    .reset_index()
)

# One row per year that has discounted orders, most profitable year first.
df_temp = result.merge(
    result2,
    how="left",
    left_on="Order_Date",
    right_on="Order_Date",
).sort_values(by=["Profit"], ascending=False)
df_temp

# Manufacturer x Category table holding the largest single-row profit
# (divided by the day's dollar price) for each combination, plus row/column totals.
df_temp = df.copy()
df_temp["Profit"] = df_temp["Profit"] / df_temp["Dollar_Price"]
# `Price` is converted as well, although only `Profit` feeds the table below.
df_temp["Price"] = df_temp["Price"] / df_temp["Dollar_Price"]

brands = df_temp["Manufacturer"].unique()
categories = df_temp["Category"].unique()

# A single grouped max replaces the nested brand/category loops, which
# re-filtered the whole frame for every pair. Differences from the loop version:
#   * the table is numeric rather than object dtype;
#   * the max ignores NaN profits instead of turning the whole cell into NaN.
# Combinations with no rows are 0, as before.
# A price-based LinearRegression predicting profit was sketched here earlier
# but was never enabled; the allocation uses the observed maximum instead.
capital_allocation = (
    df_temp.groupby(["Manufacturer", "Category"])["Profit"]
    .max()
    .unstack("Category")
    .reindex(index=brands, columns=categories)
    .fillna(0)
    .rename_axis(index=None, columns=None)
)

# Row totals first; the column totals then also include the row-total column.
capital_allocation["sum_profits_per_manufacturer"] = capital_allocation.sum(axis=1)
capital_allocation.loc["sum_profits_per_category"] = capital_allocation.sum(axis=0)
capital_allocation

# Total profit per manufacturer: each row's profit is divided by the day's
# dollar price and rounded to 2 decimals before summing; largest total first.
profit_per_dollar = (df["Profit"] / df["Dollar_Price"]).round(2)
result_df = (
    profit_per_dollar.groupby(df["Manufacturer"])
    .sum()
    .to_frame("Profit_Sum")
    .sort_values(by=["Profit_Sum"], ascending=False)
    .reset_index()
)
result_df

In [None]:
# Total profit per category: each row's profit is divided by the day's dollar
# price and rounded to 2 decimals before summing; largest total first.
profit_per_dollar = (df["Profit"] / df["Dollar_Price"]).round(2)
result_df = (
    profit_per_dollar.groupby(df["Category"])
    .sum()
    .to_frame("Profit_Sum")
    .sort_values(by=["Profit_Sum"], ascending=False)
    .reset_index()
)
result_df

In [None]:


# Linear regression of price (divided by the day's dollar price) on one-hot
# encoded OS / CPU / GPU plus storage capacity.
df_temp = df.copy()

cpu_regex = r'(\d+(\.\d+)?)GHz'
# First "<number>GB" or "<number>TB" in the storage description. The earlier
# lookahead used a character class ("[GB|TB]"), which matches any single one of
# the characters G, B, |, T rather than the two unit strings.
storage_regex = r'(\d+(?:\.\d+)?)\s*(GB|TB)'

encoded_columns = ['OS', 'CPU', 'GPU']
# dtype=int gives 0/1 columns on every pandas version (the default dummy dtype
# is uint8 before pandas 2.0 and bool from 2.0 on).
df_encoded = pd.get_dummies(df_temp, columns=encoded_columns, prefix=encoded_columns, dtype=int)
# The dummy columns are exactly the ones get_dummies added. Selecting them this
# way avoids prefix matching, which also caught the raw `OS_Version` column.
dummy_columns = [col for col in df_encoded.columns if col not in df_temp.columns]

# Storage capacity in GB (1 TB counted as 1000 GB); rows whose description has
# no recognisable capacity get the median.
storage_parts = df_temp['Storage'].str.extract(storage_regex)
storage_size = storage_parts[0].astype(float)
storage_size = storage_size.where(storage_parts[1] != 'TB', storage_size * 1000)
df_encoded['Storage_Size'] = storage_size.fillna(storage_size.median())

# Storage_Size is now actually part of the feature matrix: the previous
# '^storage_' filter was case-mismatched and the bool-dtype filter removed any
# float column anyway.
selected_columns = df_encoded[dummy_columns + ['Storage_Size']]
target = df_temp["Price"] / df_temp["Dollar_Price"]

# LinearRegression rejects NaN targets; drop rows without a usable price.
has_target = target.notna()
selected_columns = selected_columns.loc[has_target]
target = target.loc[has_target]

X_train, X_test, y_train, y_test = train_test_split(selected_columns, target, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
score = model.score(X_test, y_test)
print("Model R-squared score:", score)



In [None]:
# Connection settings for the MySQL database.
# The password is no longer stored in the notebook: it is read from the
# PROJECT2_DB_PASSWORD environment variable, falling back to an interactive
# prompt. The literal that used to be committed here should be treated as
# leaked and rotated.
user = "root"
password = os.environ.get("PROJECT2_DB_PASSWORD") or getpass("MySQL password: ")
host = "localhost"
port = 3306
database = "project2"

# User and password are URL-escaped so special characters cannot break the
# URL; `port` is now part of the URL (it was defined but never used before).
engine = create_engine(
    url="mysql+pymysql://{user}:{password}@{host}:{port}/{database}".format(
        user=quote_plus(user),
        password=quote_plus(password),
        host=host,
        port=port,
        database=database,
    )
)

In [None]:
# Product catalogue with the average real price (Price / Dollar_Price) of each
# product over all of its price rows.
# The average is computed in a derived table grouped by Product_ID. The previous
# form combined GROUP BY P.ID with AVG(...) OVER (PARTITION BY P.ID): the window
# then runs over the single grouped row and reads a non-aggregated Prices
# column, so it does not average the price rows and is rejected under
# ONLY_FULL_GROUP_BY.
# NOTE(review): assumes each lookup table's ID is unique, so the joins return
# one row per product - confirm against the schema.
query = """
SELECT
    M.Name AS Manufacturer,
    P.Name,
    C.Name AS Category,
    R.Size AS RAM,
    S.Model AS Storage,
    Sp.Weight,
    CPU_M.Name AS CPU_Manufacturer,
    CPU.Model AS CPU_Model,
    CPU.Frequency AS CPU_Freq,
    GPU_M.Name AS GPU_Manufacturer,
    GPU.Model AS GPU_Model,
    OS.Name AS OS_Name,
    OS.Version AS OS_Version,
    Sc.Size AS Screen_Size,
    Sc.Resolution AS Screen_Resolution,
    Sc.Type AS Screen_Type,
    Prc.Average_Real_Price
FROM
    Products P
JOIN
    (
        SELECT
            Product_ID,
            AVG(Price / Dollar_Price) AS Average_Real_Price
        FROM
            Prices
        GROUP BY
            Product_ID
    ) Prc ON Prc.Product_ID = P.ID
LEFT JOIN
    Manufacturers M ON P.Manufacturer_ID = M.ID
LEFT JOIN
    Categories C ON P.Category_ID = C.ID
LEFT JOIN
    Specs Sp ON P.Spec_ID = Sp.ID
LEFT JOIN
    RAMs R ON Sp.RAM_ID = R.ID
LEFT JOIN
    Storages S ON Sp.Storage_ID = S.ID
LEFT JOIN
    CPUs CPU ON Sp.CPU_ID = CPU.ID
LEFT JOIN
    Manufacturers CPU_M ON CPU.Manufacturer_ID = CPU_M.ID
LEFT JOIN
    GPUs GPU ON Sp.GPU_ID = GPU.ID
LEFT JOIN
    Manufacturers GPU_M ON GPU.Manufacturer_ID = GPU_M.ID
LEFT JOIN
    OSs OS ON Sp.OS_ID = OS.ID
LEFT JOIN
    Screens Sc ON Sp.Screen_ID = Sc.ID;
"""

# Rebinds `df` to the query result.
df = pd.read_sql_query(query, engine)
df

# Order-level query: for every row in Orders, pull discount, quantity, profit
# and dollar price from the linked Prices row, together with the product's
# manufacturer and category names. Inner joins are used throughout, so orders
# lacking any of the linked rows are not returned.
query = """
SELECT
    P.Discount,
    P.Quantity,
    O.Date AS Order_Date,
    P.Profit,
    P.Dollar_Price,
    M.Name AS Manufacturer,
    C.Name AS Category
FROM
    Orders O
JOIN
    Prices P ON O.Price_ID = P.ID
JOIN
    Products Pr ON P.Product_ID = Pr.ID
JOIN
    Manufacturers M ON Pr.Manufacturer_ID = M.ID
JOIN
    Categories C ON Pr.Category_ID = C.ID;
"""

# Rebinds `df` to the query result, replacing whatever `df` held before.
df = pd.read_sql_query(query, engine)
df