In [None]:
import pandas as pd
import sqlite3
import importlib
import numpy as np
from sklearn.linear_model import LinearRegression

# keeping company information in additional file
import data_file
importlib.reload(data_file)

In [372]:
# Business-unit values kept in the analysis; maintained in data_file and used
# further down to filter the 'bu' column with isin().
key_bu = data_file.key_bu

In [104]:
# Read both order sheets of the source workbook in one pass; passing a list of
# sheet names makes read_excel return a dict keyed by sheet name.
source_file = 'data_files/source_data.xlsx'
order_sheets = pd.read_excel(source_file, sheet_name=['1719', '2023'])
df_1 = order_sheets['1719']
df_2 = order_sheets['2023']

In [107]:
# Stack the two sheets into one frame. Each sheet keeps its own row labels
# (ignore_index is not set), so index values can repeat in the result.
df = pd.concat([df_1, df_2])

In [130]:
# Persist the combined orders as table 'codes' in the SQLite file.
conn = sqlite3.connect('data_files/ms_orders_data.db')
try:
    # to_sql uses if_exists='fail' by default, so re-running this cell raises
    # ValueError once the table exists. That protects the stored data, and the
    # finally clause makes sure the connection is not left open when it happens.
    df.to_sql('codes', conn, index=False)
finally:
    conn.close()

In [132]:
# Load the whole 'codes' table back from SQLite into df.
conn = sqlite3.connect('data_files/ms_orders_data.db')
try:
    query = "SELECT * FROM codes"
    df = pd.read_sql_query(query, conn)
finally:
    # Close the handle even if the query fails (e.g. the table is missing).
    conn.close()

In [134]:
# Force the customer-name column to text so the .str accessor can be used on it
# later. astype(str) stringifies missing values as well.
df['sold_to_customer_n_latest'] = df['sold_to_customer_n_latest'].astype(str)

In [337]:
# Load the whole 'customers' table from the customer database into df_customers.
conn = sqlite3.connect('data_files/customer_data.db')
try:
    query = "SELECT * FROM customers"
    df_customers = pd.read_sql_query(query, conn)
finally:
    # Close the handle even if the query fails (e.g. the table is missing).
    conn.close()

In [452]:
# Sanity check: number of rows in df at this point.
len(df)

1754559

In [136]:
# Keep an independent snapshot of df so it can be restored without reloading.
reserve = df.copy()

In [451]:
# Restore the working frame from the snapshot; the cells below then start from
# the same data every time they are re-run.
df = reserve.copy()

In [453]:
# Company names (maintained in data_file) whose order lines are removed.
my_company_name = data_file.my_company_name
exclusion_company_one = data_file.exclusion_company_one
exclusion_company_two = data_file.exclusion_company_two
exclusion_company_three = data_file.exclusion_company_three

# Column preparation: 'bu' as text, plus an upper-cased copy of the customer
# name that the exclusion names are matched against.
df['bu'] = df['bu'].astype(str)
df['company_upper'] = df['sold_to_customer_n_latest'].str.upper()

# A row survives only if its upper-cased name contains none of the four names.
# str.contains is called with its defaults, so each name is treated as a regex.
company_upper = df['company_upper']
rows_to_keep = ~company_upper.str.contains(my_company_name)
for excluded_name in (exclusion_company_one, exclusion_company_two, exclusion_company_three):
    rows_to_keep &= ~company_upper.str.contains(excluded_name)
df = df[rows_to_keep]

In [454]:
# The merge key must have the same dtype on both sides, so cast it to text in
# the orders frame and in the customer master data alike.
for frame in (df, df_customers):
    frame['sold_to_customer'] = frame['sold_to_customer'].astype(str)

In [455]:
# Keep an order line when quantity or euro value is non-zero, i.e. drop only
# the lines where both columns are exactly 0.
has_quantity = df['order_intake_quantity'].ne(0)
has_value = df['order_intake_euro'].ne(0)
df_light = df[has_quantity | has_value]

In [456]:
# Customer attributes that get attached to every order line.
customer_columns = ['sold_to_customer', 'customer_name', 'indirect_direct', 'channel', 'type', 'tier_new', 'tier']
df_cusmoters_for_merging = df_customers[customer_columns]

# Left join: every order line is kept; lines without a matching customer get
# missing values in the customer columns.
df_merged = pd.merge(df_light, df_cusmoters_for_merging, how='left', on='sold_to_customer')

In [457]:
# Order columns followed by the customer attributes; everything else is dropped.
order_columns = [
    'year_month', 'company_code_n', 'sales_order_so', 'sold_to_customer',
    'sold_to_customer_n_latest', 'bu', 'bu_n', 'material', 'material_n',
    'ms_code', 'order_intake_quantity', 'order_intake_euro',
]
customer_attribute_columns = ['customer_name', 'indirect_direct', 'channel', 'type', 'tier_new', 'tier']
df_merged = df_merged[order_columns + customer_attribute_columns]

In [458]:
# Keep only rows whose 'bu' value is listed in key_bu.
df_merged = df_merged[df_merged['bu'].isin(key_bu)]

In [459]:
# Restrict to rows whose customer 'tier' is exactly 'Channel Partner'.
df_channel = df_merged[df_merged['tier'] == 'Channel Partner']

In [462]:
# Work on a copy of the channel-partner rows (df_merged would be the
# all-tiers alternative).
temp_df = df_channel.copy()

# 'year_month' is parsed as YYYYMM. The fiscal year starts in April, so
# January-March belong to the fiscal year that began the previous calendar year.
temp_df['year_month_date'] = pd.to_datetime(temp_df['year_month'], format='%Y%m')
calendar_month = temp_df['year_month_date'].dt.month
calendar_year = temp_df['year_month_date'].dt.year
temp_df['FY'] = np.where(calendar_month >= 4, calendar_year, calendar_year - 1)

# Fiscal quarter per calendar month: Apr-Jun -> Q1, Jul-Sep -> Q2,
# Oct-Dec -> Q3, Jan-Mar -> Q4.
quarter_dict = {month: f'Q{((month - 4) % 12) // 3 + 1}' for month in range(1, 13)}
temp_df['quarter'] = calendar_month.map(quarter_dict)

# Fiscal half-year: April-September is HY1, the remaining months are HY2.
temp_df['half_year'] = np.where(calendar_month.between(4, 9), 'HY1', 'HY2')

In [484]:
# Sanity check: number of rows in temp_df at this point.
len(temp_df)

37619

In [464]:
# Keep only the part of 'material' that precedes the first '_0'.
temp_df['material'] = temp_df['material'].str.split('_0', expand=True)[0]

# Drop every row whose 'ms_code' (as text) contains one of these markers.
temp_df['ms_code'] = temp_df['ms_code'].astype(str)
excluded_code_markers = ('BOP', 'ENT', 'EXP')
for marker in excluded_code_markers:
    temp_df = temp_df[~temp_df['ms_code'].str.contains(marker)]

In [481]:
# Per business unit, select the highest-value materials that together cover
# 95% of the unit's order value.
key_materials = temp_df.copy()

# Share of a business unit's order value the selected materials must cover.
coverage_share = 0.95

# Order value per (bu, material), largest materials first within each bu.
summary = (
    key_materials
    .groupby(['bu', 'material'])['order_intake_euro']
    .sum()
    .reset_index()
    .sort_values(['bu', 'order_intake_euro'], ascending=[True, False])
)

# Running total within each bu, the bu total, and the value to be covered.
summary['cumulative_sum'] = summary.groupby('bu')['order_intake_euro'].cumsum()
summary['total_sum'] = summary.groupby('bu')['order_intake_euro'].transform('sum')
summary['threshold'] = summary['total_sum'] * coverage_share

# Keep a material while the value accumulated BEFORE it is still below the
# threshold. This includes the material that crosses the threshold; comparing
# the running total itself (cumulative_sum <= threshold) would drop it, which
# leaves a bu with no materials at all when one material exceeds the share.
value_before_material = summary['cumulative_sum'] - summary['order_intake_euro']
summary_filtered = summary[value_before_material < summary['threshold']]

# index=True also writes the row index as the first column.
summary_filtered.to_excel('data_files/summary_filtered.xlsx', index=True)

In [482]:
# Distinct material values present in summary_filtered (across all bu).
more_valuable_materials = list(summary_filtered['material'].unique())

In [483]:
# Keep only the rows whose 'material' is in more_valuable_materials.
temp_df = temp_df[temp_df['material'].isin(more_valuable_materials)]

In [322]:
# Build a "<fiscal year> <fiscal quarter>" label, e.g. "2019 Q4".
# 'quarter' is a fiscal quarter (Q1 starts in April), so the year in the label
# has to be the fiscal year as well. With the calendar year, January-March
# would be labelled "<year> Q4" and sort after April's "<year> Q1".
temp_df['text_year'] = temp_df['FY'].astype(str)
temp_df['year-quarter'] = temp_df['text_year'] + ' ' + temp_df['quarter']

In [485]:
# Reduce temp_df to the fiscal year, the grouping keys and the two measures.
pivot_input_columns = ['FY', 'customer_name', 'bu', 'material', 'ms_code', 'order_intake_quantity', 'order_intake_euro']
temp_df = temp_df[pivot_input_columns]

In [439]:
# Keep only fiscal years before 2023.
# NOTE(review): a later cell selects a '2023_eur' column, which cannot exist
# once this filter has run - confirm which of the two is intended.
temp_df = temp_df[temp_df['FY'] < 2023]

In [486]:
# Sanity check: number of rows in temp_df at this point.
len(temp_df)

37619

In [496]:
# products codes quantities in Year periods
pivot_df_material_qty = temp_df.pivot_table(index=['customer_name', 'bu', 'material'], 
                          columns='FY', 
                          values='order_intake_quantity', 
                          aggfunc='sum',fill_value=0)

pivot_df_material_qty.reset_index(inplace=True)

In [441]:
# products codes quantities in Year periods
pivot_df_material_qty_no_customer = temp_df.pivot_table(index=['bu', 'material'], 
                          columns='FY', 
                          values='order_intake_quantity', 
                          aggfunc='sum',fill_value=0)

pivot_df_material_qty_no_customer.reset_index(inplace=True)

In [493]:
# Both pivots share the same row keys: one row per customer / bu / material / ms_code.
product_code_keys = ['customer_name', 'bu', 'material', 'ms_code']

# Ordered quantity per fiscal year (years without orders stay missing).
pivot_df_qty = temp_df.pivot_table(
    values='order_intake_quantity', index=product_code_keys, columns='FY', aggfunc='sum'
)

# Ordered euro value per fiscal year (years without orders stay missing).
pivot_df_sum = temp_df.pivot_table(
    values='order_intake_euro', index=product_code_keys, columns='FY', aggfunc='sum'
)

# Put both side by side. The fiscal-year labels overlap, so they are suffixed
# and become '<FY>_qty' / '<FY>_eur'. The row keys return as ordinary columns.
result_pivot = pivot_df_qty.join(pivot_df_sum, lsuffix='_qty', rsuffix='_eur').reset_index()

# Clear the name of the columns axis.
result_pivot.columns.name = None

In [488]:
# Turn every '<FY>_eur' column from a euro total into euro per unit
# (euro total / quantity), rounded to 2 decimals.
# The fiscal years are read from the '<FY>_qty' columns of result_pivot itself,
# because no cell in this notebook defines a `years` variable.
# NOTE: the '<FY>_eur' columns are overwritten in place, so running this cell
# twice divides by the quantity twice.
fiscal_years = [col[:-len('_qty')] for col in result_pivot.columns if str(col).endswith('_qty')]

for year in fiscal_years:
    qty_column = f'{year}_qty'
    eur_column = f'{year}_eur'

    unit_price = (result_pivot[eur_column] / result_pivot[qty_column]).round(2)
    # A zero quantity with a non-zero euro total divides to +/-inf; report 0 instead.
    result_pivot[eur_column] = unit_price.replace([np.inf, -np.inf], 0)

In [490]:
# Keep the row keys plus every '<FY>_eur' column that exists, in its current
# (fiscal-year) order. The columns are discovered rather than hard-coded so
# that a fiscal year missing from the data (e.g. 2023 after the FY filter)
# does not raise KeyError.
# NOTE(review): this drops the '<FY>_qty' columns from result_pivot, yet the
# later wide_to_long cell uses 'qty' as a stub name - confirm the intended order.
key_columns = ['customer_name', 'bu', 'material', 'ms_code']
eur_columns = [col for col in result_pivot.columns if str(col).endswith('_eur')]
result_pivot = result_pivot[key_columns + eur_columns]

In [497]:
# Write the per-customer quantity pivot to Excel; index=True also writes the
# row index as the first column.
pivot_df_material_qty.to_excel('data_files/result_pivot_material_qty.xlsx', index=True)

In [333]:
# Quantity trend per customer and material: the slope of a straight line
# fitted to quantity against fiscal year.

# Wide (one column per fiscal year) -> long (one row per key combination and year).
df_long_qty = pivot_df_material_qty.melt(
    id_vars=['customer_name', 'bu', 'material'], var_name='year', value_name='qty'
)
df_long_qty['year'] = df_long_qty['year'].astype(int)

# (customer_name, material) -> fitted slope
results = {}

# NOTE(review): 'bu' is not part of the grouping key, so rows of the same
# customer/material from different business units are fitted together - confirm.
for (customer_name, material), group in df_long_qty.groupby(['customer_name', 'material']):
    # A slope needs at least two observations.
    if len(group) <= 1:
        continue

    # sklearn expects a 2-D feature matrix, hence the reshape of the single feature.
    X = group['year'].values.reshape(-1, 1)
    # fillna returns a new Series, so the groupby slice itself is never written to.
    y = group['qty'].fillna(0)

    model = LinearRegression()
    model.fit(X, y)

    # The slope coefficient is the yearly trend.
    results[(customer_name, material)] = model.coef_[0]

# One row per fitted group. Building the frame from records with explicit
# columns also yields a well-formed (empty) frame when nothing was fitted.
trends = pd.DataFrame(
    [(slope, customer_key, material_key) for (customer_key, material_key), slope in results.items()],
    columns=['trend', 'customer_name', 'material'],
)
trends = trends.sort_values(['customer_name', 'material'])

In [None]:
# Quantity trend per material and business unit, without customer information:
# the slope of a straight line fitted to total quantity against fiscal year.

# Start from the pivot that is already summed over customers. Melting the
# per-customer pivot instead would put one row per customer and year into each
# (material, bu) group, and the fitted slope would then describe the average
# customer rather than the total quantity.
df_long_qty = pivot_df_material_qty_no_customer.melt(
    id_vars=['bu', 'material'], var_name='year', value_name='qty'
)
df_long_qty['year'] = df_long_qty['year'].astype(int)

# (material, bu) -> fitted slope
results = {}

for (material, bu), group in df_long_qty.groupby(['material', 'bu']):
    # A slope needs at least two observations.
    if len(group) <= 1:
        continue

    # sklearn expects a 2-D feature matrix, hence the reshape of the single feature.
    X = group['year'].values.reshape(-1, 1)
    # fillna returns a new Series, so the groupby slice itself is never written to.
    y = group['qty'].fillna(0)

    model = LinearRegression()
    model.fit(X, y)

    # The slope coefficient is the yearly trend.
    results[(material, bu)] = model.coef_[0]

# One row per fitted group. Building the frame from records with explicit
# columns also yields a well-formed (empty) frame when nothing was fitted.
trends = pd.DataFrame(
    [(slope, material_key, bu_key) for (material_key, bu_key), slope in results.items()],
    columns=['trend', 'material', 'bu'],
)
trends = trends.sort_values(['material'])

In [None]:
# Linear forecast of yearly quantity per business unit and material.

# Wide (one column per fiscal year) -> long (one row per bu / material / year).
df_long_qty = pivot_df_material_qty_no_customer.melt(
    id_vars=['bu', 'material'], var_name='year', value_name='qty'
)
df_long_qty['year'] = df_long_qty['year'].astype(int)

# (bu, material) -> fitted LinearRegression model
results = {}

for (bu, material), group in df_long_qty.groupby(['bu', 'material']):
    # A line needs at least two observations.
    if len(group) <= 1:
        continue

    # sklearn expects a 2-D feature matrix, hence the reshape of the single feature.
    X = group['year'].values.reshape(-1, 1)
    # fillna returns a new Series, so the groupby slice itself is never written to.
    y = group['qty'].fillna(0)

    model = LinearRegression()
    model.fit(X, y)
    results[(bu, material)] = model

# Years to predict for every fitted group.
forecast_years = [2023, 2024]
forecast_columns = ['bu', 'material', 'year', 'qty']

# Collect one small frame per group and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
forecast_parts = []
for (bu, material), model in results.items():
    forecast_year = pd.DataFrame({'year': forecast_years})
    forecast_year['bu'] = bu
    forecast_year['material'] = material
    forecast_year['qty'] = model.predict(forecast_year['year'].values.reshape(-1, 1))
    forecast_parts.append(forecast_year)

if forecast_parts:
    forecast = pd.concat(forecast_parts, ignore_index=True)[forecast_columns]
else:
    # pd.concat raises on an empty list; keep the expected columns instead.
    forecast = pd.DataFrame(columns=forecast_columns)

# Order by bu and material and renumber the rows.
forecast = forecast.sort_values(['bu', 'material']).reset_index(drop=True)

print(forecast)


In [446]:
# Write the forecast to Excel; index=True also writes the row index as the
# first column.
forecast.to_excel('data_files/forecast_99.xlsx', index=True)

In [270]:
import statsmodels.api as sm

In [272]:
# Show the column labels of result_pivot.
result_pivot.columns

Index(['customer_name', 'bu', 'material', 'ms_code', '2017_qty', '2018_qty',
       '2019_qty', '2020_qty', '2021_qty', '2022_qty', '2023_qty', '2017_eur',
       '2018_eur', '2019_eur', '2020_eur', '2021_eur', '2022_eur', '2023_eur'],
      dtype='object')

In [291]:
# wide_to_long expects the stub name first ('qty_2017'), so flip the
# '<FY>_qty' / '<FY>_eur' labels around.
# Only labels that END in '_qty' / '_eur' are touched: labels that were already
# flipped (or that merely contain 'qty' / 'eur') are left alone, so re-running
# this cell does not mangle the column names.
result_pivot = result_pivot.rename(
    columns=lambda x: f'{x[-3:]}_{x[:-4]}' if isinstance(x, str) and x.endswith(('_qty', '_eur')) else x
)

# Reshape from wide to long: one row per (customer_name, bu, ms_code, year)
# with the columns 'qty' and 'eur'.
# NOTE(review): 'material' is not part of `i`; wide_to_long needs the id
# columns to identify each row uniquely - confirm that this holds.
df_long = pd.wide_to_long(result_pivot, stubnames=['qty', 'eur'], i=['customer_name', 'bu', 'ms_code'], j='year', sep='_')

In [None]:
# Per customer / bu / ms_code: ordinary least squares of quantity on the 'eur'
# column, with an intercept.
# (customer_name, bu, ms_code) -> fitted statsmodels results object
results = {}

for (customer_name, bu, ms_code), group in df_long.groupby(['customer_name', 'bu', 'ms_code']):
    # Discard observations that cannot enter the fit: +/-inf becomes missing,
    # then rows with a missing 'eur' or 'qty' are dropped.
    group = group.replace([np.inf, -np.inf], np.nan).dropna(subset=['eur', 'qty'])

    # Check the size AFTER cleaning: a line needs at least two usable rows.
    # (This also covers the case where nothing is left.)
    if len(group) <= 1:
        continue

    # Add the intercept term to the predictors.
    X = sm.add_constant(group['eur'])

    # add_constant adds no 'const' column when 'eur' is itself constant;
    # such a group cannot be fitted meaningfully, so report and skip it.
    if 'const' not in X.columns:
        print(f"Constant not added properly for group: {customer_name}, {bu}, {ms_code}")
        continue

    y = group['qty']
    model = sm.OLS(y, X)
    try:
        result = model.fit()
    except ValueError:
        # Skip groups for which the regression cannot be performed.
        continue

    results[(customer_name, bu, ms_code)] = result

In [None]:
# Flatten the fitted regressions into one table row per group and export it.
results_list = []

for key, result in results.items():
    params = result.params

    # Explicit lookup with a default instead of a bare `except:`; a model
    # without an intercept term reports 0, every other error stays visible.
    intercept = params.get('const', 0)
    eur_coef = params['eur']
    r_squared = result.rsquared
    p_value = result.pvalues['eur']

    # Lower and upper confidence bound of the 'eur' coefficient.
    conf_int_low, conf_int_high = result.conf_int().loc['eur']

    # key is (customer_name, bu, ms_code).
    results_list.append([key[0], key[1], key[2], intercept, eur_coef, r_squared, p_value, conf_int_low, conf_int_high])

results_df = pd.DataFrame(
    results_list,
    columns=['customer_name', 'bu', 'ms_code', 'intercept', 'eur_coef', 'r_squared', 'p_value', 'conf_int_low', 'conf_int_high'],
)

# Export without the row index.
results_df.to_excel('regression_results.xlsx', index=False)


In [184]:
# Write three pivots into one workbook, one sheet each. The with-block closes
# (and thereby saves) the workbook even if one of the writes raises.
# NOTE(review): pivot_df, pivot_df_ms and pivot_df_ms_avg are not defined in
# any cell of this notebook as shown; this cell relies on earlier kernel state.
with pd.ExcelWriter('data_files/test.xlsx') as writer:
    pivot_df.to_excel(writer, sheet_name='material', index=True)
    pivot_df_ms.to_excel(writer, sheet_name='mscodes', index=True)
    pivot_df_ms_avg.to_excel(writer, sheet_name='avg', index=True)