In [2]:
import os

import numpy as np
import pandas as pd

# Create the output directories used by the notebook. exist_ok=True keeps the
# cell re-runnable: nothing happens (and no error is raised) when a directory
# is already there, unlike a plain shell `mkdir`.
for output_dir in ("../data/preprocessed", "../Algorithms"):
    os.makedirs(output_dir, exist_ok=True)

# Load the raw order data that the rest of the notebook works on
df = pd.read_csv("../data/start-data.csv")

# Data Preprocessing

## Feature Engineering

### Analysis of the Category Names to Extract Item Types

In [None]:
# Module-level accumulators for the category analysis.

# Distinct category names (starts empty; not populated by the code visible in this cell)
unique_categories = set()
# category name -> number of times the category was seen
unique_categories_counts = dict()
# category name -> list of collection durations recorded for that category
category_collection_times = dict()

def process_row(row, unique_categories_counts, category_collection_times):
    """Fold a single row into the per-category count and duration accumulators.

    Arguments:
        row: A mapping-like object (e.g. a pandas.Series) indexable by
            'category_names' (comma-separated category names, or a missing
            value) and 'collection_duration'.
        unique_categories_counts (dict): category name -> occurrence count.
            Incremented once for every entry in 'category_names', so a
            category repeated within one row is counted repeatedly.
        category_collection_times (dict): category name -> list of collection
            durations. The row's duration is appended once per entry.

    Returns:
        None. Both dictionaries are updated in place; rows whose
        'category_names' is missing are skipped.
    """
    raw_names = row['category_names']
    if not pd.notna(raw_names):
        return

    for raw_name in raw_names.split(','):
        # Entries may carry surrounding whitespace, e.g. "Sütlü Tatlı, Krema"
        name = raw_name.strip()
        unique_categories_counts[name] = unique_categories_counts.get(name, 0) + 1
        category_collection_times.setdefault(name, []).append(row['collection_duration'])

In [None]:
# Reset the accumulators first so that re-running this cell does not
# double-count rows that were already folded in by a previous run.
unique_categories_counts.clear()
category_collection_times.clear()

# process_row only reads 'category_names' and 'collection_duration', so walk
# those two columns directly instead of building a Series per row with
# iterrows(), which is very slow on a large frame.
for names, duration in zip(df['category_names'], df['collection_duration']):
    process_row(
        {'category_names': names, 'collection_duration': duration},
        unique_categories_counts,
        category_collection_times,
    )

# Convert the dictionary to a DataFrame
unique_categories_counts_df = pd.DataFrame(
    list(unique_categories_counts.items()),
    columns=['Unique Categories', 'Count'],
)

# Rank categories by count (rank 1 = most frequent), order the table by that
# rank, then drop the raw count so only the rank remains.
unique_categories_counts_df['Rank'] = unique_categories_counts_df['Count'].rank(ascending=False)
unique_categories_counts_df = (
    unique_categories_counts_df
    .sort_values(by='Rank')
    .drop(columns='Count')
)

# Calculate the average collection time for each unique category
average_collection_time = {
    category: sum(times) / len(times)
    for category, times in category_collection_times.items()
}

# Add a new column in unique_categories_counts_df to store the average collection time
unique_categories_counts_df['average_collection_time'] = unique_categories_counts_df['Unique Categories'].map(average_collection_time)

In [None]:
# Display the per-category table (rank and average collection time)
unique_categories_counts_df

### Creating New Features From Category Names

In [3]:
# Category names treated as "cold" items
cold_item_types = [
    "Gazlı İçecek",
    "Ayran & Kefir",
    "Süt",
    "Peynir",
    "Migros Dondurma",
    "Yoğurt",
    "Tek Dondurma",
    "Tereyağ & Margarin",
    "Kap Dondurma",
    "Meyve Suyu",
    "Soğuk Kahve",
    "Enerji İçeceği",
    "Tereyağ",
    "Şalgam",
]

# Category names treated as "frozen" items
frozen_item_types = [
    "Beyaz Et",
    "Kırmızı Et",
    "Et Şarküteri",
    "Pizza",
    "Mantı",
    "Dondurulmuş Gıda",
    "Sakatat",
    "Donuk Fırın",
    "Dondurulmuş",
    "Balık",
    "Buz",
]

# Category names treated as "scalable" items
scalable_item_types = [
    "Sebze",
    "Meyve",
    "Yeşillik",
]

In [5]:
def contains_items(category_names, item_types):
    """Count the entries of a comma-separated category string that match item_types.

    Arguments:
        category_names (str): A comma-separated string of category names.
            Any non-string value (e.g. NaN or None) yields 0.
        item_types (container): Collection of item type names to match
            against; membership is tested with ``in``.

    Returns:
        int: How many entries of ``category_names`` are, after stripping
        surrounding whitespace, members of ``item_types``. Every occurrence
        counts, so a category listed three times contributes 3.
    """
    if not isinstance(category_names, str):
        return 0

    # Each entry is already a single category after the split, so one
    # stripped membership test per entry is all that is needed.
    return sum(
        1
        for category in category_names.split(',')
        if category.strip() in item_types
    )

In [6]:
# One entry per derived column: (column name, item-type list, progress message).
# The columns are created in the order listed here.
item_type_features = (
    ('frozen_item', frozen_item_types, "Frozen item column added to df"),
    ('cold_item', cold_item_types, "Cold item column added to df"),
    ('scalable_item', scalable_item_types, "Scalable item column added to df"),
)

# For each feature, count the matching categories of every order with contains_items
for column_name, item_types, message in item_type_features:
    df[column_name] = df['category_names'].apply(contains_items, args=(item_types,))
    print(message)

Frozen item column added to df
Cold item column added to df
Scalable item column added to df


In [7]:
# Inspect the frame after adding the item-type count columns
df

Unnamed: 0,packet_date,order_id,package_id,collection_duration,item_count,is_vip,category_names,units,cold_item,frozen_item,scalable_item
0,2023-01-30 16:25:56.531971 UTC,231986020,66538886,0.000000,21,False,"Meyve,Sebze,Sebze,Peynir,Zeytin,Bakliyat,Mutfa...","GRAM,GRAM,GRAM,GRAM,GRAM,PIECE,PIECE,PIECE,PIE...",0,0,6
1,2023-01-13 17:39:56.963085 UTC,226345108,64279854,17.366667,17,False,"Sebze,Konserve & Turşu,Meyve,Bakliyat,Meyve,Me...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",0,0,11
2,2023-12-16 13:39:03.726051 UTC,355151082,113977388,39.616667,17,False,"Ayran & Kefir,Kuruyemiş,Yumurta,Bar Çikolata,K...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",2,1,1
3,2023-02-11 13:37:56.262404 UTC,235698212,68008610,0.000000,19,False,"Kedi,Meyve,Kedi,Beyaz Et,Kraker,Bisküvi,Şekerl...","PIECE,GRAM,PIECE,GRAM,PIECE,PIECE,PIECE,PIECE,...",0,1,2
4,2023-01-04 10:36:28.288366 UTC,223149961,63047012,0.000000,12,False,"Sebze,Et Şarküteri,Sütlü Tatlı, Krema,Mutfak E...","GRAM,PIECE,PIECE,PIECE,PIECE,GRAM,GRAM,GRAM,PI...",0,2,4
...,...,...,...,...,...,...,...,...,...,...,...
1438813,2023-05-14 22:03:55.03007 UTC,267250768,80626839,24.733333,17,False,"Sebze,Kuruyemiş,Et Şarküteri,Meyve,Meyve,Sebze...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",0,1,8
1438814,2023-05-02 09:03:42.533162 UTC,262597520,78757438,10.733333,17,False,"Peynir,Meyve Suyu,Ekmek,Peynir,Peynir,Meyve Su...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",2,1,0
1438815,2023-02-13 15:23:39.449954 UTC,236659316,68303121,8.816667,17,False,"Et Şarküteri,Sebze,Çay,Peynir,Bulaşık,Kağıt,Ma...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",1,4,2
1438816,2023-03-26 17:25:06.829762 UTC,250406920,73772482,13.550000,17,False,"Unlu Mamüller,Sebze,Ekmek,Yeşillik,Bulaşık,Yeş...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",1,1,8


### Creating New Features From Packet Dates

In [8]:
# Parse the raw timestamp strings (e.g. "2023-01-30 16:25:56.531971 UTC") into datetimes.
# NOTE(review): the values carry a UTC suffix, so the fields derived below are
# presumably UTC-based rather than local time — confirm this is intended.
df['packet_date'] = pd.to_datetime(df['packet_date'], format='%Y-%m-%d %H:%M:%S.%f %Z')

# Derive the calendar features through the datetime accessor.
# 'day' is the day of the week (Monday = 0), not the day of the month.
packet_dt = df['packet_date'].dt
df['month'] = packet_dt.month
df['day'] = packet_dt.dayofweek
df['hour'] = packet_dt.hour

print("Columns month, day, and hour added to df")

Columns month, day, and hour added to df


In [9]:
# Flag every order whose packet_date is found in the Turkish holiday calendar
# provided by the `holidays` package (1 = holiday, 0 = not a holiday).
import holidays

tr_holidays = holidays.Turkey()
# NOTE(review): packet_date appears to be in UTC at this point, so the lookup
# presumably uses the UTC calendar date — confirm that is acceptable for
# orders placed close to midnight local time.
df['is_holiday'] = df['packet_date'].apply(lambda x: 1 if x in tr_holidays else 0)
print("is_holiday column added to df")

# Drop the raw timestamp column. After this the cell cannot be re-run
# without recreating df, because 'packet_date' no longer exists.
df.drop(columns=['packet_date'], inplace=True)

# Check how many orders in the dataset fall on a holiday
print("Number of holidays in the dataset: ", df['is_holiday'].sum())

is_holiday column added to order_date.csv.
Number of holidays in the dataset:  45145


In [10]:
# Summing is_vip counts the rows where it is True, i.e. the VIP orders.
# The label previously said "holidays", copied from the preceding cell.
print("Number of VIP orders in the dataset: ", df['is_vip'].sum())

Number of holidays in the dataset:  4423


In [11]:
# Inspect the frame after adding month/day/hour/is_holiday and dropping packet_date
df

Unnamed: 0,order_id,package_id,collection_duration,item_count,is_vip,category_names,units,cold_item,frozen_item,scalable_item,month,day,hour,is_holiday
0,231986020,66538886,0.000000,21,False,"Meyve,Sebze,Sebze,Peynir,Zeytin,Bakliyat,Mutfa...","GRAM,GRAM,GRAM,GRAM,GRAM,PIECE,PIECE,PIECE,PIE...",0,0,6,1,0,16,0
1,226345108,64279854,17.366667,17,False,"Sebze,Konserve & Turşu,Meyve,Bakliyat,Meyve,Me...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",0,0,11,1,4,17,0
2,355151082,113977388,39.616667,17,False,"Ayran & Kefir,Kuruyemiş,Yumurta,Bar Çikolata,K...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",2,1,1,12,5,13,0
3,235698212,68008610,0.000000,19,False,"Kedi,Meyve,Kedi,Beyaz Et,Kraker,Bisküvi,Şekerl...","PIECE,GRAM,PIECE,GRAM,PIECE,PIECE,PIECE,PIECE,...",0,1,2,2,5,13,0
4,223149961,63047012,0.000000,12,False,"Sebze,Et Şarküteri,Sütlü Tatlı, Krema,Mutfak E...","GRAM,PIECE,PIECE,PIECE,PIECE,GRAM,GRAM,GRAM,PI...",0,2,4,1,2,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1438813,267250768,80626839,24.733333,17,False,"Sebze,Kuruyemiş,Et Şarküteri,Meyve,Meyve,Sebze...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",0,1,8,5,6,22,0
1438814,262597520,78757438,10.733333,17,False,"Peynir,Meyve Suyu,Ekmek,Peynir,Peynir,Meyve Su...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",2,1,0,5,1,9,0
1438815,236659316,68303121,8.816667,17,False,"Et Şarküteri,Sebze,Çay,Peynir,Bulaşık,Kağıt,Ma...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",1,4,2,2,0,15,0
1438816,250406920,73772482,13.550000,17,False,"Unlu Mamüller,Sebze,Ekmek,Yeşillik,Bulaşık,Yeş...","PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIECE,PIEC...",1,1,8,3,6,17,0
