In [41]:
import os
from collections import defaultdict
from difflib import SequenceMatcher

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the refined dataset and convert the 'date' column to datetimes in one chain.
df = (
    pd.read_csv('data/dataset_full_refined.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [42]:
# Drop rows whose count is zero. The explicit .copy() makes the filtered result
# an independent frame, so the column assignments performed on `df` in later
# cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['count'] != 0].copy()

In [43]:
def clean_text(text):
    """Normalize a value for comparison.

    Missing values (as judged by ``pd.isna``) become the empty string; anything
    else is converted with ``str``, lower-cased, and stripped of surrounding
    whitespace.
    """
    return "" if pd.isna(text) else str(text).lower().strip()

def similar(a, b, threshold=0.85):
    """Return True when the cleaned forms of ``a`` and ``b`` have a
    SequenceMatcher ratio strictly greater than ``threshold``."""
    left = clean_text(a)
    right = clean_text(b)
    score = SequenceMatcher(None, left, right).ratio()
    return score > threshold

def find_similar_meals(df, column, threshold=0.85):
    """Group near-duplicate names found in ``df[column]``.

    Every unordered pair of distinct non-null values is compared once with
    ``similar``. When a pair matches, the later value (in order of first
    appearance) is appended to the list keyed by the earlier one.

    Returns:
        defaultdict(list) mapping a name to the later names judged similar to it.
        Names with no match do not appear as keys.
    """
    names = df[column].dropna().unique()
    groups = defaultdict(list)

    # Compare each name only against the names that come after it.
    for position, first in enumerate(names):
        for second in names[position + 1:]:
            if similar(first, second, threshold):
                groups[first].append(second)

    return groups

columns_to_check = ['main_dish', 'vegetarian', 'soup']
similarity_results = {}

# Print, per dish column, every group of similar names with its similarity score,
# and keep the groups in `similarity_results` for the counting step.
for column in columns_to_check:
    print(f"\nChecking {column}:")
    print("=" * 50)
    similar_meals = find_similar_meals(df, column)
    similarity_results[column] = similar_meals

    if not similar_meals:
        print("No similar meals found with current threshold")
        continue

    for meal, candidates in similar_meals.items():
        print(f"\nOriginal: {meal}")
        print("Similar to:")
        for candidate in candidates:
            score = SequenceMatcher(None, clean_text(meal), clean_text(candidate)).ratio()
            print(f"- {candidate} (similarity: {score:.2f})")

def count_variants(df, column, similar_meals):
    """Count the rows of ``df[column]`` that exactly equal each name in ``similar_meals``.

    Both the keys of ``similar_meals`` and every name in their value lists are
    counted. Returns a dict mapping name -> number of exactly-equal rows.
    """
    occurrences = {}
    for original, variants in similar_meals.items():
        for name in (original, *variants):
            occurrences[name] = df[column].eq(name).sum()
    return occurrences

print("\nOccurrence counts for each variant:")
print("=" * 50)
# For every column that produced similar-name groups, list each name with its
# row count, most frequent first.
for column in columns_to_check:
    column_matches = similarity_results[column]
    if not column_matches:
        continue
    print(f"\n{column.upper()}:")
    counts = count_variants(df, column, column_matches)
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    for meal, count in ranked:
        print(f"{meal}: {count} occurrences")


Checking main_dish:

Original: meksika soslu tavuk
Similar to:
- meksika usulu tavuk (similarity: 0.89)

Original: icli kofte
Similar to:
- islim kofte (similarity: 0.86)

Original: pideli kofte
Similar to:
- pilic kofte (similarity: 0.87)

Original: etli patates oturtma
Similar to:
- patates oturtma (similarity: 0.86)

Original: mantar soslu tavuk sinitzel
Similar to:
- mantar soslu tavuk sote (similarity: 0.88)

Original: mantar soslu tavuk biftek
Similar to:
- mantar soslu tavuk sote (similarity: 0.88)

Original: kori soslu pilic
Similar to:
- kori soslu pilic sote (similarity: 0.86)

Original: mantar soslu tavuk sote
Similar to:
- mantarli tavuk sote (similarity: 0.86)
- mantar soslu tavuk (similarity: 0.88)

Checking vegetarian:

Original: zy.kereviz
Similar to:
- zy. kereviz (similarity: 0.95)

Original: zy.orbit fasulye
Similar to:
- zy. orbit fasulye (similarity: 0.97)

Original: zy.taze fasulye
Similar to:
- zy. taze fasulye (similarity: 0.97)
- taze fasulye (similarity: 0.89)

In [44]:
def get_week_of_month(date):
    """Return the 1-based calendar-row index of ``date`` within its month.

    Weeks start on Monday: the day of month is shifted by the weekday of the
    month's first day (Monday = 0) before dividing into blocks of seven, so the
    result ranges from 1 to 6.
    """
    first_weekday = date.replace(day=1).weekday()
    return (date.day + first_weekday - 1) // 7 + 1

# Ensure 'date' is datetime-typed before the .dt accessor is used.
df['date'] = pd.to_datetime(df['date'])

# Calendar features derived from the date.
df['day_of_week'] = df['date'].dt.dayofweek               # 0 = Monday ... 6 = Sunday
df['week_of_month'] = df['date'].map(get_week_of_month)   # Monday-based row in the month, 1-6
df['month'] = df['date'].dt.month                         # 1-12

def create_dish_mappings():
    """Return the variant -> canonical dish-name lookups.

    The result is keyed by DataFrame column name ('main_dish', 'vegetarian',
    'soup'); each value maps a spelling variant to the name it is replaced with.
    """
    main_dish = {
        'meksika usulu tavuk': 'meksika soslu tavuk',
        'mantar soslu tavuk sote': 'mantar soslu tavuk',
        'mantarli tavuk sote': 'mantar soslu tavuk',
        'mantar soslu tavuk biftek': 'mantar soslu tavuk',
        'mantar soslu tavuk sinitzel': 'mantar soslu tavuk',
        'etli patates oturtma': 'patates oturtma',
        'kori soslu pilic': 'kori soslu pilic sote',
    }
    vegetarian = {
        'zy. kereviz': 'zy.kereviz',
        'zy. orbit fasulye': 'zy.orbit fasulye',
        'taze fasulye': 'zy.taze fasulye',
        'zy. taze fasulye': 'zy.taze fasulye',
        'karnibahar yemegi': 'karnabahar yemegi',
        'imam bayildi': 'imambayildi',
        'zy. bamya': 'zy.bamya',
        'zy.patlican dolma': 'zy. patlican dolma',
    }
    soup = {
        'sifa corba': 'safak corba',
        'krotonlu mercimek corba': 'krutonlu mercimek corba',
    }
    return {'main_dish': main_dish, 'vegetarian': vegetarian, 'soup': soup}

def standardize_dishes(df, mappings):
    """Return a copy of ``df`` with dish-name variants replaced.

    ``mappings`` is keyed by column name; each value is a variant -> canonical
    dict handed to ``Series.replace`` for that column. The input frame is left
    untouched.
    """
    standardized = df.copy()
    for category in mappings:
        standardized[category] = standardized[category].replace(mappings[category])
    return standardized

# Build the variant -> canonical lookups, then rewrite the dish columns with them.
mappings = create_dish_mappings()
df = standardize_dishes(df=df, mappings=mappings)

In [45]:
encoded_data = df.copy()

# Scale each row's count by the largest count observed for the same campus.
campus_peak = encoded_data.groupby('campus')['count'].transform('max')
encoded_data['normalized_count'] = encoded_data['count'] / campus_peak

def categorize_by_std(group):
    """Label each value by its distance from the group mean, in standard deviations.

    Bins (a value lying exactly on a boundary goes to the bin nearer the mean):
        Very Low   value <  mean - 1 std
        Low        mean - 1 std   <= value <  mean - 0.5 std
        Medium     mean - 0.5 std <= value <= mean + 0.5 std
        High       mean + 0.5 std <  value <= mean + 1 std
        Very High  value >  mean + 1 std

    Args:
        group: pandas Series of numbers (e.g. one group passed in by
            ``groupby(...).transform``).

    Returns:
        NumPy array of label strings, one per element of ``group``, in order.
        Elements that match no bin (NaN values, or every element when the
        standard deviation is NaN) are labelled 'Medium'.
    """
    mean = group.mean()
    std = group.std()

    very_low_edge = mean - std
    low_edge = mean - 0.5 * std
    high_edge = mean + 0.5 * std
    very_high_edge = mean + std

    # NOTE: per the original author, 'Very Low'/'Low' and 'Very High'/'High' are
    # merged later in the analysis; all five labels are still produced here
    # because later code depends on them.
    #
    # np.select takes the first condition that is true for each element, so every
    # condition only needs the outer edge of its bin. Closing the bins this way
    # means boundary values are classified instead of silently falling through to
    # the default.
    conditions = [
        group < very_low_edge,     # Very Low
        group < low_edge,          # Low
        group <= high_edge,        # Medium
        group <= very_high_edge,   # High
        group > very_high_edge,    # Very High
    ]
    choices = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
    return np.select(conditions, choices, default='Medium')

# Bucket the normalized counts into density labels, computed separately per campus.
normalized_by_campus = encoded_data.groupby('campus')['normalized_count']
encoded_data['density'] = normalized_by_campus.transform(categorize_by_std)

# Replace each categorical column with integer codes, keeping the fitted encoder
# for every column so the codes can be mapped back to labels.
categorical_columns = ['campus', 'meal', 'soup', 'main_dish', 'vegetarian', 'side_dishes', 'dessert']
encoders = {column: LabelEncoder() for column in categorical_columns}

for column, label_encoder in encoders.items():
    encoded_data[column] = label_encoder.fit_transform(df[column])

In [46]:
# Display the encoded frame as this cell's output.
encoded_data

Unnamed: 0,date,day,campus,weather,count,meal,soup,main_dish,vegetarian,side_dishes,dessert,day_of_week,week_of_month,month,normalized_count,density
0,2024-01-01,monday,5,14.580000,420,1,44,0,5,68,239,0,1,1,0.180801,Low
1,2024-01-02,tuesday,5,12.630000,1626,1,0,101,62,24,226,1,1,1,0.699957,Very High
2,2024-01-03,wednesday,5,14.180000,1686,1,18,70,72,91,228,2,1,1,0.725786,Very High
3,2024-01-04,thursday,5,15.450000,1033,1,43,58,4,138,227,3,1,1,0.444684,Medium
4,2024-01-05,friday,5,14.110000,1360,1,32,4,1,116,44,4,1,1,0.585450,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2293,2024-10-23,wednesday,2,17.138000,361,1,41,47,63,41,113,2,4,10,0.778017,Very High
2294,2024-10-24,thursday,2,16.713001,194,1,43,30,54,23,320,3,4,10,0.418103,Medium
2295,2024-10-25,friday,2,16.138000,159,1,26,16,31,106,111,4,4,10,0.342672,Medium
2296,2024-10-25,friday,4,14.290001,54,0,12,12,44,130,176,4,4,10,0.300000,Medium


In [47]:
# Write one "<code>: <label>" line per class for every fitted encoder, so the
# integer codes in the encoded dataset can be mapped back to their labels.
# The output directory is created first so the cell also works on a fresh
# checkout, and the encoding is pinned so the files do not depend on the
# platform's default text encoding.
os.makedirs('encoders', exist_ok=True)

for encoder in encoders:
    with open(f'encoders/{encoder}_encoder.txt', 'w', encoding='utf-8') as f:
        for i, label in enumerate(encoders[encoder].classes_):
            f.write(f'{i}: {label}\n')

In [48]:
# Save the encoded frame to CSV; index=False leaves the row index out of the file.
encoded_data.to_csv('data/encoded_data.csv', index=False)

In [None]:
# Reload the encoded dataset from disk; this rebinds `df` to the encoded frame.
# No parse_dates is passed, so 'date' keeps whatever type read_csv infers —
# presumably 'YYYY-MM-DD' strings, which is what the string date lists below
# are compared against (verify).
df = pd.read_csv("data/encoded_data.csv")

def _date_span(start, end):
    """Return every calendar day from ``start`` to ``end`` (both inclusive) as 'YYYY-MM-DD' strings."""
    return pd.date_range(start, end).strftime("%Y-%m-%d").tolist()


# Dates belonging to each special category, as 'YYYY-MM-DD' strings.
# Contiguous periods are written as inclusive ranges.
vacation_dates = (
    _date_span("2024-01-13", "2024-02-11")
    + _date_span("2024-09-01", "2024-09-22")
)

public_holiday_dates = [
    "2024-01-01", "2024-04-23", "2024-05-01", "2024-05-19",
    "2024-07-15", "2024-08-30", "2024-10-29", "2024-05-27", "2024-07-24"
]

eid_dates = (
    _date_span("2024-04-09", "2024-04-12")
    + _date_span("2024-06-15", "2024-06-24")
)

final_dates = (
    _date_span("2024-01-02", "2024-01-13")
    + _date_span("2024-05-17", "2024-06-08")
)
def get_status(date, day_of_week):
    """Return the status code for one row.

    Codes, checked in this order (the first match wins):
        0  date is in vacation_dates
        1  date is in eid_dates
        2  date is in public_holiday_dates
        3  date is in final_dates
        4  day_of_week is 'saturday' or 'sunday'
        5  anything else (regular class day)

    ``date`` is tested with ``in`` against the module-level date lists, so it
    must be in the same form as their entries.
    """
    dated_categories = (
        (vacation_dates, 0),
        (eid_dates, 1),
        (public_holiday_dates, 2),
        (final_dates, 3),
    )
    for category_dates, code in dated_categories:
        if date in category_dates:
            return code

    return 4 if day_of_week in ['saturday', 'sunday'] else 5

# Compute the status code row by row from the 'date' and 'day' columns.
df['status'] = df.apply(
    lambda record: get_status(record['date'], record['day']),
    axis=1,
)

# Persist the result and print the first rows as a quick check.
df.to_csv('data/dataset_with_status.csv', index=False)
print(df.head())