### Import Dependencies

In [None]:
# Our Glorious Imports for data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from collections import Counter

# Our Glorious Imports for Models 

import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Preprocess Robi Datathon 3.0 Dataset

In [None]:
# Raw purchase history; used below to build the training frame.
purchase_df = pd.read_csv("./dataset/purchase.csv")  # Train SET
# Customers to predict for; only its MAGIC_KEY column is read in this notebook.
problem_df = pd.read_csv("./dataset/problem 2.csv")  # Test SET

# Distinct MAGIC_KEY values in each file (np.unique also returns them sorted)
train_keys = np.unique(purchase_df["MAGIC_KEY"].to_list())
test_keys = np.unique(problem_df["MAGIC_KEY"].to_list())

# Debugging: report how many distinct keys each file holds
print(f'Unique keys in train set: {len(train_keys)}')
print(f'Unique keys in test set: {len(test_keys)}')

In [None]:
# Keep only the purchase rows whose MAGIC_KEY appears in our test set.
# NOTE(review): this subset is taken before the BOX_COUNT > 0 filter in the
# next cell, so temp_df still contains those rows — confirm this is intended.
temp_df = purchase_df[purchase_df['MAGIC_KEY'].isin(test_keys)]

In [None]:
# Data cleaning: keep only purchase rows with a strictly positive BOX_COUNT
purchase_df = purchase_df[purchase_df['BOX_COUNT'] > 0]

In [None]:
# Parse  datetime column and set it as index
def parse_datetime(df):
    """Return a copy of *df* indexed by its parsed ``PURCHASE_DATE`` column.

    The ``PURCHASE_DATE`` strings are parsed with the day-first format
    ``%d/%m/%Y`` and the resulting datetimes become the index.

    The work is done on a copy so the caller's frame is left untouched.
    The frames passed in by this notebook are boolean-mask slices of another
    frame; assigning a column on such a slice in place raises pandas'
    SettingWithCopyWarning and may or may not write through.

    Args:
        df: DataFrame with a ``PURCHASE_DATE`` column of ``dd/mm/YYYY`` strings.

    Returns:
        A new DataFrame with ``PURCHASE_DATE`` as a DatetimeIndex.
    """
    parsed = df.copy()
    parsed['PURCHASE_DATE'] = pd.to_datetime(parsed['PURCHASE_DATE'], format="%d/%m/%Y")
    return parsed.set_index('PURCHASE_DATE')

# Re-index both the test-key subset and the cleaned full history by purchase date
temp_df = parse_datetime(temp_df)
purchase_df = parse_datetime(purchase_df)

In [None]:
# Create Groups of timescale dataset
def slice_dataframe_by_dates(df):
    """Split *df* into ten consecutive half-month windows.

    The windows run from 2018-10-01 through 2019-02-28; each month is cut
    into days 1-15 and day 16 to month end. Rows are selected with
    ``DataFrame.query`` on ``PURCHASE_DATE``, with both bounds inclusive.

    Returns:
        A list of ten DataFrames in chronological order.
    """
    half_month_windows = (
        ('2018-10-01', '2018-10-15'),
        ('2018-10-16', '2018-10-31'),
        ('2018-11-01', '2018-11-15'),
        ('2018-11-16', '2018-11-30'),
        ('2018-12-01', '2018-12-15'),
        ('2018-12-16', '2018-12-31'),
        ('2019-01-01', '2019-01-15'),
        ('2019-01-16', '2019-01-31'),
        ('2019-02-01', '2019-02-15'),
        ('2019-02-16', '2019-02-28'),
    )
    return [
        df.query(f"PURCHASE_DATE >= '{first_day}' and PURCHASE_DATE <= '{last_day}'")
        for first_day, last_day in half_month_windows
    ]

# Half-month slices of the test-key subset and of the full cleaned history
test_purchase_groups = slice_dataframe_by_dates(temp_df)
all_purchase_groups = slice_dataframe_by_dates(purchase_df)


In [None]:
# Create a dataset from groups based on unique keys
def create_dataset_from_groups(unique_keys_list, days_groups):
    """Build one wide row per MAGIC_KEY describing its purchases per period.

    For the i-th frame in *days_groups* three columns are added:

    * ``col_{i}``: 1 if the key has at least one row in that frame, else 0
      (stored as uint8);
    * ``col_{i}_box_counts``: list of the key's ``BOX_COUNT`` values in row
      order, or ``[0]`` when the key is absent;
    * ``col_{i}_box_ids``: list of the key's ``BOX_ID`` values in row order,
      or ``[0]`` when the key is absent.

    Args:
        unique_keys_list: sequence of MAGIC_KEY values, one output row each.
        days_groups: iterable of DataFrames with ``MAGIC_KEY``, ``BOX_COUNT``
            and ``BOX_ID`` columns.

    Returns:
        DataFrame whose first column is ``MAGIC_KEY`` followed by the three
        columns per period described above.
    """
    new_dataset = pd.DataFrame()
    new_dataset['MAGIC_KEY'] = unique_keys_list

    for days_groupname, days_group_data in enumerate(days_groups):
        # Group the period's rows by key in a single pass. Filtering the whole
        # frame once per key would cost O(keys * rows) for every period.
        counts_by_key = {}
        ids_by_key = {}
        for key, box_count, box_id in zip(
            days_group_data['MAGIC_KEY'],
            days_group_data['BOX_COUNT'],
            days_group_data['BOX_ID'],
        ):
            counts_by_key.setdefault(key, []).append(box_count)
            ids_by_key.setdefault(key, []).append(box_id)

        # Presence flag for the period, stored compactly as uint8
        flag_column = f'col_{days_groupname}'
        new_dataset[flag_column] = new_dataset['MAGIC_KEY'].isin(days_group_data['MAGIC_KEY']).astype(np.uint8)

        # Absent keys get the [0] sentinel; a fresh list is created per row
        new_dataset[f'col_{days_groupname}_box_counts'] = [
            counts_by_key.get(key, [0]) for key in new_dataset['MAGIC_KEY']
        ]
        new_dataset[f'col_{days_groupname}_box_ids'] = [
            ids_by_key.get(key, [0]) for key in new_dataset['MAGIC_KEY']
        ]

    return new_dataset

# test_purchase_groups, all_purchase_groups, temp_df and the key arrays are produced by the cells above.
# NOTE(review): `days_groups` and `dataset` are not referenced again in the
# visible code — presumably leftovers from interactive debugging; confirm
# before removing.
days_groups = test_purchase_groups
dataset = temp_df

# Create test and train datasets: one wide row per MAGIC_KEY, built from the
# test-key slices and from the full-history slices respectively
test_df = create_dataset_from_groups(test_keys, test_purchase_groups)
train_df = create_dataset_from_groups(train_keys, all_purchase_groups)

In [None]:
def most_common_in_both(list1, list2):
    """Return the element of *list2* that occurs most often in *list1*.

    Ties go to the element that appears first in *list1*. Returns ``None``
    when the two lists share no element.
    """
    tally = Counter(list1)

    winner = None
    winner_count = 0
    # Counter iterates in first-seen order, so a strict '>' keeps the
    # earliest element when counts are tied.
    for candidate, occurrences in tally.items():
        if candidate in list2 and occurrences > winner_count:
            winner = candidate
            winner_count = occurrences

    return winner


def process(row):
    """Collapse each per-period list cell of *row* to a single scalar.

    Position 0 is skipped (the callers in this file keep MAGIC_KEY there).
    Every other cell is expected to hold a list:

    * a list containing 0 becomes ``0``;
    * a one-element list becomes that element;
    * a longer list becomes the element of it that occurs most often across
      all of the row's non-zero cells (see ``most_common_in_both``).

    The row is copied first: pandas documents that a function passed to
    ``DataFrame.apply`` must not mutate the object it receives.

    Args:
        row: Series whose cells from position 1 onwards are lists.

    Returns:
        A new Series with the same index and scalar cells.
    """
    row = row.copy()
    choices = []

    # First pass: resolve the trivial cells and collect every purchased id
    for i in range(1, row.shape[0]):
        cell = row.iloc[i]
        if 0 in cell:
            row.iloc[i] = 0
        elif len(cell) == 1:
            row.iloc[i] = cell[0]
            choices.append(cell[0])
        else:
            # Left as a list for now; it is resolved once all ids are known
            choices += cell

    # Second pass: the cells that are still lists are the multi-id periods
    for i in range(1, row.shape[0]):
        cell = row.iloc[i]
        if isinstance(cell, list):
            row.iloc[i] = most_common_in_both(choices, cell)

    return row


# Renaming for our convenience 
def process_dataset(df):
    """Reduce the wide per-period frame to one scalar box id per period.

    Drops the ten presence-flag columns (``col_0`` .. ``col_9``) and the ten
    ``col_N_box_counts`` columns, collapses every remaining list cell with
    ``process`` row by row, and renames ``col_N_box_ids`` to the ordinal
    labels ``1st`` .. ``10th``.
    """
    period_indices = range(10)
    flag_columns = [f'col_{n}' for n in period_indices]
    count_columns = [f'col_{n}_box_counts' for n in period_indices]

    slimmed = df.drop(columns=flag_columns + count_columns)
    collapsed = slimmed.apply(process, axis=1)

    ordinal_labels = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']
    col_name_maps = {
        f'col_{n}_box_ids': label for n, label in enumerate(ordinal_labels)
    }
    return collapsed.rename(columns=col_name_maps)


# Collapse both wide frames to the MAGIC_KEY + '1st'..'10th' layout
test_df = process_dataset(test_df)
train_df = process_dataset(train_df)


In [None]:
# Save the processed datasets (without the row index) so the training section
# below can reload them from disk
test_df.to_csv('data/problem-2-test.csv', index=False)
train_df.to_csv('data/problem-2-train.csv', index=False)

# Load Preprocessed dataset and Train Models

In [None]:
# Import the preprocessed datasets written by the preprocessing section
train_dataset = pd.read_csv('data/problem-2-train.csv')
test_dataset = pd.read_csv('data/problem-2-test.csv')

# problem_df holds the test customers' keys, row-aligned with test_dataset;
# it is later joined to the predictions when a submission is written.
problem_df = test_dataset['MAGIC_KEY']

train_dataset.shape, test_dataset.shape

In [None]:
# Drop the 'MAGIC_KEY' identifier so only the '1st'..'10th' columns remain
train_dataset.drop('MAGIC_KEY', axis=1, inplace=True)
test_dataset.drop('MAGIC_KEY', axis=1, inplace=True)

# Preview the first rows
train_dataset.head(5)

In [None]:
# Filter last column's zero values as customers must buy something:
# '10th' is the training label (see create_dataset), so rows with a 0 label are dropped
train_dataset = train_dataset[train_dataset['10th'] != 0.0]
train_dataset.shape

In [None]:
# Drop rows with no purchase in any feature window: a row survives when at
# least one of '1st'..'9th' is non-zero
train_dataset = train_dataset[
    (train_dataset[['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th']] != 0.0).any(axis=1)
]
train_dataset

#### One-Hot-Encoding

In [None]:
# Create Global One Hot Encoder.
# This single instance is shared by one_hot_encoder and one_hot_decoder below;
# one_hot_encoder refits it on every call.
encoder = OneHotEncoder()

In [None]:
# One Hot Encoding
def one_hot_encoder(df, column):
    """One-hot encode a single column of *df* with the module-level ``encoder``.

    The shared encoder is refit on ``df[[column]]`` on every call, so the
    categories (and therefore the output columns) depend only on the values
    present in this frame.

    Returns:
        A dense DataFrame with a fresh default index and one column per
        category, named by ``encoder.get_feature_names_out([column])``.
    """
    sparse_matrix = encoder.fit_transform(df[[column]])
    dense_values = sparse_matrix.toarray()
    feature_names = encoder.get_feature_names_out([column])
    return pd.DataFrame(dense_values, columns=feature_names)

In [None]:
# One Hot Decoding
def one_hot_decoder(df, column):
    """Invert a one-hot matrix back to a single-column DataFrame named *column*.

    Uses the module-level ``encoder`` in whatever state it was last fitted;
    because ``one_hot_encoder`` refits it on each call, *df* must match the
    most recently encoded column.
    """
    return pd.DataFrame(encoder.inverse_transform(df), columns=[column])

In [None]:
# Create train dataset
def create_dataset(df, is_test = False):
    """Build the feature matrix (and labels, for training data) from *df*.

    Each of the columns '1st'..'9th' is one-hot encoded independently and
    the encoded blocks are placed side by side in that order.

    Args:
        df: frame containing '1st'..'9th' (and '10th' unless *is_test*).
        is_test: when True no labels are produced.

    Returns:
        ``(train_x, train_y)``: *train_y* is ``df['10th']`` cast to int, or
        an empty DataFrame when *is_test* is True.
    """
    feature_columns = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th']

    train_x = pd.DataFrame()
    for feature in feature_columns:
        train_x = pd.concat([train_x, one_hot_encoder(df, feature)], axis=1)

    # Labels stay as plain integer box ids (not one-hot encoded)
    train_y = pd.DataFrame() if is_test else df['10th'].astype(int)

    return (train_x, train_y)

#### Splitting and Formatting DataFrame

In [None]:
# Dataset splitting function
def get_x_y_dataset(df, is_final_train=False, is_test=False, test_size=None):
    """Turn *df* into model inputs, optionally split into train and test parts.

    Args:
        df: frame accepted by ``create_dataset``.
        is_final_train: when True return the full ``(X, Y)`` without splitting.
        is_test: forwarded to ``create_dataset``.
        test_size: forwarded to ``train_test_split``.

    Returns:
        ``(X, Y)`` when *is_final_train* is True, otherwise
        ``(x_train, x_test, y_train, y_test)``.
    """
    features, labels = create_dataset(df, is_test)

    if not is_final_train:
        # Hold out part of the data for evaluation
        fit_x, eval_x, fit_y, eval_y = train_test_split(features, labels, test_size=test_size)
        return (fit_x, eval_x, fit_y, eval_y)

    # Final training uses every row
    return (features, labels)


In [None]:
# Temporarily split the train set for understanding model performance.
# Work on at most 2000 random rows; the cap keeps DataFrame.sample from raising
# when fewer than 2000 rows are left after the filters above.
x_train, x_test, y_train, y_test = get_x_y_dataset(
    train_dataset.sample(min(2000, len(train_dataset))), False, False, 0.2
)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

#### Model Evaluation

In [None]:
# Import models


#### Without One Hot Encoding

In [None]:
# List of models with parameters for checking the best fit model.
# Each entry maps a display name to an estimator ('model') and the parameter
# grid ('params') that GridSearchCV searches for it; an empty grid means the
# estimator is evaluated with its defaults only.

model_params = {
    'svm':{
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 3, 5, 10, 20, 30, 50, 100, 500],
            'kernel': ['rbf']
        }
    },
    'random_forest':{
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1, 5, 10, 18, 20, 30],
            'n_jobs': [-1],
        }
    },
    'logistic_regression':{
        # multi_class is left at its default: 'auto' was already the default,
        # and scikit-learn 1.5+ deprecates passing the parameter at all.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1,5,10],
            'max_iter': [1000],
        }
    },
    'decision_tree':{
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy', 'log_loss'],
        }
    },
    'multinomial_nb':{
        'model': MultinomialNB(),
        'params': {

        }
    },
    'gaussian_nb':{
        'model': GaussianNB(),
        'params': {

        }
    },
    'gradient-boosting':{
        'model': GradientBoostingClassifier(),
        'params': {
            'loss': ['log_loss'],
            'learning_rate': [0.1, 0.01, 0.001]
        }
    },
    'hist-gradient-boosting':{
        'model': HistGradientBoostingClassifier(),
        'params': {
            'learning_rate': [0.1, 0.01, 0.001],
            'max_iter': [1000, 500]
        }
    },
}

In [None]:
# Store model scores: run a 5-fold grid search for every entry in model_params
# and record the best cross-validated score and parameters.
# NOTE(review): the heading says "without one hot", but x_train here comes from
# get_x_y_dataset, which one-hot encodes the features — confirm the intent.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False, verbose = 3)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [None]:
# Get the best results of the models: one row per model with its best
# cross-validated score and the parameters that achieved it
df_res = pd.DataFrame(scores,columns=['model', 'best_score', 'best_params'])
df_res

#### With One Hot Encoding

##### Parameter Searching

In [None]:
# List of models with parameters for checking the best fit model.
# Same layout as before: display name -> estimator ('model') and the
# GridSearchCV parameter grid ('params').

model_params = {
    'logistic_regression':{
        # multi_class is left at its default: 'auto' was already the default,
        # and scikit-learn 1.5+ deprecates passing the parameter at all.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1,5,10],
            'max_iter': [1000],
        }
    },
    'multinomial_nb':{
        'model': MultinomialNB(),
        'params': {

        }
    },
    'gradient-boosting':{
        'model': GradientBoostingClassifier(),
        'params': {
            # NOTE(review): scikit-learn documents 'exponential' loss as
            # binary-classification only; if the '10th' label has more than
            # two classes these candidates will fail to fit — confirm.
            'loss': ['log_loss', 'exponential'],
            'learning_rate': [0.1, 0.01, 0.001]
        }
    },
    'hist-gradient-boosting':{
        'model': HistGradientBoostingClassifier(),
        'params': {
            'learning_rate': [0.1, 0.01, 0.001],
            'max_iter': [1000, 500]
        }
    },
}

In [None]:
# Store model scores with one hot: 5-fold grid search over the reduced model
# list above, recording each model's best score and parameters.
# The results are collected in `scores` but not displayed in this cell.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

#### Predict and Save

In [None]:
# Manually check model performance
def fit_and_evaluate(model, x_train, y_train, x_test, y_test):
    """Fit *model* on the training split and print its score on the test split.

    The score is whatever ``model.score`` returns; nothing is returned.
    """
    model.fit(x_train, y_train)
    print(model.score(x_test, y_test))

In [None]:
# Rolling algorithm for best match
def rolling_algorithm(df):
    """Shift the ten period columns one step to the left.

    In the result '1st' holds the original '2nd', ..., '9th' holds the
    original '10th', and the '10th' column is removed. Any other columns and
    the column order are kept.

    The input frame is not modified; all shifted values are read from *df*
    and written to a new frame.

    Args:
        df: DataFrame with columns '1st' .. '10th'.

    Returns:
        A new DataFrame without '10th' whose period columns are shifted.
    """
    ordinals = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

    # drop() returns a new frame, so the assignments below never touch df
    rolled = df.drop('10th', axis=1)
    for target, source in zip(ordinals[:-1], ordinals[1:]):
        rolled[target] = df[source]
    return rolled


In [None]:
# Store the predicted result in a CSV
def save_results(magic_keys, predicts, file_name):
    """Write the predictions next to their keys as ``./solution/{file_name}.csv``.

    Args:
        magic_keys: the keys to place beside the predictions (joined
            column-wise on the index).
        predicts: predicted box ids, stored in a ``BOX_ID`` column as int.
        file_name: output file name without the ``.csv`` extension.
    """
    predictions = pd.DataFrame()
    predictions['BOX_ID'] = predicts

    submission = pd.concat([magic_keys, predictions], axis=1)
    submission['BOX_ID'] = submission['BOX_ID'].astype(int)

    output_path = f"./solution/{file_name}.csv"
    submission.to_csv(output_path, index=False)

In [None]:
# Fit the model, predict on the test set and store 
def fit_predict_and_save(model, x_train, y_train, x_test, magic_keys, file_name):
    """Fit *model*, predict on *x_test*, and persist both model and predictions.

    The fitted model is pickled to ``./model/{file_name}.pkl`` and the
    predictions are written through ``save_results`` under the same
    *file_name*.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        x_train, y_train: training features and labels.
        x_test: features to predict on.
        magic_keys: keys passed through to ``save_results``.
        file_name: base name (no extension) for the pickle and the CSV.
    """
    model.fit(x_train, y_train)
    predicts = model.predict(x_test)

    # Use a context manager so the pickle file is flushed and closed even if
    # dumping fails part-way.
    with open(f"./model/{file_name}.pkl", 'wb') as model_file:
        pickle.dump(model, model_file)

    save_results(magic_keys, predicts, file_name)
    print(f'File: {file_name} Stored Successfully!')

In [None]:
# Actual train-test preparation for final training and test data

# Get features (X) and labels (Y) for final training (no hold-out split)
X, Y = get_x_y_dataset(train_dataset, True)

# Apply the rolling_algorithm so the test frame's '2nd'..'10th' windows line
# up with the '1st'..'9th' feature slots the model is trained on
test_dataset = rolling_algorithm(test_dataset)

# One-hot encode the shifted test windows (no labels for the test set)
test_dataset, _ = create_dataset(test_dataset, True)

# Train and test are encoded independently, so their one-hot columns can
# differ. Keep only the columns present in both, in X's column order.
common_columns = X.columns.intersection(test_dataset.columns)

# Restrict BOTH matrices to the shared columns: the model must be fitted and
# queried on exactly the same features in the same order.
X = X[common_columns]
test_dataset = test_dataset[common_columns]


X.shape, Y.shape, test_dataset.shape

In [None]:
# Work on final model: random forest with 18 trees, using all CPU cores.
# problem_df is the MAGIC_KEY column of the preprocessed test set; it is
# written next to the predictions in the submission file.
model = RandomForestClassifier(n_estimators=18, n_jobs=-1)
fit_predict_and_save(model, X, Y, test_dataset, problem_df, "submission_random_forest")

In [None]:
# Work on final model: RBF-kernel SVC with C=2 (verbose training output),
# saved under its own submission name
model = SVC(C=2, kernel='rbf', verbose = True)
fit_predict_and_save(model, X, Y, test_dataset, problem_df, "submission_svc")