#### Import Dependencies

In [None]:
# Our Glorious Imports for data manipulation
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Our Glorious Imports for Models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Preprocess Robi Datathon 3.0 Dataset

In [None]:
# Load the two input CSVs used by the preprocessing cells below
purchase_df = pd.read_csv("./dataset/purchase.csv") # Train SET
problem_df = pd.read_csv("./dataset/problem 1.csv") # Test SET

# Distinct MAGIC_KEY values of each file (np.unique returns them sorted)
train_keys = np.unique(purchase_df["MAGIC_KEY"].to_list())
test_keys = np.unique(problem_df["MAGIC_KEY"].to_list())

# Debugging: report how many distinct keys each file contains
print(f'Unique keys in train set: {len(train_keys)}')
print(f'Unique keys in test set: {len(test_keys)}')

In [None]:
# Keep only the purchase rows whose MAGIC_KEY appears in our test set
# NOTE(review): this filter runs before the BOX_COUNT > 0 cleaning in the next
# cell, so temp_df may still contain rows with BOX_COUNT <= 0 - confirm intended.
temp_df = purchase_df[purchase_df['MAGIC_KEY'].isin(test_keys)]

In [None]:
# Data cleaning: keep only rows with a strictly positive BOX_COUNT
# (rows where BOX_COUNT is missing fail the comparison and are dropped too)
purchase_df = purchase_df[purchase_df['BOX_COUNT'] > 0]

In [None]:
# Parse  datetime column and set it as index
def parse_datetime(df):
    """Return a new frame indexed by the parsed ``PURCHASE_DATE`` column.

    The input frame is left untouched. The callers pass boolean-filtered
    slices of the raw data, and writing into those in place both mutates the
    caller's object and triggers pandas' SettingWithCopyWarning.

    Args:
        df: DataFrame with a ``PURCHASE_DATE`` column of ``dd/mm/YYYY`` strings.

    Returns:
        A new DataFrame whose index is the parsed ``PURCHASE_DATE`` and whose
        columns are the remaining columns of ``df``.

    Raises:
        KeyError: if ``df`` has no ``PURCHASE_DATE`` column.
        ValueError: if a value does not match the ``%d/%m/%Y`` format.
    """
    parsed_dates = pd.to_datetime(df['PURCHASE_DATE'], format="%d/%m/%Y")
    # assign() builds a new frame, so neither step below touches the input.
    return df.assign(PURCHASE_DATE=parsed_dates).set_index('PURCHASE_DATE')

# More housekeeping: parse PURCHASE_DATE and make it the index of both the
# test-key subset (temp_df) and the full purchase frame
temp_df = parse_datetime(temp_df)
purchase_df = parse_datetime(purchase_df)

In [None]:
# Create Groups of timescale dataset
# Default half-month windows as (first day, last day), both inclusive.
_DEFAULT_PERIODS = (
    ('2018-10-01', '2018-10-15'),
    ('2018-10-16', '2018-10-31'),
    ('2018-11-01', '2018-11-15'),
    ('2018-11-16', '2018-11-30'),
    ('2018-12-01', '2018-12-15'),
    ('2018-12-16', '2018-12-31'),
    ('2019-01-01', '2019-01-15'),
    ('2019-01-16', '2019-01-31'),
    ('2019-02-01', '2019-02-15'),
    ('2019-02-16', '2019-02-28'),
)


def slice_dataframe_by_dates(df, periods=None):
    """Split ``df`` into one sub-frame per date window.

    Args:
        df: DataFrame exposing ``PURCHASE_DATE`` either as a column or as the
            name of its index (``DataFrame.query`` resolves both).
        periods: Optional iterable of ``(start, end)`` date strings; both
            bounds are inclusive. Defaults to the ten half-month windows from
            2018-10-01 to 2019-02-28.

    Returns:
        A list of DataFrames, one per window, in the order of ``periods``.
    """
    if periods is None:
        periods = _DEFAULT_PERIODS
    return [
        df.query(f"PURCHASE_DATE >= '{start}' and PURCHASE_DATE <= '{end}'")
        for start, end in periods
    ]

# Slice both frames with slice_dataframe_by_dates: one sub-frame per date window
test_purchase_groups = slice_dataframe_by_dates(temp_df)
all_purchase_groups = slice_dataframe_by_dates(purchase_df)


In [None]:
# from the small slices make the main set
def create_dataset_from_groups(unique_keys_list, days_groups):
    """Build one row per key with a 0/1 flag per date group.

    Args:
        unique_keys_list: MAGIC_KEY values; one output row is created per entry.
        days_groups: Sequence of DataFrames, each with a ``MAGIC_KEY`` column.

    Returns:
        DataFrame with a ``MAGIC_KEY`` column followed by ``col_0``, ``col_1``,
        ... (one per group, ``uint8``), set to 1 when the key occurs in that
        group and 0 otherwise.
    """
    features = pd.DataFrame()
    features['MAGIC_KEY'] = unique_keys_list
    keys = features['MAGIC_KEY']

    for position, group in enumerate(days_groups):
        seen_in_group = keys.isin(group['MAGIC_KEY'])
        features[f'col_{position}'] = seen_in_group.astype(np.uint8)
    return features

# Inputs come from the cells above: test_purchase_groups, temp_df, test_keys
# and all_purchase_groups.
# NOTE(review): days_groups and dataset are not read anywhere in the visible
# notebook - candidates for removal once confirmed unused.
days_groups = test_purchase_groups
dataset = temp_df

# Create test and train datasets: one row per key, one 0/1 column per date window

test_df = create_dataset_from_groups(test_keys, test_purchase_groups)
train_df = create_dataset_from_groups(train_keys, all_purchase_groups)

In [None]:
# Rename the generated columns to ordinal labels for convenience
def process_dataset(df):
    """Return ``df`` with ``col_0`` ... ``col_9`` renamed to ``1st`` ... ``10th``.

    Columns outside that set keep their names; the input frame is not modified.
    """
    ordinal_labels = ('1st', '2nd', '3rd', '4th', '5th',
                      '6th', '7th', '8th', '9th', '10th')
    renames = {f'col_{position}': label
               for position, label in enumerate(ordinal_labels)}
    return df.rename(columns=renames)

# Make the train/test sets: apply the ordinal column names to both frames
test_df = process_dataset(test_df)
train_df = process_dataset(train_df)

In [None]:
# Save the processed dataset (without the row index)
# NOTE(review): the training section below reads its inputs from
# './preprocessed_dataset/preprocessed_*_dataset.csv', not from these paths -
# confirm how the files written here get there.
test_df.to_csv('data/problem-1-test.csv', index=False)
train_df.to_csv('data/problem-1-train.csv', index=False)

# Load Preprocessed dataset and Train Models

In [None]:
# Import Datasets
# NOTE(review): the preprocessing section above writes 'data/problem-1-train.csv'
# and 'data/problem-1-test.csv'; these paths differ - confirm they hold the same data.
problem_df = pd.read_csv('./dataset/problem 1.csv')
train_dataset = pd.read_csv('./preprocessed_dataset/preprocessed_train_dataset.csv')
test_dataset = pd.read_csv('./preprocessed_dataset/preprocessed_test_dataset.csv')

# Quick sanity check of the (rows, columns) of both frames
train_dataset.shape, test_dataset.shape

In [None]:
# Drop the 'MAGIC_KEY' identifier column from both frames (in place)
train_dataset.drop('MAGIC_KEY', axis=1, inplace=True)
# Keep the test keys first: they are needed later to label the submission rows
magic_keys = list(test_dataset['MAGIC_KEY'])
test_dataset.drop('MAGIC_KEY', axis=1, inplace=True)

# Preview the first three training rows
train_dataset.head(3)

#### View Correlation Heatmap

In [None]:
def show_heatmap(df):
    """Plot an annotated heatmap of the pairwise column correlations of ``df``."""
    correlations = df.corr()

    plt.figure(figsize=(10, 10))
    # annot=True writes each correlation value inside its cell
    sns.heatmap(correlations, annot=True)
    plt.title('Correlation Heatmap')
    plt.show()

In [None]:
# Show the correlation heatmap of the training columns for better understanding
show_heatmap(train_dataset)

#### Splitting and Formatting DataFrame

In [None]:
# Split in train and test format
def get_train_test(df, test_size=None, random_state=None):
    """Split ``df`` into train/test features and labels, using ``10th`` as label.

    Args:
        df: DataFrame containing a ``10th`` column; every other column is
            used as a feature.
        test_size: Forwarded to ``train_test_split``. ``None`` keeps
            scikit-learn's default (0.25 when ``train_size`` is also unset).
        random_state: Forwarded to ``train_test_split``. Pass an int to make
            the shuffle, and therefore the reported scores, reproducible.
            ``None`` (the default) keeps the previous non-deterministic split.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.
    """
    labels = df['10th']
    features = df.drop('10th', axis=1)
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    return (x_train, x_test, y_train, y_test)

In [None]:
# Temporarily split the train set for understanding model performance.
# The sample is capped at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
x_train, x_test, y_train, y_test = get_train_test(
    train_dataset.sample(min(20000, len(train_dataset))), 0.2
)

x_train.shape, x_test.shape

#### Parameter Searching

In [None]:
# Model classes are already imported in the dependencies cell at the top of the notebook



In [None]:
# List of models with parameters for checking the best fit model.
# Each entry maps a display name to the estimator ('model') and the
# hyper-parameter grid ('params') that GridSearchCV explores for it.

model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20, 30, 50, 100, 500],
            'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        },
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1, 5, 10, 18, 20, 30],
            'n_jobs': [-1],
        },
    },
    'logistic_regression': {
        # multi_class='auto' is already the default and the argument is
        # deprecated in recent scikit-learn, so it is no longer passed.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
            # n_jobs is deliberately not searched: the liblinear solver
            # ignores it and scikit-learn warns when it is set.
            'max_iter': [1000],
        },
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy', 'log_loss'],
        },
    },
    # The two naive Bayes models are evaluated with their default settings
    # (an empty grid yields a single candidate).
    'multinomial_nb': {
        'model': MultinomialNB(),
        'params': {},
    },
    'gaussian_nb': {
        'model': GaussianNB(),
        'params': {},
    },
    'gradient-boosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'loss': ['log_loss', 'exponential'],
            'learning_rate': [0.1, 0.01, 0.001],
        },
    },
    'hist-gradient-boosting': {
        'model': HistGradientBoostingClassifier(),
        'params': {
            'learning_rate': [0.1, 0.01, 0.001],
            'max_iter': [1000, 500],
        },
    },
}

In [None]:
# Store model scores: one dict per candidate model
scores = []

# Find our best fitting model: run a 5-fold cross-validated grid search for
# every entry of model_params on the temporary training split
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(x_train, y_train)
    # Keep the best cross-validation score and the parameters that achieved it
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [None]:
# Get the best results of the models as a table (one row per model) and display it
df_res = pd.DataFrame(scores,columns=['model', 'best_score', 'best_params'])
df_res

#### Predict Using the Best Model and Save

In [None]:
# Manually check model performance
def fit_and_evaluate(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training data and report its score on the test data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        x_train, y_train: Training features and labels.
        x_test, y_test: Held-out features and labels used for scoring.

    Returns:
        The value of ``model.score(x_test, y_test)``. It is also printed, as
        before, so existing notebook cells keep their output.
    """
    model.fit(x_train, y_train)
    # Get scores on the test data
    result = model.score(x_test, y_test)
    print(result)
    # Returned as well so callers can compare models programmatically.
    return result

In [None]:
# Rolling algorithm for best match
def rolling_algorithm(df):
    """Shift the period columns one step back and drop ``10th``.

    After the call ``1st`` holds the former ``2nd`` values, ``2nd`` the former
    ``3rd`` values, and so on up to ``9th`` holding the former ``10th`` values.

    The input frame is left untouched. Previously the columns were overwritten
    in place on the caller's frame, so calling the function twice on the same
    object shifted the data twice.

    Args:
        df: DataFrame containing the columns ``1st`` ... ``10th``.

    Returns:
        A new DataFrame with the shifted columns and without ``10th``.
    """
    ordinals = ['1st', '2nd', '3rd', '4th', '5th',
                '6th', '7th', '8th', '9th', '10th']
    shifted = df.copy()
    # Every source column is read from the untouched input, so the order of
    # the assignments cannot affect the result.
    for target, source in zip(ordinals, ordinals[1:]):
        shifted[target] = df[source]
    return shifted.drop('10th', axis=1)


In [None]:
# Store the predicted result in a CSV
def save_results(magic_keys, predicts, file_name):
    """Write the predictions to ``file_name`` as a two-column CSV.

    Args:
        magic_keys: Key values, written to the ``MAGIC_KEY`` column.
        predicts: Numeric predictions; 0 is written as "N" and 1 as "Y"
            (any other value maps to ``None``).
        file_name: Destination path of the CSV file.
    """
    label_for = {0: "N", 1: "Y"}

    submission = pd.DataFrame({
        'MAGIC_KEY': magic_keys,
        'PURCHASE': list(map(label_for.get, predicts)),
    })

    # index=False: the submission holds only the two named columns
    submission.to_csv(file_name, index=False)

In [None]:
# Fit the model, predict on the test set, and store results
def fit_predict_and_save(model, x_train, y_train, test_ds, X_test, Y_test, magic_keys, file_name):
    """Fit ``model``, print its accuracy on a labelled set and save predictions.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        x_train, y_train: Data the model is fitted on.
        test_ds: Features whose predictions are written to the output file.
        X_test, Y_test: Labelled features used only for the printed accuracy.
        magic_keys: Keys written next to the predictions for ``test_ds``.
        file_name: Path of the CSV file to write.
    """
    model.fit(x_train, y_train)

    # Predictions that go into the output file
    submission_preds = model.predict(test_ds)

    # Accuracy on the labelled set, for information only
    holdout_score = accuracy_score(Y_test, model.predict(X_test))
    print(f'Test Score: {holdout_score}')

    save_results(magic_keys, submission_preds, file_name)
    print(f'File: {file_name} Stored Successfully!')

In [None]:
# Training dataset for final training  
def get_final_train_data(df):
    """Return ``(X, Y)`` for the final fit, with ``9th`` as the label.

    ``X`` is ``df`` without the ``9th`` and ``10th`` columns; ``Y`` is the
    ``9th`` column. The input frame is not modified.
    """
    target = df['9th']
    features = df.drop(columns=['10th', '9th'])
    return (features, target)

In [None]:
# Labelled evaluation data from the test frame: the first eight columns are the
# features and the ninth column is the label
# (assumes the columns are ordered 1st ... 10th after MAGIC_KEY was dropped - TODO confirm)
test_X = test_dataset.iloc[:, :8].to_numpy()
test_Y = test_dataset.iloc[:, 8].to_numpy()

# Features for the submission: columns at positions 2..9, i.e. the last eight
# of ten columns. Presumably converted to NumPy so the column labels do not
# have to match the names the model is fitted on - verify.
test_dataset1 = test_dataset.iloc[:, 2:10].to_numpy()

In [None]:
# Display the submission feature matrix as a visual sanity check
test_dataset1

In [None]:
# Final training data: X is every column except 9th/10th, Y is the 9th column
X, Y = get_final_train_data(train_dataset)
# Disabled alternative: shift the test frame with rolling_algorithm instead of slicing
# test_dataset = rolling_algorithm(test_dataset)



# Both shapes should report the same number of feature columns
X.shape, test_dataset1.shape

In [None]:
# Work on final model: fit a random forest on (X, Y), print its accuracy on
# (test_X, test_Y) and write the predictions for test_dataset1 to the submission file
# NOTE(review): the './solution/' directory must already exist - to_csv does not create it.
model = RandomForestClassifier(n_estimators=18, max_depth=15, n_jobs=-1, verbose=1)
fit_predict_and_save(model, X, Y, test_dataset1, test_X, test_Y, magic_keys, "./solution/submission_random_forest.csv")