In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Header applied to the test action-history file when it is read below
colnames = [
    'customer_code', 'Offer_id', 'Offer_subid', 'batch_id',
    'product_category', 'product_sub_category',
    'send_timestamp', 'open_timestamp',
]

# Read the four input files; low_memory=False is passed to handle the
# mixed-types warning. Only the test action history gets explicit names.
cdna = pd.read_csv('train_cdna_data.csv', low_memory=False)
train = pd.read_csv('train_action_history.csv', low_memory=False)
test_customers = pd.read_csv('test_cdna_data.csv', low_memory=False)
test_action_history = pd.read_csv('test_action_history.csv', names=colnames, low_memory=False)

# Give both cdna tables the same lower-case customer key as the action history
customer_key_rename = {'CUSTOMER_CODE': 'customer_code'}
cdna = cdna.rename(columns=customer_key_rename)
test_customers = test_customers.rename(columns=customer_key_rename)

# Show the loaded schemas
print("CDNA columns:", cdna.columns)
print("Train columns:", train.columns)
print("Test Action History columns:", test_action_history.columns)

# Peek at the first rows of test_action_history to spot loading problems
print(test_action_history.head())

# Fail early when a timestamp column is missing from either action history
required_columns = ['send_timestamp', 'open_timestamp']
for col in required_columns:
    if col not in train:
        raise KeyError(f"Column '{col}' not found in train dataset")
    if col not in test_action_history:
        raise KeyError(f"Column '{col}' not found in test_action_history dataset")

# Parse the timestamp columns of both frames; errors='coerce' turns values
# that do not match the format into NaT instead of raising
timestamp_format = '%Y-%m-%d %H:%M:%S'
for frame in (train, test_action_history):
    for ts_col in required_columns:
        frame[ts_col] = pd.to_datetime(frame[ts_col], format=timestamp_format, errors='coerce')

# Create time slots
def get_time_slot(dt):
    """Map a timestamp to a time-slot id based on its hour.

    Args:
        dt: Object exposing an ``hour`` attribute, or a null value
            (anything ``pd.isnull`` reports as missing).

    Returns:
        ``np.nan`` for a null input; otherwise 1 when hour < 12, 2 when
        hour < 15, 3 when hour < 18, and 4 for any later hour.
    """
    if pd.isnull(dt):
        return np.nan

    hour = dt.hour
    # Exclusive upper bounds of slots 1..3; whatever is left falls into slot 4.
    for slot, upper_bound in enumerate((12, 15, 18), start=1):
        if hour < upper_bound:
            return slot
    return 4

# Map send/open timestamps to slot ids. get_time_slot already returns NaN for
# missing timestamps, so no extra null guard is needed around it.
train['send_slot'] = train['send_timestamp'].apply(get_time_slot)
train['open_slot'] = train['open_timestamp'].apply(get_time_slot)

# Join cdna data with train and test_action_history.
# NOTE: groupby(...).last() takes the last non-null value of each column, so
# a customer's row may combine values from more than one batch.
latest_cdna = cdna.sort_values('batch_date').groupby('customer_code').last().reset_index()
train = train.merge(latest_cdna, on='customer_code', how='left')
test_action_history = test_action_history.merge(latest_cdna, on='customer_code', how='left')

# Feature engineering
train['day_of_week'] = train['send_timestamp'].dt.dayofweek
train['hour_of_day'] = train['send_timestamp'].dt.hour
test_action_history['day_of_week'] = test_action_history['send_timestamp'].dt.dayofweek
test_action_history['hour_of_day'] = test_action_history['send_timestamp'].dt.hour

# Target is 1 when the message was opened in the slot it was sent in.
# Element-wise comparison treats NaN as unequal, so unopened rows get 0.
train['target'] = (train['send_slot'] == train['open_slot']).astype(int)

# Prepare training data: time features plus every cdna column except the key
cdna_features = [c for c in latest_cdna.columns if c != 'customer_code']
features = ['day_of_week', 'hour_of_day', 'send_slot'] + cdna_features
# Copy so the encoding below writes to an independent frame, not a slice of train
X = train[features].copy()
y = train['target']

# Encode categorical features
categorical_features = X.select_dtypes(include=['object']).columns
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Mean imputation discards columns that contain no observed value, which would
# make the output narrower than X.columns; drop those columns up front.
X = X.dropna(axis=1, how='all')
features = list(X.columns)

# Handle missing values; keep the original index so rows of X still line up
# with rows of train (and with y)
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Train model
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on validation set; copy first so the new column is added to an
# independent frame rather than to a slice of X
X_val = X_val.copy()
X_val['predicted_prob'] = model.predict_proba(X_val)[:, 1]

# Rank time slots for each customer in validation set
def rank_slots(df):
    """Return the distinct send slots in *df*, highest predicted probability first.

    Args:
        df: DataFrame with 'send_slot' and 'predicted_prob' columns.

    Returns:
        list of 'send_slot' values ordered by descending 'predicted_prob'.
        Each slot appears once, at the position of its best-scoring row.
        *df* itself is not modified.
    """
    # A stable sort keeps the input row order among equal probabilities, so the
    # ranking is deterministic.
    ranked = df.sort_values('predicted_prob', ascending=False, kind='mergesort')
    # Several rows can share a slot; keep only the first (best) occurrence so
    # the result is a ranking of slots rather than of rows.
    return ranked['send_slot'].drop_duplicates().tolist()

# X_val is built from the model features only and has no 'customer_code'
# column, so grouping by that column name raises KeyError. Look the customer up
# in train through the row index instead.
# NOTE(review): relies on X_val's index labels matching train's index labels
# (both are default integer indexes in this script) - confirm if that changes.
val_customer_codes = train.loc[X_val.index, 'customer_code']
val_predictions = X_val.groupby(val_customer_codes).apply(rank_slots).reset_index()
val_predictions.columns = ['customer_code', 'predicted_slots_order']

# Calculate accuracy
def get_top_slot(slots):
    """Return the first (highest-ranked) entry of *slots*.

    Args:
        slots: Sequence of slot ids, best first.

    Returns:
        ``slots[0]``, or ``np.nan`` when *slots* is empty, so that applying
        this over a column does not abort on a customer with no ranking.
    """
    # len() rather than truthiness: also safe for array-like inputs.
    if len(slots) == 0:
        return np.nan
    return slots[0]

val_predictions['predicted_top_slot'] = val_predictions['predicted_slots_order'].apply(get_top_slot)

# Compare against validation rows only. Merging with all of train would also
# score rows the model was fitted on, so the figure would not be a validation
# accuracy.
# NOTE(review): the reference label is the slot each message was sent in;
# confirm whether open_slot is the intended ground truth.
val_truth = train.loc[X_val.index, ['customer_code', 'send_slot']]
val_predictions = val_predictions.merge(val_truth, on='customer_code', how='left')

# accuracy_score rejects NaN labels, so score only rows where both are known
scored = val_predictions.dropna(subset=['send_slot', 'predicted_top_slot'])
accuracy = accuracy_score(scored['send_slot'], scored['predicted_top_slot'])

print(f'Validation Accuracy: {accuracy:.4f}')