In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import pandas as pd

file_path = 'translated_data.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path, encoding='utf-8')

# The two rate columns are the regression targets. They are read as strings
# with a trailing '%', so strip the sign and rescale to a 0-1 fraction.
target_columns = ['Open Rate', 'Click Rate']
for rate_column in target_columns:
    df[rate_column] = df[rate_column].str.rstrip('%').astype('float') / 100.0

# features and target variables
text_features_column = 'Subjectline_English'
categorical_features = ['Region', 'Division', 'MAP_B&B', 'Email_Type']

# Feature Engineering
text_vectorizer = TfidfVectorizer(
    stop_words=None,
    sublinear_tf=True,
    max_features=1000
)

categorical_transformer = OneHotEncoder(handle_unknown='ignore')


def build_preprocessor():
    """Return a new, unfitted ColumnTransformer for the text and categorical columns.

    The 'text' transformer is listed first and the 'cat' transformer second.
    ``text_vectorizer`` and ``categorical_transformer`` act as templates here.
    NOTE(review): ColumnTransformer is expected to fit clones of them (exposed
    via ``named_transformers_``) rather than the templates -- confirm against
    the installed scikit-learn version.
    """
    return ColumnTransformer(
        transformers=[
            ('text', text_vectorizer, text_features_column),
            ('cat', categorical_transformer, categorical_features)
        ])


# Each pipeline gets its own preprocessor instance. A Pipeline fits the step
# objects it is given, so sharing one ColumnTransformer between two pipelines
# would let the second fit() overwrite the state the first pipeline relies on.
preprocessor = build_preprocessor()

# pipeline with preprocessing and model training for open rate
open_rate_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

# pipeline with preprocessing and model training for click rate
click_rate_pipeline = Pipeline([
    ('preprocessor', build_preprocessor()),
    ('model', RandomForestRegressor())
])

# Every column except the two targets is handed to the pipelines; the
# ColumnTransformer then selects the text and categorical columns it needs.
features = df.drop(target_columns, axis=1)

# models for each target variable
open_rate_pipeline.fit(features, df['Open Rate'])
click_rate_pipeline.fit(features, df['Click Rate'])

# Top keywords for each (Division, Region, MAP_B&B) combination, ranked by the
# importance the open-rate model assigns to each subject-line token.
unique_divisions = df['Division'].unique()
unique_regions = df['Region'].unique()
unique_mab_bnb = df['MAP_B&B'].unique()

# Upper bound on the number of keywords printed per combination.
max_keywords = 1000

# Use the vectorizer that was fitted inside the open-rate pipeline. Its
# vocabulary defines the column order the model was trained on, so only its
# indices can be matched against the model's feature importances.
fitted_preprocessor = open_rate_pipeline.named_steps['preprocessor']
fitted_vectorizer = fitted_preprocessor.named_transformers_['text']
feature_names_text = fitted_vectorizer.get_feature_names_out()

# The model's importances span the text columns and the one-hot categorical
# columns; keep only the slice produced by the 'text' transformer.
text_columns = fitted_preprocessor.output_indices_['text']
feature_importance_text = (
    open_rate_pipeline.named_steps['model'].feature_importances_[text_columns]
)

for division in unique_divisions:
    for region in unique_regions:
        for mab_bnb in unique_mab_bnb:
            subset_df = df[(df['Division'] == division) & (df['Region'] == region) & (df['MAP_B&B'] == mab_bnb)]

            if subset_df.empty:
                continue

            # Vocabulary columns that actually occur in this subset's subject lines
            subset_tfidf = fitted_vectorizer.transform(subset_df[text_features_column])
            present_indices = set(subset_tfidf.nonzero()[1].tolist())

            # Most important first; ties are broken by column index so the
            # output order is deterministic for a given fitted model.
            ranked_indices = sorted(
                present_indices,
                key=lambda idx: (-feature_importance_text[idx], idx)
            )

            # actual top keywords, removing numbers and non-alphabetic tokens
            top_keywords = [
                feature_names_text[idx]
                for idx in ranked_indices
                if feature_names_text[idx].isalpha()
            ][:max_keywords]

            # results or further analysis
            print(f"For Division '{division}', Region '{region}', and MAP_B&B '{mab_bnb}':")
            print(f"Top Keywords: {', '.join(top_keywords)}")
            print("\n")
