In [None]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
import pickle

# Preprocessing the dataset
def preprocess_data(data):
    """Attach a ``combined_features`` text column built from descriptive fields.

    For every row, each of the descriptive columns is converted to a string,
    lower-cased, and has every run of non-word characters replaced by a
    single space. The cleaned pieces are then joined with single spaces.
    Missing values are treated as empty strings.

    Args:
        data: DataFrame containing all of the descriptive columns listed in
            the function body.

    Returns:
        The same DataFrame object that was passed in; the new column is
        added to it in place.
    """
    text_columns = [
        'category', 'tags', 'location', 'cuisine_type', 'attraction_type',
        'family_friendly', 'star_rating', 'best_time_to_visit'
    ]

    def _clean(text):
        # Replace each run of non-word characters with one space.
        return re.sub(r'\W+', ' ', text)

    def _row_to_text(row):
        lowered = row.map(str).str.lower()
        return ' '.join(lowered.map(_clean))

    filled = data[text_columns].fillna('')
    data['combined_features'] = filled.apply(_row_to_text, axis=1)
    return data

# Creating TF-IDF matrix for text features
def create_tfidf_matrix(data):
    """Fit a TF-IDF vectorizer on the ``combined_features`` column.

    Args:
        data: DataFrame that already has a ``combined_features`` text column.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the result of ``fit_transform`` on
        the column, and the fitted ``TfidfVectorizer`` instance.
    """
    tfidf = TfidfVectorizer()
    corpus = data['combined_features']
    features = tfidf.fit_transform(corpus)
    return features, tfidf

# Preparing data for training (text and numerical)
def prepare_ann_data(data, tfidf_matrix):
    """Split text features, numeric features, and labels into train/test sets.

    Args:
        data: DataFrame holding the numeric columns ``rating``,
            ``reviews_count`` and ``popularity_score`` and, optionally,
            ``target_column``. When ``target_column`` is absent, one is added
            to ``data`` in place, filled with random 0/1 placeholder labels,
            and a warning is emitted.
        tfidf_matrix: Text features with one row per row of ``data``. May be
            a sparse matrix exposing ``toarray()`` or any dense array-like
            that ``np.asarray`` can convert.

    Returns:
        The list produced by ``train_test_split``, in the order
        ``[X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test]``.
        20% of the rows go to the test split, with ``random_state=42``.
    """
    import warnings

    if 'target_column' not in data.columns:
        # Keep the original placeholder behaviour, but make it visible: a
        # model fitted on these labels cannot learn anything meaningful.
        warnings.warn(
            "'target_column' not found in data; generating random binary "
            "placeholder labels.",
            UserWarning,
            stacklevel=2,
        )
        data['target_column'] = np.random.randint(0, 2, size=len(data))
    y = data['target_column']

    # Select numerical features; missing values are replaced with 0.
    numerical_features = ['rating', 'reviews_count', 'popularity_score']
    numerical_data = data[numerical_features].fillna(0).to_numpy()

    # Use toarray() rather than todense(): the latter yields the legacy
    # np.matrix type, whereas downstream code expects a plain ndarray.
    if hasattr(tfidf_matrix, 'toarray'):
        text_data = tfidf_matrix.toarray()
    else:
        text_data = np.asarray(tfidf_matrix)

    # Split data
    return train_test_split(text_data, numerical_data, y, test_size=0.2, random_state=42)

# Building the improved ANN model
def build_improved_ann(text_dim, numeric_dim):
    """Build and compile the two-input binary classification network.

    The text input and the numeric input each pass through their own
    regularized hidden stage (128 and 64 units respectively). The two
    branches are concatenated, passed through a further 64-unit stage, and
    reduced to a single sigmoid output.

    Args:
        text_dim: Width of the text feature vector.
        numeric_dim: Width of the numeric feature vector.

    Returns:
        A compiled Keras ``Model`` taking ``[text_input, numeric_input]``,
        using Adam (learning rate 0.001), binary cross-entropy loss and an
        accuracy metric.
    """
    def _hidden_stage(tensor, units):
        # ReLU dense layer with L2(0.01) kernel penalty, then 20% dropout.
        activated = Dense(units, activation='relu', kernel_regularizer=l2(0.01))(tensor)
        return Dropout(0.2)(activated)

    # Text branch
    text_input = Input(shape=(text_dim,), name='text_input')
    text_branch = _hidden_stage(text_input, 128)

    # Numeric branch
    numeric_input = Input(shape=(numeric_dim,), name='numeric_input')
    numeric_branch = _hidden_stage(numeric_input, 64)

    # Fuse both branches and apply one more hidden stage
    fused = _hidden_stage(Concatenate()([text_branch, numeric_branch]), 64)

    # Single-unit sigmoid output
    prediction = Dense(1, activation='sigmoid')(fused)

    network = Model(inputs=[text_input, numeric_input], outputs=prediction)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Training the ANN model
def train_ann(X_train_text, X_train_num, y_train):
    """Build the network and fit it on the training data.

    Args:
        X_train_text: 2-D text feature array; its second dimension sets the
            width of the text input.
        X_train_num: 2-D numeric feature array; its second dimension sets
            the width of the numeric input.
        y_train: Training labels.

    Returns:
        The fitted model. Training runs for at most 50 epochs with batch
        size 32 and a 20% validation split, with early stopping and
        learning-rate reduction both monitoring ``val_loss``.
    """
    n_text_features = X_train_text.shape[1]
    n_numeric_features = X_train_num.shape[1]
    network = build_improved_ann(n_text_features, n_numeric_features)

    # Stop once val_loss stalls for 5 epochs (restoring the best weights),
    # and halve the learning rate after 3 stalled epochs, down to 1e-6.
    training_callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6),
    ]

    network.fit(
        [X_train_text, X_train_num],
        y_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        callbacks=training_callbacks,
    )
    return network

# Evaluating the ANN model
def evaluate_model(model, X_test_text, X_test_num, y_test):
    """Print a classification report for the model on the test data.

    Args:
        model: Object whose ``predict`` accepts ``[X_test_text, X_test_num]``
            and returns scores comparable against 0.5.
        X_test_text: Text features of the test set.
        X_test_num: Numeric features of the test set.
        y_test: True labels of the test set.

    Returns:
        None. The report is written to standard output.
    """
    scores = model.predict([X_test_text, X_test_num])
    # Scores strictly above 0.5 are labelled 1; everything else is 0.
    predicted_labels = (scores > 0.5).astype(int)
    print("Model Evaluation:")
    report = classification_report(y_test, predicted_labels, zero_division=0)
    print(report)

# Main function
def main(data_path='mauritiusDataset.csv',
         model_path='ann_model_improved.keras',
         vectorizer_path='vectorizer.pkl'):
    """Run the full pipeline: load, preprocess, train, evaluate, and save.

    Args:
        data_path: Location of the CSV dataset passed to ``pd.read_csv``.
        model_path: Where the trained model is saved.
        vectorizer_path: Where the fitted TF-IDF vectorizer is pickled.

    Raises:
        FileNotFoundError: If the dataset cannot be found at ``data_path``.
            The message includes the current working directory, because
            relative paths resolve against it.
    """
    # Load dataset
    print("Loading dataset...")
    try:
        data = pd.read_csv(data_path)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"Dataset not found at {data_path!r} "
            f"(current working directory: {os.getcwd()!r}); "
            "pass the correct location via main(data_path=...)."
        ) from err
    data = preprocess_data(data)

    # Create TF-IDF matrix
    print("Creating TF-IDF matrix...")
    tfidf_matrix, vectorizer = create_tfidf_matrix(data)

    # Prepare data for training
    print("Preparing data for ANN training...")
    (X_train_text, X_test_text,
     X_train_num, X_test_num,
     y_train, y_test) = prepare_ann_data(data, tfidf_matrix)

    # Train ANN
    print("Training ANN model...")
    model = train_ann(X_train_text, X_train_num, y_train)

    # Evaluate ANN
    print("Evaluating model...")
    evaluate_model(model, X_test_text, X_test_num, y_test)

    # Save model and vectorizer
    print("Saving model and vectorizer...")
    model.save(model_path)
    # NOTE: the vectorizer is stored with pickle; only load this file back
    # from a trusted source.
    with open(vectorizer_path, 'wb') as f:
        pickle.dump(vectorizer, f)
    print("Model and vectorizer saved.")

# Entry point: run the full pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading dataset...


FileNotFoundError: [Errno 2] No such file or directory: './data/mauritiusDataset.csv'