In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Load datasets
books_file_path = "C:\\Users\\hp\\Desktop\\AI Project\\Final model\\data\\BX-Books.csv"
users_file_path = "C:\\Users\\hp\\Desktop\\AI Project\\Final model\\data\\BX-Users.csv"
ratings_file_path = "C:\\Users\\hp\\Desktop\\AI Project\\Final model\\data\\BX-Book-Ratings.csv"  # Update with your file path

# Load data
books = pd.read_csv(books_file_path, encoding='latin1', sep=';', on_bad_lines='skip', low_memory=False)
users = pd.read_csv(users_file_path, encoding='latin1', sep=';', on_bad_lines='skip', low_memory=False)
ratings = pd.read_csv(ratings_file_path, encoding='latin1', sep=';', on_bad_lines='skip', low_memory=False)

# Merge datasets
merged_data = ratings.merge(users, on='User-ID').merge(books, on='ISBN')

# Drop rows with missing values
merged_data = merged_data.dropna()

# Select features and target
feature_columns = ['Age', 'Book-Rating']  # Add more numeric columns if needed
X = merged_data[feature_columns]
y = merged_data['Book-Rating']  # Example target: predicting book ratings

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a pipeline with scaling and logistic regression
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000, solver='saga'))

# Perform feature selection
n_features_to_select = min(X_train.shape[1], 10)  # Choose up to 10 features or the number of available features
rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)

# Fit RFE to the training data
rfe.fit(X_train, y_train)

# Transform the data
X_train_selected = rfe.transform(X_train)
X_test_selected = rfe.transform(X_test)

# Train and evaluate the model
model.fit(X_train_selected, y_train)
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)

# Print results
print("Feature Names:", X.columns.tolist())
print("Selected Features:", X.columns[rfe.support_].tolist())
print("Model Accuracy after Feature Selection:", accuracy)


Feature Names: ['Age', 'Book-Rating']
Selected Features: ['Age', 'Book-Rating']
Model Accuracy after Feature Selection: 0.9976768780781365
