In [1]:
import numpy as np

class ExpertSystem:
    def __init__(self):
        self.dataset = None
        self.user_goals = {"interpretability": False, "speed": False, "accuracy": False}
        self.is_classification = None
        self.is_regression = None
        self.is_labeled = None
        self.num_samples = None
        self.num_features = None
        self.are_features_important = None
        self.data_type = None
        self.suggested_models = []

    def ask_question(self, question):
        answer = input(question + " (y/n): ")
        if answer == "y":
            return True
        elif answer == "n":
            return False
        else:
            print("Invalid answer. Please enter y or n.")
            return self.ask_question(question)

    def get_user_input(self):
        # Basic dataset-related questions
        self.is_classification = self.ask_question("Is your dataset for classification?")
        self.is_regression = self.ask_question("Is your dataset for regression?")
        self.is_labeled = self.ask_question("Is your dataset labeled?")
        self.num_samples = self.ask_question("Do you have more than 100,000 samples?")
        self.num_features = self.ask_question("Do you have more than 20 features?")
        self.are_features_important = self.ask_question("Are the features in your dataset important?")

        # Determine the data type without asking the user directly
        if self.is_classification and self.is_regression:
            self.data_type = "both"
        elif self.is_classification:
            self.data_type = "categorical"
        elif self.is_regression:
            self.data_type = "numerical"
        else:
            self.data_type = "other"

        if self.data_type == "other":
            self.handle_neither_data_type()

        # Additional dataset-related questions (you can expand this list)
        self.user_goals["feature_selection"] = self.ask_question("Do you need automatic feature selection?")
        self.user_goals["cross_validation"] = self.ask_question("Do you require cross-validation for model evaluation?")
        self.user_goals["handle_imbalance"] = self.ask_question("Does your dataset have class imbalance?")
        self.user_goals["handle_missing_data"] = self.ask_question("Does your dataset have missing data?")
        self.user_goals["handle_outliers"] = self.ask_question("Does your dataset have outliers?")
        self.user_goals["handle_text_data"] = self.ask_question("Does your dataset contain text data?")
        self.user_goals["handle_vision_data"] = self.ask_question("Is your dataset related to computer vision tasks?")
        self.user_goals["handle_time_series"] = self.ask_question("Is your dataset related to time series data?")
        self.user_goals["handle_image_data"] = self.ask_question("Does your dataset contain image data?")

    def handle_neither_data_type(self):
        print("It seems your dataset is neither purely categorical nor purely numerical.")
        print("Please provide additional information about the nature of your data.")
        print("For example, specify if your data consists of large text strings, images, or other formats.")

    def suggest_models(self):
        # Use the user's answers to suggest three models
        self.suggested_models = []

        # Model for Text Data
        if self.user_goals["handle_text_data"]:
            self.suggested_models.append(("Large Language Model", "Utilize large pre-trained language models (e.g., GPT, BERT) for text data processing."))
            self.suggested_models.append(("TF-IDF with Cosine Similarity", "Use TF-IDF representation and cosine similarity for text-based tasks."))
            self.suggested_models.append(("Naive Bayes Classifier", "Apply Naive Bayes classification for text data."))

        # Model for Computer Vision Data
        elif self.user_goals["handle_vision_data"]:
            self.suggested_models.append(("Convolutional Neural Network (CNN)", "CNNs are effective for image classification tasks."))
            self.suggested_models.append(("Transfer Learning with Pre-trained Models", "Use pre-trained models like ResNet or VGG for feature extraction in vision tasks."))
            self.suggested_models.append(("Object Detection Model", "For tasks involving detecting objects in images, consider models like YOLO or SSD."))

        # Model for Time Series Data
        elif self.user_goals["handle_time_series"]:
            self.suggested_models.append(("Autoregressive Integrated Moving Average (ARIMA)", "ARIMA is a time series forecasting model that considers autoregression and moving averages."))
            self.suggested_models.append(("Long Short-Term Memory (LSTM) Networks", "LSTMs are deep learning models designed for sequence prediction tasks, making them suitable for time series forecasting."))

        # Model for Image Data
        elif self.user_goals["handle_image_data"]:
            self.suggested_models.append(("Convolutional Neural Network (CNN)", "CNNs are effective for image classification tasks."))
            self.suggested_models.append(("Transfer Learning with Pre-trained Models", "Use pre-trained models like ResNet or VGG for image classification tasks."))

        # Traditional Models
        else:
            # Model 1
            if self.is_classification:
                if self.is_labeled:
                    if self.num_samples:
                        if self.num_features:
                            if self.data_type == "categorical":
                                if self.user_goals["interpretability"]:
                                    self.suggested_models.append(("DecisionTreeClassifier", "Decision Tree Classifier is suitable for classification tasks, providing a clear decision-making process."))
                                elif self.user_goals["speed"]:
                                    self.suggested_models.append(("LogisticRegression", "Logistic Regression is a good choice for binary classification, providing probabilities for class membership."))
                                else:
                                    self.suggested_models.append(("RandomForestClassifier", "Random Forest Classifier is an ensemble method that can handle complex relationships in data and reduce overfitting."))
                        else:
                            self.suggested_models.append(("KNeighborsClassifier", "K-Nearest Neighbors Classifier is a simple and effective algorithm, suitable for small to medium-sized datasets."))
                    else:
                        self.suggested_models.append(("LogisticRegression", "Logistic Regression is a good choice for binary classification, providing probabilities for class membership."))
                else:
                    self.suggested_models.append(("DBSCAN", "DBSCAN is a density-based clustering algorithm suitable for discovering clusters of arbitrary shapes."))
            elif self.is_regression:
                if self.num_samples:
                    if self.data_type == "numerical":
                        if self.user_goals["accuracy"]:
                            self.suggested_models.append(("SVR", "Support Vector Regression is effective for regression tasks, especially when dealing with non-linear data."))
                        else:
                            self.suggested_models.append(("LinearRegression", "Linear Regression is a simple and interpretable model, suitable for linear relationships in data."))
            else:
                if self.are_features_important:
                    self.suggested_models.append(("KMeans", "K-Means Clustering is suitable for unsupervised learning tasks, partitioning data into clusters."))

            # Model 2
            if self.suggested_models and self.suggested_models[0][0] == "DecisionTreeClassifier":
                self.suggested_models.append(("RandomForestClassifier", "Random Forest Classifier is an ensemble method that can handle complex relationships in data and reduce overfitting."))

            # Model 3
            if self.suggested_models and self.suggested_models[0][0] == "RandomForestClassifier":
                self.suggested_models.append(("LinearSVC", "Linear Support Vector Classification is effective for linearly separable data in classification tasks."))

    def explain_models(self):
        # Print the ranked list of suggested models with explanations
        print("\nSummary:")
        selected_goals = [key.capitalize() for key, value in self.user_goals.items() if value]
        print(f"You have selected: {', '.join(selected_goals)}.")

        for rank, (model, explanation) in enumerate(self.suggested_models[:3], start=1):
            print(f"\nRank {rank}: {model}")
            print(explanation)
            self.print_model_details(model)

    def print_model_details(self, model):
        # Print detailed information about the suggested model
        if model == "Large Language Model":
            print("\nModel Details:")
            print("Large language models, such as GPT and BERT, are pre-trained on massive amounts of text data.")
            print("They excel at understanding and generating human-like text, making them suitable for various natural language processing tasks.")
            print("You can fine-tune these models for specific tasks like sentiment analysis, text classification, or language translation.")

        # Add details for other new models
        elif model == "TF-IDF with Cosine Similarity":
            print("\nModel Details:")
            print("TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic that reflects how important a word is to a document.")
            print("It is often used in conjunction with cosine similarity to measure the similarity between documents.")
            print("This approach is effective for text-based tasks such as document retrieval, clustering, and similarity analysis.")

        elif model == "Naive Bayes Classifier":
            print("\nModel Details:")
            print("Naive Bayes is a probabilistic classification algorithm based on Bayes' theorem.")
            print("It assumes that features are independent, given the class label, which simplifies the computation.")
            print("Naive Bayes is commonly used for text classification, spam filtering, and sentiment analysis.")

        elif model == "Convolutional Neural Network (CNN)":
            print("\nModel Details:")
            print("CNNs are deep learning models designed for processing structured grid data, such as images.")
            print("They use convolutional layers to automatically and adaptively learn hierarchical representations of the input data.")
            print("CNNs are widely used in image classification, object detection, and image segmentation tasks.")

        elif model == "Transfer Learning with Pre-trained Models":
            print("\nModel Details:")
            print("Transfer learning involves using a pre-trained model on a large dataset and fine-tuning it for a specific task.")
            print("Common pre-trained models include ResNet, VGG, and Inception, which have learned rich features from large image datasets.")
            print("Transfer learning is effective for tasks with limited labeled data, such as image recognition.")

        elif model == "Object Detection Model":
            print("\nModel Details:")
            print("Object detection models are designed to identify and locate objects within an image.")
            print("Models like YOLO (You Only Look Once) and SSD (Single Shot Multibox Detector) are popular choices.")
            print("These models are used in applications such as autonomous vehicles, surveillance, and image-based search.")

        elif model == "Autoregressive Integrated Moving Average (ARIMA)":
            print("\nModel Details:")
            print("ARIMA is a time series forecasting model that considers autoregression, differencing, and moving averages.")
            print("It is suitable for univariate time series data and can be used to make predictions based on historical observations.")

        elif model == "Long Short-Term Memory (LSTM) Networks":
            print("\nModel Details:")
            print("LSTMs are a type of recurrent neural network (RNN) designed for sequence prediction tasks.")
            print("They are effective for capturing long-term dependencies in sequential data, making them suitable for time series forecasting.")

        # Add details for other new models
        elif model == "K-Nearest Neighbors (KNN)":
            print("\nModel Details:")
            print("K-Nearest Neighbors is a simple and versatile algorithm used for both classification and regression tasks.")
            print("It classifies a data point based on the majority class of its k-nearest neighbors in the feature space.")
            print("KNN is suitable for small to medium-sized datasets, but it may be computationally expensive for large datasets.")

        elif model == "Support Vector Machines (SVMs)":
            print("\nModel Details:")
            print("Support Vector Machines are powerful algorithms used for both classification and regression tasks.")
            print("They work by finding the hyperplane that best separates classes in the feature space.")
            print("SVMs are effective for high-dimensional data and can handle non-linear relationships through kernel tricks.")

        elif model == "Regression Models":
            print("\nModel Details:")
            print("Regression models are used for predicting a continuous target variable.")
            print("Common regression models include Linear Regression, Polynomial Regression, and Support Vector Regression (SVR).")
            print("These models are suitable when the relationship between features and the target variable is continuous.")

        elif model == "Linear Regression":
            print("\nModel Details:")
            print("Linear Regression is a simple and interpretable model used for predicting a continuous target variable.")
            print("It assumes a linear relationship between the features and the target variable.")
            print("Linear Regression is suitable when the relationship is approximately linear.")

        elif model == "Polynomial Regression":
            print("\nModel Details:")
            print("Polynomial Regression is an extension of linear regression that allows for modeling non-linear relationships.")
            print("It involves fitting a polynomial equation to the data, providing flexibility in capturing complex patterns.")
            print("Polynomial Regression is useful when the relationship is not linear.")

        elif model == "Density-Based Spatial Clustering of Applications with Noise":
            print("\nModel Details:")
            print(" (Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm.")
            print("It groups data points that are close to each other and marks outliers as noise.")
            print("DBSCAN is effective for discovering clusters of arbitrary shapes and handling outliers.")

        elif model == "Spectral Clustering":
            print("\nModel Details:")
            print("Spectral Clustering is a graph-based clustering technique that uses the eigenvalues of the similarity matrix.")
            print("It is useful for identifying clusters with non-convex shapes and can capture complex relationships in the data.")

        elif model == "Gaussian Mixture Models (GMMs)":
            print("\nModel Details:")
            print("Gaussian Mixture Models represent data as a mixture of multiple Gaussian distributions.")
            print("Each component corresponds to a cluster, and data points are probabilistically assigned to clusters.")
            print("GMMs are versatile and can model complex data distributions.")

        elif model == "Dimensionality Reduction Models":
            print("\nModel Details:")
            print("Dimensionality reduction models aim to reduce the number of features in the data while preserving important information.")
            print("Common dimensionality reduction models include Logistic Regression, Principal Component Analysis (PCA), and Linear Discriminant Analysis (LDA).")

        elif model == "Logistic Regression":
            print("\nModel Details:")
            print("Logistic Regression is a popular model for binary classification tasks.")
            print("It models the probability of the class being 1 and is suitable for problems with a linear decision boundary.")

        elif model == "Principal Component Analysis (PCA)":
            print("\nModel Details:")
            print("Principal Component Analysis is a technique for dimensionality reduction.")
            print("It identifies the principal components in the data, allowing for a lower-dimensional representation while retaining key information.")

        elif model == "Linear Discriminant Analysis (LDA)":
            print("\nModel Details:")
            print("Linear Discriminant Analysis is a method for both dimensionality reduction and classification.")
            print("It seeks to find the linear combinations of features that best separate different classes in the data.")

        elif model == "Q-learning":
            print("\nModel Details:")
            print("Q-learning is a model-free reinforcement learning algorithm.")
            print("It learns a policy by interacting with an environment, assigning values (Q-values) to state-action pairs to optimize decision-making.")

        elif model == "Policy Gradient Methods":
            print("\nModel Details:")
            print("Policy Gradient Methods are reinforcement learning algorithms that directly optimize the policy function.")
            print("They aim to maximize the expected cumulative reward by adjusting the parameters of the policy.")

        elif model == "Bagging":
            print("\nModel Details:")
            print("Bagging, or Bootstrap Aggregating, is an ensemble technique that builds multiple models on bootstrap samples of the data.")
            print("It reduces overfitting and variance by averaging the predictions of individual models.")

        elif model == "Boosting":
            print("\nModel Details:")
            print("Boosting is an ensemble method that combines weak learners to create a strong learner.")
            print("It focuses on correcting errors made by previous models, emphasizing the importance of misclassified instances.")

        elif model == "TensorFlow":
            print("\nModel Details:")
            print("TensorFlow is an open-source machine learning framework developed by the Google Brain team.")
            print("It provides a comprehensive ecosystem for building and deploying machine learning models, especially deep neural networks.")

        elif model == "PyTorch":
            print("\nModel Details:")
            print("PyTorch is an open-source deep learning framework developed by Facebook.")
            print("It is known for its dynamic computational graph, making it flexible for research and development in deep learning.")

        elif model == "Keras":
            print("\nModel Details:")
            print("Keras is an open-source high-level neural network API written in Python.")
            print("It is designed for fast experimentation with deep neural networks and can run on top of TensorFlow, Theano, or Microsoft Cognitive Toolkit.")

        elif model == "spaCy":
            print("\nModel Details:")
            print("spaCy is an open-source natural language processing library for Python.")
            print("It provides efficient tools for tasks such as tokenization, part-of-speech tagging, named entity recognition, and more.")

        elif model == "NLTK":
            print("\nModel Details:")
            print("NLTK (Natural Language Toolkit) is a comprehensive library for natural language processing in Python.")
            print("It includes modules for text processing, classification, tokenization, stemming, tagging, parsing, and semantic reasoning.")

        elif model == "Hugging Face Transformers":
            print("\nModel Details:")
            print("Hugging Face Transformers is an open-source library providing pre-trained models for natural language processing.")
            print("It offers a wide range of transformer-based models, including BERT, GPT, and more, for various NLP tasks.")

        elif model == "Recommender Systems Models":
            print("\nModel Details:")
            print("Recommender Systems Models are designed to provide personalized recommendations to users.")
            print("Common approaches include collaborative filtering, content-based filtering, and hybrid methods combining both.")

        else:
            print("\nModel Details:")
            print("This is a placeholder for model details. You can provide specific details for this model.")

    def run_expert_system(self):
        self.get_user_input()
        self.suggest_models()
        self.explain_models()

# Example usage:
expert_system = ExpertSystem()
expert_system.run_expert_system()




Is your dataset for classification? (y/n):  y
Is your dataset for regression? (y/n):  y
Is your dataset labeled? (y/n):  y
Do you have more than 100,000 samples? (y/n):  yy


Invalid answer. Please enter y or n.


Do you have more than 100,000 samples? (y/n):  y
Do you have more than 20 features? (y/n):  y
Are the features in your dataset important? (y/n):  y
Do you need automatic feature selection? (y/n):  y
Do you require cross-validation for model evaluation? (y/n):  y
Does your dataset have class imbalance? (y/n):  y
Does your dataset have missing data? (y/n):  y
Does your dataset have outliers? (y/n):  y
Does your dataset contain text data? (y/n):  y
Is your dataset related to computer vision tasks? (y/n):  y
Is your dataset related to time series data? (y/n):  y
Does your dataset contain image data? (y/n):  y



Summary:
You have selected: Feature_selection, Cross_validation, Handle_imbalance, Handle_missing_data, Handle_outliers, Handle_text_data, Handle_vision_data, Handle_time_series, Handle_image_data.

Rank 1: Large Language Model
Utilize large pre-trained language models (e.g., GPT, BERT) for text data processing.

Model Details:
Large language models, such as GPT and BERT, are pre-trained on massive amounts of text data.
They excel at understanding and generating human-like text, making them suitable for various natural language processing tasks.
You can fine-tune these models for specific tasks like sentiment analysis, text classification, or language translation.

Rank 2: TF-IDF with Cosine Similarity
Use TF-IDF representation and cosine similarity for text-based tasks.

Model Details:
TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic that reflects how important a word is to a document.
It is often used in conjunction with cosine similarity to measure the si