# In [4]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import Table, Column, Integer, Float, MetaData
import numpy as np
from sklearn.linear_model import LinearRegression
from bokeh.plotting import figure, show
from bokeh.layouts import column
from sklearn.metrics import mean_squared_error


class DataHandler:
    """Run the CSV -> SQLite -> regression -> mapping -> plot pipeline.

    The pipeline steps fill in the instance attributes:

    * ``training_data``, ``ideal_functions``, ``test_data`` -- DataFrames read
      by ``load_data``.
    * ``db_engine`` -- SQLAlchemy engine created by ``create_database``.
    * ``chosen_ideal_functions`` -- fitted ``LinearRegression`` models, set by
      ``regression_analysis`` (the attribute does not exist before that call).
    * ``max_training_deviations`` -- largest absolute training residual of
      each chosen model, in the same order, set by ``regression_analysis``.
    """

    # Training columns that each get their own regression model.
    _TRAINING_Y_COLUMNS = ('Y1', 'Y2', 'Y3', 'Y4')
    # Number of best-fitting models kept by regression_analysis.
    _MAX_CHOSEN = 4
    # Colours cycled through for the fitted lines so they can be told apart;
    # blue, red and orange are already used by the scatter glyphs.
    _LINE_COLORS = ('green', 'purple', 'teal', 'brown')

    def __init__(self):
        self.training_data = None
        self.ideal_functions = None
        self.test_data = None
        self.db_engine = None
        self.metadata = MetaData()

    def load_data(self, train_path='train.csv', ideal_path='ideal.csv',
                  test_path='test.csv'):
        """Read the three CSV files into pandas DataFrames.

        Args:
            train_path: CSV file with the training data.
            ideal_path: CSV file with the ideal functions.
            test_path: CSV file with the test data.
        """
        self.training_data = pd.read_csv(train_path)
        self.ideal_functions = pd.read_csv(ideal_path)
        self.test_data = pd.read_csv(test_path)

    def create_database(self, db_url='sqlite:///database.db'):
        """Create the database and store the three loaded DataFrames in it.

        Args:
            db_url: SQLAlchemy database URL of the database to create/open.

        Raises:
            ValueError: If ``load_data`` has not been run yet.
        """
        frames = {
            'training_data': self.training_data,
            'ideal_functions': self.ideal_functions,
            'test_data': self.test_data,
        }
        if any(frame is None for frame in frames.values()):
            raise ValueError("Data not loaded. Run load_data first.")

        self.db_engine = create_engine(db_url, echo=False)

        # extend_existing=True lets this method be called more than once on
        # the same MetaData without "table is already defined" errors.
        Table('training_data', self.metadata,
              Column('X', Float),
              Column('Y1', Float),
              Column('Y2', Float),
              Column('Y3', Float),
              Column('Y4', Float),
              extend_existing=True)

        # Columns for the 50 ideal functions are generated dynamically.
        ideal_functions_columns = [Column(f'Y{i}', Float) for i in range(1, 51)]
        Table('ideal_functions', self.metadata,
              Column('X', Float),
              *ideal_functions_columns,
              extend_existing=True)

        Table('test_data', self.metadata,
              Column('X', Float),
              Column('Y', Float),
              Column('Delta_Y', Float),
              Column('No_of_ideal_func', Integer),
              extend_existing=True)

        self.metadata.create_all(self.db_engine)

        # NOTE: if_exists='replace' drops and recreates each table from the
        # DataFrame, so the stored schema follows the CSV columns rather than
        # the declarations above.
        for table_name, frame in frames.items():
            frame.to_sql(table_name, self.db_engine, if_exists='replace', index=False)

    def regression_analysis(self):
        """Fit one least-squares line per training column and keep the best.

        Steps:
        1. Separate X and the Y1..Y4 columns of the training data.
        2. Fit a ``LinearRegression`` of each Y column on X.
        3. Keep up to four models, ordered by ascending sum of squared
           residuals, in ``self.chosen_ideal_functions``.

        The largest absolute residual of each kept model is stored in
        ``self.max_training_deviations`` (same order) so that
        ``map_test_data`` can derive its acceptance threshold from it.

        Raises:
            ValueError: If the training data has not been loaded.
        """
        if self.training_data is None:
            raise ValueError("Training data not available. Run load_data first.")

        # X is treated as a single feature, hence the (n, 1) reshape.
        x_train = self.training_data['X'].values.reshape(-1, 1)
        y_train = self.training_data[list(self._TRAINING_Y_COLUMNS)].values

        models = []
        squared_sums = []
        max_residuals = []
        for y_column in y_train.T:
            model = LinearRegression()
            model.fit(x_train, y_column)
            residuals = y_column - model.predict(x_train)

            models.append(model)
            squared_sums.append(np.sum(residuals ** 2))
            max_residuals.append(np.max(np.abs(residuals)))

        # Lower sum of squared residuals is better.
        best = np.argsort(squared_sums)[:self._MAX_CHOSEN]
        self.chosen_ideal_functions = [models[i] for i in best]
        self.max_training_deviations = [float(max_residuals[i]) for i in best]

    def _require_models(self):
        """Return the chosen models or raise if regression has not run."""
        models = getattr(self, 'chosen_ideal_functions', None)
        if models is None or len(models) == 0:
            raise ValueError("Chosen ideal functions not available. Run regression_analysis first.")
        return models

    def map_test_data(self):
        """Map every test point to the closest acceptable chosen function.

        Steps:
        1. Predict Y for all test X values with every chosen model.
        2. Compute the absolute deviation of each prediction from the test Y.
        3. A model is acceptable for a point when the deviation is at most
           sqrt(2) times that model's largest training residual
           (``self.max_training_deviations``). The point is assigned to the
           acceptable model with the smallest deviation.

        If ``max_training_deviations`` is not set (models were supplied
        manually), every finite deviation is acceptable, i.e. the nearest
        model is always assigned.

        Results are written to ``self.test_data``:

        * ``Deviation`` -- list of absolute deviations, one per chosen model.
        * ``Assigned_Function`` -- 1-based index into
          ``chosen_ideal_functions`` (nullable ``Int64``); missing when no
          model is acceptable.

        Raises:
            ValueError: If the models or the test data are missing, or if the
                number of thresholds does not match the number of models.
        """
        models = self._require_models()
        if self.test_data is None:
            raise ValueError("Test data not available. Load test data first.")

        if len(self.test_data) == 0:
            # Nothing to predict; still create the result columns.
            self.test_data['Deviation'] = []
            self.test_data['Assigned_Function'] = pd.array([], dtype='Int64')
            return

        x_test = self.test_data['X'].to_numpy(dtype=float).reshape(-1, 1)
        y_test = self.test_data['Y'].to_numpy(dtype=float)

        # One predict call per model instead of one per row and model;
        # the result has shape (n_points, n_models).
        predictions = np.column_stack(
            [np.asarray(model.predict(x_test), dtype=float).ravel() for model in models]
        )
        abs_dev = np.abs(predictions - y_test[:, np.newaxis])

        # NaN deviations can never be accepted.
        within = np.isfinite(abs_dev)
        limits = getattr(self, 'max_training_deviations', None)
        if limits is not None:
            limits = np.asarray(limits, dtype=float)
            if limits.shape != (len(models),):
                raise ValueError("max_training_deviations must hold one value per chosen ideal function.")
            within &= abs_dev <= np.sqrt(2) * limits

        # Rejected entries are masked with +inf so argmin only sees accepted ones.
        nearest = np.where(within, abs_dev, np.inf).argmin(axis=1)
        matched = within.any(axis=1)
        assigned = [int(k) + 1 if ok else None for k, ok in zip(nearest, matched)]

        # Both columns always have one entry per test row.
        self.test_data['Deviation'] = abs_dev.tolist()
        self.test_data['Assigned_Function'] = pd.array(assigned, dtype='Int64')

    def visualize_data(self):
        """Plot training Y1, the fitted lines, the test data and deviations.

        The deviations series (largest deviation per test point) is only
        drawn when ``map_test_data`` has added the ``Deviation`` column.

        Raises:
            ValueError: If the models, the test data or the training data
                are not available.
        """
        models = self._require_models()
        # The attributes always exist (set to None in __init__), so test the
        # value rather than using hasattr.
        if self.test_data is None:
            raise ValueError("Test data not available. Load test data first.")
        if self.training_data is None:
            raise ValueError("Training data not available. Load training data first.")

        p = figure(title="Data Visualization", x_axis_label='X', y_axis_label='Y')

        # scatter() draws circle markers by default; circle() is deprecated
        # for this use in recent Bokeh releases.
        p.scatter(self.training_data['X'], self.training_data['Y1'],
                  legend_label='Training Y1', color='blue')

        # Fitted lines, evaluated on an even grid over the training X range.
        x_vals = np.linspace(self.training_data['X'].min(), self.training_data['X'].max(), 100)
        grid = x_vals.reshape(-1, 1)
        for idx, model in enumerate(models):
            p.line(x_vals, model.predict(grid),
                   legend_label=f'Ideal Function {idx+1}', line_width=2,
                   color=self._LINE_COLORS[idx % len(self._LINE_COLORS)])

        p.scatter(self.test_data['X'], self.test_data['Y'],
                  legend_label='Test Data Y', color='red')

        if 'Deviation' in self.test_data.columns:
            deviations = [max(dev) for dev in self.test_data['Deviation']]
            p.scatter(self.test_data['X'], deviations,
                      legend_label='Deviations', color='orange')

        show(p)

    def run(self):
        """Execute all pipeline steps in order."""
        self.load_data()
        self.create_database()
        self.regression_analysis()
        self.map_test_data()
        self.visualize_data()
        
        
        
        
if __name__ == "__main__":
    # Create the DataHandler instance that drives the pipeline
    data_handler = DataHandler()

    # Execute the data handling process
    data_handler.run()

    
    