<a href="https://colab.research.google.com/github/astonaIT/Coursera-Capstone-Repository/blob/master/Copy_of_PythonProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing all libraries and tools that will be used
import pandas as pd

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource
from bokeh.io import curdoc
from bokeh.palettes import Dark2_5 as palette
from bokeh.layouts import row

In [None]:
from random import choice

In [None]:
# Notebook-wide Bokeh setup, executed once before any plot is drawn:
# route Bokeh output to the notebook and give the current document the
# 'dark_minimal' theme so the figures created below share the same look.
output_notebook()
curdoc().theme = 'dark_minimal'

In [None]:
class NotADataFrameError(Exception):
    """
    Error raised when a parameter that should be a DataFrame has another type.

    Attributes:
        wrong_name -- name of the offending parameter
        wrong_type -- type that was actually received
        message -- text passed on to Exception: the caller's message, or a
                   generated default when the message is an empty string
    """

    def __init__(self, wrong_name, wrong_type, message=""):
        self.wrong_name = wrong_name
        self.wrong_type = wrong_type

        # An empty string means "no custom message", so build the default one.
        self.message = (
            f"Parameter {wrong_name} of type {wrong_type} should be a DataFrame"
            if message == ""
            else message
        )

        # Let the built-in Exception store the text, so str(error) shows it.
        super().__init__(self.message)



In [None]:
#1 (create a class & inheritance for exception)  - Visualisation Set Up 
def plot_fn_dataset(df, title=""):
    """
    Draw every column of the dataframe as a line on one shared figure.

    Parameters:
        df - the dataframe holding the functions (pd.dataframe); each line
             reads its horizontal values from the source field "x"
             (presumably the dataframe index named "x" - verify against caller)
        title - the title of the figure (string), optional

    Show:
        the line plot with one randomly coloured line per column
    """

    source = ColumnDataSource(df)  # wrap the dataframe so bokeh can read it
    plot = figure(x_axis_label="x", y_axis_label="y", title=title)

    for column_name in df:
        colour = choice(palette)  # random pick, so two lines may share a colour
        plot.line(x="x", y=column_name, source=source, line_color=colour)

    show(plot)

In [None]:
#2 (Part 1)
def train_fit(train_df, ideal_df):
    """
    Find, for every training function, the ideal function that fits it best.

    The best match is the ideal column that minimises the sum over all rows of
    (y_i_train - y_i_ideal)^2 (Least-Square). Rows are matched by index label.

    Args:
        train_df {pd.DataFrame} -- Contains the training functions, one per column.
        ideal_df {pd.DataFrame} -- Contains the ideal functions, one per column.

    Return:
        {list} -- One (ideal column name, minimum sum of squared deviations) tuple
                  per training column, in the column order of train_df.

    Raises:
        NotADataFrameError -- If train_df or ideal_df is not a pandas DataFrame.
    """

    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses, which behave like DataFrames here.
    for param_name, frame in (("train_df", train_df), ("ideal_df", ideal_df)):
        if not isinstance(frame, pd.DataFrame):
            raise NotADataFrameError(param_name, type(frame))

    chosen_functions = []  # one (name, least-square sum) pair per training column

    for train_col in train_df:
        # Subtract the training column from every ideal column at once
        # (axis=0 aligns on the row index); squaring removes the sign, so
        # ideal - train gives the same result as train - ideal.
        squared_error_sums = ideal_df.sub(train_df[train_col], axis=0).pow(2).sum()

        chosen_functions.append((squared_error_sums.idxmin(), squared_error_sums.min()))

    return chosen_functions

In [None]:
#3 (Part 3)
def plot_chosen_fn(train_df, ideal_df, chosen_fn):
    """
    Draw one small line plot per training function together with its best-fitting ideal function.

    Parameters:
    train_df - the training dataframe (pd.dataframe)
    ideal_df - the ideal dataframe (pd.dataframe)
    chosen_fn - the chosen functions (list); entry n belongs to the n-th training
                column and its first element is the name of the ideal column

    Show:
    (all plots displayed side by side in one row; ideal function in blue, training function in red)
    """

    # Both curves of every plot are drawn against the index values of the ideal dataframe.
    x_values = ideal_df.index.values
    panels = []

    for position, train_name in enumerate(train_df):
        ideal_name = chosen_fn[position][0]  # first element of the pair = ideal column name

        panel = figure(title=f"{train_name} -> {ideal_name}", width=250, height=250)
        panel.line(x=x_values, y=ideal_df[ideal_name], line_color="blue")
        panel.line(x=x_values, y=train_df[train_name], line_color="red")

        panels.append(panel)

    show(row(panels))


In [None]:
#4 (Part 3)
def plot_test_dataset(df, title=""):
    """
    Render the data points of the dataframe as a scatter plot.

    Parameters:
        df - the dataframe (pd.dataframe); the markers read the source fields "x" and "y"
        title - the title (string), optional

    Show:
        (the scatter plot of the data points)
    """

    points = ColumnDataSource(df)
    scatter = figure(x_axis_label="x", y_axis_label="y", title=title)

    # One green dot marker per row of the source.
    scatter.dot(source=points, x="x", y="y", size=20, line_color="green")

    show(scatter)

Final program

In [None]:
# Part 3
# Load the three CSV files into dataframes; the "x" column becomes the index of each one.
ideal_df, test_df, train_df = [
    pd.read_csv(csv_name).set_index("x")
    for csv_name in ("ideal.csv", "test.csv", "train.csv")
]

# Line plots for the two sets of functions.
plot_fn_dataset(df=ideal_df, title="Ideal DataSet")
plot_fn_dataset(df=train_df, title="Training DataSet")

# Scatter plot of the test data: these points get mapped (or not) to the 4 chosen functions later on.
plot_test_dataset(df=test_df, title="Ordered Test")

In [None]:
# Pick, for each training function, the ideal function with the smallest least-square sum.
chosen_fn = train_fit(train_df=train_df, ideal_df=ideal_df)
print(chosen_fn)

# Plot every training function together with the ideal function chosen for it.
plot_chosen_fn(train_df=train_df, ideal_df=ideal_df, chosen_fn=chosen_fn)

[('y43', 33.92135102938815), ('y6', 33.91528844821365), ('y46', 30.677841232448394), ('y18', 32.713144281234285)]


In [None]:
# Map every test point to the chosen ideal function that fits it best.

from math import sqrt


def map_test_points(test_df, ideal_df, chosen_fn):
    """
    Assign each test point to the chosen ideal function with the smallest squared deviation.

    For every test point the squared deviation (y_test - y_ideal)^2 is computed
    against each chosen ideal function at the same x. The function with the
    smallest deviation is kept if that deviation does not exceed the threshold
    (largest least-square sum among the chosen functions, plus sqrt(2)).

    Args:
        test_df {pd.DataFrame} -- Test points, indexed by x, with a "y" column.
        ideal_df {pd.DataFrame} -- Ideal functions, indexed by x, one column per function.
        chosen_fn {list} -- (ideal column name, least-square sum) pairs, as returned by train_fit.

    Return:
        {pd.DataFrame} -- Columns "X", "Y", "Delta-Y" (squared deviation) and "Chosen fn",
                          one row per test point that could be assigned.

    Raises:
        ValueError -- If chosen_fn is empty.
        KeyError -- If a test x value is missing from the index of ideal_df.
    """
    # The threshold is the same for every test point, so compute it once.
    # NOTE(review): a per-point squared deviation is compared with a summed
    # training error plus sqrt(2) - confirm this is the intended criterion.
    threshold = max(chosen_fn, key=lambda pair: pair[1])[1] + sqrt(2)

    mapped_rows = []

    # "test_row" (not "row") so the imported bokeh.layouts.row is not overwritten.
    for x, test_row in test_df.iterrows():  # once per test point
        deviations = [
            (test_row["y"] - ideal_df.at[x, pair[0]]) ** 2
            for pair in chosen_fn
        ]

        # Smallest deviation wins (first one on ties); this is the same choice
        # as maximising "threshold - deviation".
        best = min(range(len(deviations)), key=deviations.__getitem__)

        if deviations[best] <= threshold:
            mapped_rows.append({
                "X": x,
                "Y": test_row["y"],
                "Delta-Y": deviations[best],
                "Chosen fn": chosen_fn[best][0],
            })

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x
    # and copied the whole frame on every call.
    return pd.DataFrame(mapped_rows, columns=["X", "Y", "Delta-Y", "Chosen fn"])


testmap_df = map_test_points(test_df, ideal_df, chosen_fn)

print(testmap_df)

       X           Y   Delta-Y Chosen fn
0   12.4  307.612950  0.008640       y18
1    1.7   -0.733565  0.066616        y6
2   15.2    0.067455  0.306754        y6
3    6.9    2.772236  0.706802       y43
4    3.3    1.587248  0.154705       y43
..   ...         ...       ...       ...
75   8.3   -2.291416  1.929998        y6
76 -15.0    1.254897  0.365552        y6
77 -14.6  426.245850  0.005498       y18
78  12.0    3.789068  0.996850       y46
79  -5.1   -2.142453  1.480209        y6

[80 rows x 4 columns]
