# UFCFVQ-15-M Programming for Data Science (Autumn 2023)


## Student Id:

# Programming Task 1

### Requirement FR1 - Develop a function to read a single column from a CSV file

In [1]:
def read_column(file_name, column_number):
    """
    Reads a single specified column of data from a CSV file.

    The first non-blank line is treated as the header row. Blank lines
    (for example a trailing empty line at the end of the file) are skipped.
    The file is processed line by line, so only the selected column is kept
    in memory.

    PARAMETERS
    - file_name (String): The name of the CSV file.
    - column_number (Integer): The number of the column to read (0-based).

    RETURNS:
    - column_name (String): the column name
    - selected_column_values (List): a List containing all the specified column's data values,
      each converted to an Integer.

    RAISES
    - ValueError: if column_number is outside the range of the header row, if the file
      contains no data at all, or if a data value cannot be converted to an Integer.
    """

    # Stays None until the header row has been seen.
    column_name = None
    # The values of the selected column
    selected_column_values = []

    with open(file_name, "r", encoding='utf-8-sig') as file:

        # Iterate through each line of the CSV file
        for line in file:
            # Drop the surrounding whitespace / newline characters.
            stripped_line = line.strip()

            # A blank line carries no values; converting it would crash.
            if not stripped_line:
                continue

            # Split the row into a list of values.
            fields = stripped_line.split(",")

            if column_name is None:
                # Header row: validate the requested column before reading any data.
                # Total number of columns 0-based
                total_columns = len(fields) - 1
                if column_number < 0 or column_number > total_columns:
                    raise ValueError("Please enter a column number between 0 and " + str(total_columns))
                column_name = fields[column_number]
            else:
                selected_column_values.append(int(fields[column_number]))

    if column_name is None:
        # No header row was found, so there is nothing to return.
        raise ValueError("The file contains no data: " + str(file_name))

    return column_name, selected_column_values

### Requirement FR2 - Develop a function to read CSV data from a file into memory

In [2]:
def read_to_dictionary(file_name):
    """
    Loads every column of a CSV file into a Dictionary keyed by column name.
    Each column is fetched with the read_column() function.

    PARAMETERS
    - file_name (String): The name of the CSV file.

    RETURNS
    - dictionary_output (Dictionary): one entry per column of the CSV file, mapping the
      column name to the List of values returned by read_column().
    """
    # Only the header line is needed here, to find out how many columns exist.
    with open(file_name, "r", encoding='utf-8-sig') as csv_file:
        header_fields = csv_file.readline().strip().split(",")

    # read_column() returns (column_name, values) pairs, which dict() consumes directly.
    dictionary_output = dict(
        read_column(file_name, column_index)
        for column_index in range(len(header_fields))
    )

    return dictionary_output

### Requirement FR3 - Develop a function to calculate the Kendall Tau Correlation Coefficient for two lists of data

In [3]:
def calculate_kendall_tau(list1, list2):
    """
    Computes the Kendall Tau Rank Correlation Coefficient for two lists of data.

    Every pair of positions is classified as concordant, discordant, tied only in
    list1, or tied only in list2. Pairs tied in both lists are ignored.

    PARAMETERS
    - list1 (List): A list.
    - list2 (List): A list.

    * Two lists should be of equal length.

    RETURNS
    - The calculated coefficient value, or 0 when the denominator is zero.

    RAISES
    - ValueError: if the two lists are not of equal length.
    """

    size = len(list1)

    # Guard clause: the coefficient is only defined for equally long lists.
    if size != len(list2):
        raise ValueError("The two lists should be of equal length")

    concordant = 0
    discordant = 0
    tied_only_first = 0
    tied_only_second = 0

    # Visit every unordered pair of positions (i, j) with i < j.
    for i in range(size):
        for j in range(i + 1, size):
            if (list1[i] < list1[j] and list2[i] < list2[j]) or (list1[i] > list1[j] and list2[i] > list2[j]):
                concordant += 1
            elif (list1[i] < list1[j] and list2[i] > list2[j]) or (list1[i] > list1[j] and list2[i] < list2[j]):
                discordant += 1
            elif list1[i] == list1[j] and list2[i] != list2[j]:
                tied_only_first += 1
            elif list2[i] == list2[j] and list1[i] != list1[j]:
                tied_only_second += 1
            # A pair tied in both lists contributes to nothing.

    untied_pairs = concordant + discordant
    denominator = (((untied_pairs + tied_only_first) * (untied_pairs + tied_only_second)) ** 0.5)

    # Avoid dividing by zero when no pair can be ranked.
    if denominator == 0:
        return 0

    return (concordant - discordant) / denominator

### Requirement FR4 - Develop a function to generate a set of Kendall Tau Correlation Coefficients for a data structure like the one generated in FR2

In [10]:
def generate_kendall_tau(a_dict=None):
    """
    Generates a set of Kendall Rank Correlation Coefficients for a dictionary of columns.
    Uses the calculate_kendall_tau() function.

    PARAMETERS
    - a_dict (Dictionary): maps each column name to its List of values, like the
      dictionary returned by read_to_dictionary(). It must be supplied; the default of
      None only exists so that a missing argument produces a clear error.

    RETURNS
    - A list of tuples, each tuple containing the two column names and associated
      correlation coefficient value. Each unordered pair of distinct columns appears once,
      in the order the columns occur in the dictionary.

    RAISES
    - TypeError: if no dictionary is supplied.
    """

    if a_dict is None:
        raise TypeError(
            "generate_kendall_tau() requires a dictionary of column data, "
            "such as the output of read_to_dictionary()"
        )

    # The keys of the dictionary, in insertion order.
    keys = list(a_dict)

    # The results of the Kendall Tau Correlation Coefficients.
    results = []

    # Pair every column with each column that follows it, so a column is never
    # paired with itself and no pair is produced twice.
    for position, first_key in enumerate(keys):
        for second_key in keys[position + 1:]:
            calculated_kendall_tau = calculate_kendall_tau(a_dict[first_key], a_dict[second_key])

            # Add the column names and the Kendall Tau Correlation Coefficient to the results list as tuples.
            results.append((first_key, second_key, calculated_kendall_tau))

    return results


### Requirement FR5 - Develop a function to print a custom table for selected data from a data structure like the one generated in FR4

In [20]:
def _format_coefficient_cell(coefficients, row_name, column_name):
    """Returns the text shown where row_name meets column_name in the table."""
    # The diagonal (a column compared with itself) is shown as "-".
    if row_name == column_name:
        return "-"

    # A pair is stored once, so look it up in both orders.
    coefficient = coefficients.get((row_name, column_name), coefficients.get((column_name, row_name)))

    # A pair that is not in the data is also shown as "-" instead of being formatted as a number.
    if coefficient is None:
        return "-"

    return f"{coefficient:.4f}"


def print_custom_table(list_of_tuples, border_char = "*", columns_to_include = None):

    """
    Prints a dynamic table for Kendall Tau Rank Correlation Coefficients.

    PARAMETERS
    - list_of_tuples (List): List of tuples containing column names and correlation coefficients.
    - border_char (String): Border character to use in the table. Default is '*'.
    - columns_to_include (List): List of columns to include in the table. Default includes all columns.

    A cell is printed as "-" on the diagonal and for any pair of columns that has no
    coefficient in list_of_tuples. Coefficients are printed with 4 decimal places.

    RETURNS
    - None
    """

    # If the columns_to_include not provided, collect the unique column names
    # in the order in which they first appear in list_of_tuples.
    if columns_to_include is None:
        column_names = []
        for first_column, second_column, _ in list_of_tuples:
            for each_column in (first_column, second_column):
                if each_column not in column_names:
                    column_names.append(each_column)
    else:
        column_names = columns_to_include

    # Dictionary to store correlation coefficients
    # Key (first_column, second_column) : value (coefficient)
    coefficients = {
        (first_column, second_column): coefficient
        for first_column, second_column, coefficient in list_of_tuples
    }

    # Text of every cell, one inner list per table row.
    table_cells = [
        [_format_coefficient_cell(coefficients, row_name, column_name) for column_name in column_names]
        for row_name in column_names
    ]

    # The cell width must fit the longest column name and the longest value,
    # otherwise a value such as "-0.2500" pushes the borders out of line.
    max_column_length = 0
    for col in column_names:
        max_column_length = max(max_column_length, len(col))
    for cells in table_cells:
        for value in cells:
            max_column_length = max(max_column_length, len(value))

    # Create the header row with centered column names and a border_char between each column padded with empty spaces.
    header_cells = "".join(
        f"{col.center(max_column_length + 1).ljust((max_column_length + 1) + len(border_char))}{border_char}"
        for col in column_names
    )
    header = f"{border_char}{header_cells}{border_char}"

    header_length = len(header)
    # Length of the space to add at the beginning of the first line and header,
    # leaving room for the column of row names below.
    space_length = max_column_length + 2
    first_line = border_char * header_length
    print(" " * space_length + first_line)
    print(" " * space_length + header)
    print(border_char * space_length + first_line)

    for row_name, cells in zip(column_names, table_cells):
        # Row with the first column name centered and padded
        row = f"{border_char} {row_name.center(max_column_length)} {border_char}"

        # Add each value centered within max_column_length
        for value in cells:
            row += f" {value.center(max_column_length)}{border_char} "

        print(row)

        # Print a separator line under every row; the last one closes the table.
        print(border_char * len(row))