### For Local Files ###

In [27]:
from collections import Counter
import pandas as pd
import chardet


def check_file_validity(file_path):
    """
    Checks if the txt file is valid:
        1. Is not empty.
        2. Has valid delimiter.
        3. Detects the file encoding.

    The file is read once in binary mode. The encoding is detected from those
    bytes and then used to decode the text for delimiter detection, so the
    outcome does not depend on the platform's default text encoding.

    Args:
        1. file_path(str): Path to the file to be validated.

    Returns:
        1. dict:{
                "error": str,
                "file_valid": bool,
                "delimiter": char,
                "file_encoding": str,
            }
           On failure "delimiter" and "file_encoding" are None.

    Raises:
        OSError: If the file cannot be opened or read.
    """

    def failure(message):
        # Every failure shares the documented result shape.
        return {
            "error": message,
            "file_valid": False,
            "delimiter": None,
            "file_encoding": None,
        }

    with open(file_path, "rb") as byte_file:
        file_content = byte_file.read()
    if not file_content:
        return failure("File is empty")

    # Guard against a missing detection result by falling back to UTF-8.
    file_encoding = get_file_encoding(file_content=file_content) or "utf-8"
    try:
        # errors="replace": delimiter detection must not abort on a few
        # undecodable bytes; pandas does the strict decode later.
        text = file_content.decode(file_encoding, errors="replace")
    except LookupError:
        # The detected encoding name is not known to this Python build.
        return failure(f"Unsupported file encoding: {file_encoding}")

    lines = text.splitlines(keepends=True)
    if not lines:
        return failure("File is empty")

    delimiter_obj = check_delimiter(file_lines=lines)
    if not delimiter_obj.get("delimiter_found"):
        return failure(delimiter_obj.get("error"))

    return {
        "error": None,
        "file_valid": True,
        "delimiter": delimiter_obj.get("delimiter"),
        "file_encoding": file_encoding,
    }


def check_delimiter(file_lines, sample_size=20):
    """
    Checks for a consistent delimiter used in the file

    Candidate delimiters are the supported characters that occur in the first
    line. A candidate is preferred when it occurs the same number of times in
    each of the following sampled non-blank lines as in the first line. If no
    candidate is consistent (e.g. ragged rows or quoted fields), the most
    frequent candidate of the first line is used.

    Args:
        1. file_lines(list): List of lines read from a file.
        2. sample_size(int): Number of leading lines (including the first)
           that are inspected. A value of 1 looks at the first line only.

    Returns:
        1. dict:{
                "error": str,
                "delimiter_found": bool,
                "column_numbers": int,
                "delimiter": char,
            }
    """
    valid_delimiters = [",", "#", ";", ":", "\t", " ", "|"]
    not_found = {
        "error": "Cannot determine a consistent delimiter",
        "delimiter_found": False,
        "column_numbers": None,
        "delimiter": None,
    }

    # An empty list has no first line to inspect.
    if not file_lines:
        return not_found

    first_line = file_lines[0]
    delimiter_counts = Counter(ch for ch in first_line if ch in valid_delimiters)
    if not delimiter_counts:
        return not_found

    # Candidates ordered from most to least frequent in the first line.
    ranked = [ch for ch, _ in delimiter_counts.most_common()]

    # Only line terminators are stripped for the blank-line test because
    # spaces and tabs are themselves possible delimiters.
    sample = [line for line in file_lines[1:sample_size] if line.strip("\r\n")]
    consistent = [
        ch
        for ch in ranked
        if all(line.count(ch) == delimiter_counts[ch] for line in sample)
    ]

    delimiter = consistent[0] if consistent else ranked[0]
    column_numbers = len(first_line.split(delimiter))

    return {
        "error": None,
        "delimiter_found": True,
        "column_numbers": column_numbers,
        "delimiter": delimiter,
    }


def get_file_encoding(file_content):
    """
    Returns file encoding standard.

    Args:
        1. file_content(bytes): file content read in binary mode.

    Returns:
        file_encoding(str): Encoding standard used for the file. Falls back
        to "utf-8" when no encoding could be detected.
    """
    # The "encoding" key is present even when detection fails (its value is
    # then None), so a .get() default would never apply; use `or` instead.
    detected_encoding = chardet.detect(file_content).get("encoding")
    return detected_encoding or "utf-8"


def get_dataframe(file_url):
    """
    Validates the file and loads it into a pandas dataframe.

    The delimiter and encoding reported by check_file_validity are passed
    on to pandas.read_csv.

    Args:
        1. file_url(str): Path to the file.

    Returns:
        1. dict:{
                "error": str,
                "file_valid": bool,
                "dataframe": pandas.core.frame.DataFrame,
            }
    """
    validation = check_file_validity(file_path=file_url)

    # Guard clause: report the validation error without attempting to parse.
    if not validation.get("file_valid"):
        failure = {"error": validation.get("error")}
        failure["file_valid"] = False
        failure["dataframe"] = None
        return failure

    read_options = {
        "sep": validation.get("delimiter"),
        "encoding": validation.get("file_encoding"),
    }
    try:
        frame = pd.read_csv(file_url, **read_options)
    except Exception as exc:
        message = f"Failed to read the file: {str(exc)}"
        return {"error": message, "file_valid": False, "dataframe": None}

    return {"error": None, "file_valid": True, "dataframe": frame}

In [28]:
# Parse the local sample file. The value of this cell is the DataFrame, or
# None if validation or parsing reported a failure.
get_dataframe("test.txt").get("dataframe")


<class 'bytes'>
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Column1,Column2,Column3,Column4,Column5
0,Row1_Col1,Row1_Col2,Row1_Col4,Row1_Col5,
1,Row2_Col1,Row2_Col2,Row2_Col3,Row2_Col5,
2,Row3_Col1,Row3_Col2,Row3_Col3,Row3_Col4,Row3_Col5
3,Row4_Col1,Row4_Col2,Row4_Col3,Row4_Col4,Row4_Col5
4,Row5_Col1,Row5_Col2,Row5_Col3,Row5_Col4,Row5_Col5
5,Row6_Col1,Row6_Col2,Row6_Col3,Row6_Col4,Row6_Col5
6,Row7_Col1,Row7_Col2,Row7_Col3,Row7_Col4,Row7_Col5
7,Row8_Col1,Row8_Col2,Row8_Col3,Row8_Col4,Row8_Col5
8,Row9_Col1,Row9_Col2,Row9_Col3,Row9_Col4,Row9_Col5
9,Row10_Col1,Row10_Col2,Row10_Col3,Row10_Col4,Row10_Col5


### For File URL ###


In [29]:
from collections import Counter
import requests
import chardet
import pandas as pd


def check_file_validity(file_url, timeout=30):
    """
    Checks if the remote txt file is valid:
        1. Can be fetched (HTTP status 200).
        2. Is not empty.
        3. Has valid delimiter.
        4. Detects the file encoding.

    Args:
        1. file_url(str): URL of the file to be validated.
        2. timeout(float): Seconds to wait for the server before giving up.

    Returns:
        1. dict:{
                "error": str,
                "file_valid": bool,
                "delimiter": char,
                "file_encoding": str,
            }
           On failure "delimiter" and "file_encoding" are None.
    """

    def failure(message):
        # Every failure shares the documented result shape.
        return {
            "error": message,
            "file_valid": False,
            "delimiter": None,
            "file_encoding": None,
        }

    try:
        # Without a timeout the request can block indefinitely.
        response = requests.get(file_url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP failures.
        return failure(f"Failed to fetch file: {exc}")

    if response.status_code != 200:
        return failure(f"Failed to fetch file. Status code: {response.status_code}")

    lines = response.text.splitlines()
    if not lines:
        return failure("File is empty")

    delimiter_obj = check_delimiter(file_lines=lines)
    if not delimiter_obj.get("delimiter_found"):
        return failure(delimiter_obj.get("error"))

    # The encoding is detected from the raw bytes, not the decoded text.
    file_encoding = get_file_encoding(file_content=response.content)
    return {
        "error": None,
        "file_valid": True,
        "delimiter": delimiter_obj.get("delimiter"),
        "file_encoding": file_encoding,
    }


def check_delimiter(file_lines, sample_size=20):
    """
    Checks for a consistent delimiter used in the file

    Candidate delimiters are the supported characters that occur in the first
    line. A candidate is preferred when it occurs the same number of times in
    each of the following sampled non-blank lines as in the first line. If no
    candidate is consistent (e.g. ragged rows or quoted fields), the most
    frequent candidate of the first line is used.

    Args:
        1. file_lines(list): List of lines read from a file.
        2. sample_size(int): Number of leading lines (including the first)
           that are inspected. A value of 1 looks at the first line only.

    Returns:
        1. dict:{
                "error": str,
                "delimiter_found": bool,
                "column_numbers": int,
                "delimiter": char,
            }
    """
    valid_delimiters = [",", "#", ";", ":", "\t", " ", "|"]
    not_found = {
        "error": "Cannot determine a consistent delimiter",
        "delimiter_found": False,
        "column_numbers": None,
        "delimiter": None,
    }

    # An empty list has no first line to inspect.
    if not file_lines:
        return not_found

    first_line = file_lines[0]
    delimiter_counts = Counter(ch for ch in first_line if ch in valid_delimiters)
    if not delimiter_counts:
        return not_found

    # Candidates ordered from most to least frequent in the first line.
    ranked = [ch for ch, _ in delimiter_counts.most_common()]

    # Only line terminators are stripped for the blank-line test because
    # spaces and tabs are themselves possible delimiters.
    sample = [line for line in file_lines[1:sample_size] if line.strip("\r\n")]
    consistent = [
        ch
        for ch in ranked
        if all(line.count(ch) == delimiter_counts[ch] for line in sample)
    ]

    delimiter = consistent[0] if consistent else ranked[0]
    column_numbers = len(first_line.split(delimiter))

    return {
        "error": None,
        "delimiter_found": True,
        "column_numbers": column_numbers,
        "delimiter": delimiter,
    }


def get_file_encoding(file_content):
    """
    Returns file encoding standard.

    Args:
        1. file_content(bytes): file content read in binary mode.

    Returns:
        file_encoding(str): Encoding standard used for the file. Falls back
        to "utf-8" when no encoding could be detected.
    """

    # The "encoding" key is present even when detection fails (its value is
    # then None), so a .get() default would never apply; use `or` instead.
    detected_encoding = chardet.detect(file_content).get("encoding")
    return detected_encoding or "utf-8"


def get_dataframe(file_url):
    """
    Gets the pandas dataframe from the validated file

    Args:
        1. file_url(str): URL of the file.

    Returns:
        1. dict:{
                "error": str,
                "file_valid": bool,
                "dataframe": pandas.core.frame.DataFrame,
            }
           "dataframe" is None whenever "file_valid" is False.
    """

    validity_obj = check_file_validity(file_url=file_url)
    if not validity_obj.get("file_valid"):
        return {
            "error": validity_obj.get("error"),
            "file_valid": False,
            "dataframe": None,
        }
    try:
        df = pd.read_csv(
            file_url,
            sep=validity_obj.get("delimiter"),
            encoding=validity_obj.get("file_encoding"),
        )
    except Exception as e:
        # Keep the documented result shape: include "dataframe" on this
        # failure path too, so result["dataframe"] never raises KeyError.
        return {
            "error": f"Failed to read the file: {str(e)}",
            "file_valid": False,
            "dataframe": None,
        }

    return {"error": None, "file_valid": True, "dataframe": df}

In [30]:
# Fetch the sample addresses CSV from GitHub and print the parsed DataFrame
# (prints None if validation or parsing reported a failure).
result_obj = get_dataframe(file_url="https://raw.githubusercontent.com/codeforamerica/ohana-api/refs/heads/master/data/sample-csv/addresses.csv")
print(result_obj.get("dataframe"))

    id  location_id                   address_1  address_2  \
0    1            1       2600 Middlefield Road        NaN   
1    2            2            24 Second Avenue        NaN   
2    3            3            24 Second Avenue        NaN   
3    4            4            24 Second Avenue        NaN   
4    5            5            24 Second Avenue        NaN   
5    6            6           800 Middle Avenue        NaN   
6    7            7              500 Arbor Road        NaN   
7    8            8           800 Middle Avenue        NaN   
8    9            9       2510 Middlefield Road        NaN   
9   10           10       1044 Middlefield Road        NaN   
10  11           11         2140 Euclid Avenue.        NaN   
11  12           12       1044 Middlefield Road  2nd Floor   
12  13           13         399 Marine Parkway.        NaN   
13  14           14          660 Veterans Blvd.        NaN   
14  15           15        1500 Valencia Street        NaN   
15  16  