### For Local Files ###

In [100]:
from collections import Counter
import pandas as pd
import chardet


def check_file_validity(file_path):
    """Validate a local delimited text file and report how to parse it.

    Args:
        file_path: Path of the local file to inspect.

    Returns:
        A dict with ``"error"`` and ``"file_valid"``. When the file is valid
        it additionally carries ``"file_encoding"`` (the detected encoding)
        and ``"delimiter"`` (the detected column separator).
    """
    # Detect the encoding first so the text read below does not depend on the
    # platform default encoding.
    used_encoding = get_file_encoding(file_url=file_path)

    # The decoded lines are only used for delimiter detection, so undecodable
    # bytes are replaced rather than allowed to abort validation.
    with open(file_path, "r", encoding=used_encoding, errors="replace") as file:
        lines = file.readlines()
    if not lines:
        return {"error": "File is empty", "file_valid": False}

    delimiter_obj = check_delimiter(file_lines=lines)
    if not delimiter_obj.get("delimiter_found"):
        return {"error": delimiter_obj.get("error"), "file_valid": False}

    # NOTE: a per-row column-count consistency check used to run here but is
    # currently disabled.

    return {
        "error": None,
        "file_valid": True,
        "file_encoding": used_encoding,
        # get_dataframe() reads this key to choose the separator.
        "delimiter": delimiter_obj.get("delimiter"),
    }


def check_delimiter(file_lines):
    """Guess the column delimiter shared by every non-empty line.

    The most frequent candidate character on the first non-empty line is
    chosen, and every other non-empty line must contain it at least once.
    Empty lines (for example a trailing newline at the end of the file) are
    ignored instead of invalidating the whole file.

    Args:
        file_lines: Lines of the file, with or without line terminators.

    Returns:
        A dict with the keys ``"error"``, ``"delimiter_found"``,
        ``"column_numbers"`` (field count of the first non-empty line) and
        ``"delimiter"``. The last two are ``None`` when detection fails.
    """
    valid_delimiters = [",", ";", ":", "\t", " ", "|"]
    failure = {
        "error": "Cannot determine a consistent delimiter",
        "delimiter_found": False,
        "column_numbers": None,
        "delimiter": None,
    }

    # Only lines that still hold characters once the terminator is removed
    # carry delimiter information.
    data_lines = [line for line in file_lines if line.strip("\r\n")]
    if not data_lines:
        return failure

    first_line = data_lines[0]
    delimiter_counts = Counter(ch for ch in first_line if ch in valid_delimiters)
    if not delimiter_counts:
        return failure

    delimiter, _ = delimiter_counts.most_common(1)[0]
    column_numbers = len(first_line.split(delimiter))

    # Every remaining data line has to use the same delimiter at least once.
    if any(delimiter not in line for line in data_lines):
        return failure

    return {
        "error": None,
        "delimiter_found": True,
        "column_numbers": column_numbers,
        "delimiter": delimiter,
    }


# def check_column_consistency(file_lines, delimiter, num_columns):
#     for idx, line in enumerate(file_lines):
#         columns = line.split(delimiter)
#         if len(columns) != num_columns:
#             return {
#                 "error": f"Inconsistent number of columns on line {idx + 1}",
#                 "column_consistent": False,
#             }

#     return {"error": None, "column_consistent": True}


def get_file_encoding(file_url):
    """Detect the text encoding of a local file with chardet.

    Args:
        file_url: Path of the local file whose bytes are inspected.

    Returns:
        The encoding name reported by chardet, or ``"utf-8"`` when chardet
        cannot determine one.
    """
    with open(file_url, "rb") as file:
        raw_bytes = file.read()
    # chardet always includes the "encoding" key and sets it to None when
    # detection fails, so a .get() default would never be used; fall back
    # explicitly instead.
    current_encoding = chardet.detect(raw_bytes).get("encoding") or "utf-8"
    print(f"used encoding is : {current_encoding}")
    return current_encoding


def get_dataframe(file_url):
    """Validate a local delimited file and load it into a pandas DataFrame.

    Args:
        file_url: Path of the local file to load.

    Returns:
        A dict with the keys ``"error"``, ``"file_valid"`` and
        ``"dataframe"``. ``"dataframe"`` is ``None`` whenever validation or
        parsing fails, and ``"error"`` then holds the reason.
    """
    validity_obj = check_file_validity(file_path=file_url)
    if not validity_obj.get("file_valid"):
        return {
            "error": validity_obj.get("error"),
            "file_valid": False,
            "dataframe": None,
        }
    try:
        df = pd.read_csv(
            file_url,
            # When no delimiter is reported this is None, which lets the
            # python engine detect the separator itself.
            sep=validity_obj.get("delimiter"),
            encoding=validity_obj.get("file_encoding"),
            engine="python",
        )
    except Exception as e:
        # Keep the same result shape as the other return paths so callers can
        # always read the "dataframe" key.
        return {
            "error": f"Failed to read the file: {str(e)}",
            "file_valid": False,
            "dataframe": None,
        }

    return {"error": None, "file_valid": True, "dataframe": df}

In [103]:
# analyze_file("test.txt")
# Run the local-file loader on test.txt; the returned result dict is shown as
# the cell output.
get_dataframe("test.txt")


{'error': 'Cannot determine a consistent delimiter',
 'file_valid': False,
 'dataframe': None}

### For File URL ###


In [95]:
from collections import Counter
import requests
import pandas as pd


def check_file_validity(file_url):
    """Download a delimited text file and report how to parse it.

    Args:
        file_url: HTTP(S) URL of the file to inspect.

    Returns:
        A dict with ``"error"`` and ``"file_valid"``. When the file is valid
        it additionally carries ``"delimiter"`` and ``"file_encoding"``.
    """
    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(file_url, timeout=30)
    except requests.RequestException as exc:
        # Report network failures through the same error-dict convention as
        # every other failure of this function.
        return {"error": f"Failed to fetch file: {exc}", "file_valid": False}

    if response.status_code != 200:
        return {
            "error": f"Failed to fetch file. Status code: {response.status_code}",
            "file_valid": False,
        }

    lines = response.text.splitlines()
    if not lines:
        return {"error": "File is empty", "file_valid": False}

    delimiter_obj = check_delimiter(file_lines=lines)
    if not delimiter_obj.get("delimiter_found"):
        return {"error": delimiter_obj.get("error"), "file_valid": False}

    # The encoding is detected from the raw bytes, not the decoded text.
    file_encoding = get_file_encoding(file_content=response.content)
    return {
        "error": None,
        "file_valid": True,
        "delimiter": delimiter_obj.get("delimiter"),
        "file_encoding": file_encoding,
    }


def check_delimiter(file_lines):
    """Guess the column delimiter shared by every non-empty line.

    The most frequent candidate character on the first non-empty line is
    chosen, and every other non-empty line must contain it at least once.
    Empty lines (for example blank separators or a trailing empty line) are
    ignored instead of invalidating the whole file.

    Args:
        file_lines: Lines of the file, with or without line terminators.

    Returns:
        A dict with the keys ``"error"``, ``"delimiter_found"``,
        ``"column_numbers"`` (field count of the first non-empty line) and
        ``"delimiter"``. The last two are ``None`` when detection fails.
    """
    valid_delimiters = [",", "#", ";", ":", "\t", " ", "|"]
    failure = {
        "error": "Cannot determine a consistent delimiter",
        "delimiter_found": False,
        "column_numbers": None,
        "delimiter": None,
    }

    # Only lines that still hold characters once the terminator is removed
    # carry delimiter information.
    data_lines = [line for line in file_lines if line.strip("\r\n")]
    if not data_lines:
        return failure

    first_line = data_lines[0]
    delimiter_counts = Counter(ch for ch in first_line if ch in valid_delimiters)
    if not delimiter_counts:
        return failure

    delimiter, _ = delimiter_counts.most_common(1)[0]
    column_numbers = len(first_line.split(delimiter))

    # Every remaining data line has to use the same delimiter at least once.
    if any(delimiter not in line for line in data_lines):
        return failure

    return {
        "error": None,
        "delimiter_found": True,
        "column_numbers": column_numbers,
        "delimiter": delimiter,
    }


def get_file_encoding(file_content):
    """Detect the text encoding of raw file bytes with chardet.

    Args:
        file_content: The file's raw bytes.

    Returns:
        The encoding name reported by chardet, or ``"utf-8"`` when chardet
        cannot determine one.
    """
    # chardet always includes the "encoding" key and sets it to None when
    # detection fails, so a .get() default would never be used; fall back
    # explicitly instead.
    return chardet.detect(file_content).get("encoding") or "utf-8"


def get_dataframe(file_url):
    """Validate a remote delimited file and load it into a pandas DataFrame.

    Args:
        file_url: HTTP(S) URL of the file to load.

    Returns:
        A dict with the keys ``"error"``, ``"file_valid"`` and
        ``"dataframe"``. ``"dataframe"`` is ``None`` whenever validation or
        parsing fails, and ``"error"`` then holds the reason.
    """
    validity_obj = check_file_validity(file_url=file_url)
    if not validity_obj.get("file_valid"):
        return {
            "error": validity_obj.get("error"),
            "file_valid": False,
            "dataframe": None,
        }
    try:
        df = pd.read_csv(
            file_url,
            sep=validity_obj.get("delimiter"),
            # Use the encoding found during validation rather than discarding
            # it and relying on the pandas default.
            encoding=validity_obj.get("file_encoding"),
        )
    except Exception as e:
        # Keep the same result shape as the other return paths so callers can
        # always read the "dataframe" key.
        return {
            "error": f"Failed to read the file: {str(e)}",
            "file_valid": False,
            "dataframe": None,
        }

    return {"error": None, "file_valid": True, "dataframe": df}

In [97]:
# Fetch the sample addresses CSV over HTTP and print the parsed DataFrame
# (None is printed when loading failed).
result_obj = get_dataframe(file_url="https://raw.githubusercontent.com/codeforamerica/ohana-api/refs/heads/master/data/sample-csv/addresses.csv")
print(result_obj.get("dataframe"))

used encoding is : ascii
    id  location_id                   address_1  address_2  \
0    1            1       2600 Middlefield Road        NaN   
1    2            2            24 Second Avenue        NaN   
2    3            3            24 Second Avenue        NaN   
3    4            4            24 Second Avenue        NaN   
4    5            5            24 Second Avenue        NaN   
5    6            6           800 Middle Avenue        NaN   
6    7            7              500 Arbor Road        NaN   
7    8            8           800 Middle Avenue        NaN   
8    9            9       2510 Middlefield Road        NaN   
9   10           10       1044 Middlefield Road        NaN   
10  11           11         2140 Euclid Avenue.        NaN   
11  12           12       1044 Middlefield Road  2nd Floor   
12  13           13         399 Marine Parkway.        NaN   
13  14           14          660 Veterans Blvd.        NaN   
14  15           15        1500 Valencia Stre