In [1]:
import os
import polars as pl

# Import Dataset

In [2]:
# Name of the folder that holds the listings CSV files.
local_path = 'dataset'
# Name of the folder used for generated output.
auto = 'auto'

In [3]:
# Working directory of the notebook.
path = os.getcwd()
print("Current Directory", path, end="\n\n")

# The project root is assumed to sit one level above the notebook folder.
parent = os.path.dirname(path)
print("Parent directory", parent, end="\n\n")

# One listings CSV per city, all located in <parent>/<local_path>.
(
    austin_path,
    bangkok_path,
    buenos_aires_path,
    cape_town_path,
    istanbul_path,
    melbourne_path,
) = (
    os.path.join(parent, local_path, csv_name)
    for csv_name in (
        'listings_austin.csv',
        'listings_bangkok.csv',
        'listings_buenos_aires.csv',
        'listings_cape_town.csv',
        'listings_istanbul.csv',
        'listings_melbourne.csv',
    )
)

dataset_path = [
    austin_path,
    bangkok_path,
    buenos_aires_path,
    cape_town_path,
    istanbul_path,
    melbourne_path,
]
print("Dataset path", dataset_path, end="\n\n")

# Folder that receives the files written by this notebook.
path_auto = os.path.join(parent, auto)
print("Dump path", path_auto)

Current Directory /Users/baptistebeaurain/Google Drive/learning/Python/polars_explo/project/airbnb/notebook

Parent directory /Users/baptistebeaurain/Google Drive/learning/Python/polars_explo/project/airbnb

Dataset path ['/Users/baptistebeaurain/Google Drive/learning/Python/polars_explo/project/airbnb/dataset/listings_austin.csv', '/Users/baptistebeaurain/Google Drive/learning/Python/polars_explo/project/airbnb/dataset/listings_bangkok.csv', '/Users/baptistebeaurain/Google Drive/learning/Python/polars_explo/project/airbnb/dataset/listings_buenos_aires.csv', '/Users/baptistebeaurain/Google Drive/learning/Python/polars_explo/project/airbnb/dataset/listings_cape_town.csv', '/Users/baptistebeaurain/Google Drive/learning/Python/polars_explo/project/airbnb/dataset/listings_istanbul.csv', '/Users/baptistebeaurain/Google Drive/learning/Python/polars_explo/project/airbnb/dataset/listings_melbourne.csv']

Dump path /Users/baptistebeaurain/Google Drive/learning/Python/polars_explo/project/airbnb

In [4]:
# Read every city file into its own dataframe; the order of the targets
# matches the order of the path variables.
(
    df_austin,
    df_bangkok,
    df_buenos_aires,
    df_cape_town,
    df_istanbul,
    df_melbourne,
) = (
    pl.read_csv(csv_path)
    for csv_path in (
        austin_path,
        bangkok_path,
        buenos_aires_path,
        cape_town_path,
        istanbul_path,
        melbourne_path,
    )
)

# Keep the frames together so the helper functions can work on all of them.
datasets = [
    df_austin,
    df_bangkok,
    df_buenos_aires,
    df_cape_town,
    df_istanbul,
    df_melbourne,
]


## Function to return the name and shape of each dataframe
def get_dataframes_and_shapes(dataframes):
    """Describe each dataframe by its variable name and its shape.

    The name is recovered by looking for a module-level variable bound to the
    very same object (identity, not equality).

    Args:
        dataframes: Iterable of objects exposing a ``shape`` attribute.

    Returns:
        dict: ``{name: {'dataframe': name, 'shape': shape}}`` in input order.
        A frame that is not bound to any module-level name is reported as
        ``dataframe_<position>`` instead of raising ``IndexError``.
    """
    # Snapshot the namespace once instead of re-scanning it for every frame.
    namespace = list(globals().items())
    result = {}

    for position, df in enumerate(dataframes):
        bound_names = [name for name, value in namespace if value is df]
        # Underscore-prefixed names (e.g. the interactive output cache `_`,
        # `_4`) may alias a frame; prefer the name the user actually chose.
        public_names = [name for name in bound_names if not name.startswith('_')]

        if public_names:
            df_name = public_names[0]
        elif bound_names:
            df_name = bound_names[0]
        else:
            df_name = f'dataframe_{position}'

        result[df_name] = {'dataframe': df_name, 'shape': df.shape}

    return result

##################

# Print the shape of every loaded dataframe, keyed by its variable name.
print(get_dataframes_and_shapes(datasets))


{'df_austin': {'dataframe': 'df_austin', 'shape': (11269, 18)}, 'df_bangkok': {'dataframe': 'df_bangkok', 'shape': (17431, 18)}, 'df_buenos_aires': {'dataframe': 'df_buenos_aires', 'shape': (17671, 18)}, 'df_cape_town': {'dataframe': 'df_cape_town', 'shape': (16891, 18)}, 'df_istanbul': {'dataframe': 'df_istanbul', 'shape': (22539, 18)}, 'df_melbourne': {'dataframe': 'df_melbourne', 'shape': (18016, 18)}}


## Make sure all datasets have the same columns

In [5]:
## Function to compare the columns of several dataframes
def compare_columns(dataframes):
    """Split the column names of several dataframes into shared and non-shared.

    Args:
        dataframes: Sequence of objects exposing a ``columns`` attribute.

    Returns:
        tuple[list, list]: ``(common_columns, different_columns)``.
        ``common_columns`` holds the names present in every dataframe.
        ``different_columns`` holds the names present in at least one
        dataframe but not in all of them, including names that only the first
        dataframe has. Both lists are sorted so the output is reproducible.
        An empty input returns ``([], [])``.
    """
    if not dataframes:
        # If the list is empty, return empty lists
        return [], []

    column_sets = [set(df.columns) for df in dataframes]

    common_columns = set.intersection(*column_sets)
    # Anything seen somewhere but not everywhere is a mismatch, whichever
    # dataframe it comes from.
    different_columns = set.union(*column_sets) - common_columns

    return sorted(common_columns), sorted(different_columns)

##################

# Print the (common, different) column lists computed over the loaded datasets.
print(compare_columns(datasets))

(['latitude', 'price', 'longitude', 'calculated_host_listings_count', 'name', 'number_of_reviews_ltm', 'number_of_reviews', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'minimum_nights', 'availability_365', 'license', 'id', 'last_review', 'room_type', 'reviews_per_month'], [])


## Make sure each column has the same type in every dataset

In [6]:
### Function checking that each column has the same type in all dataframes of a list
def check_consistent_column_types(dataframes):
    """List the columns whose dtype is not the same across the dataframes.

    The first dataframe is the reference: each of its columns is compared
    with the same-named column of every other dataframe. Dataframes that do
    not have the column are ignored for that column.

    Args:
        dataframes: Sequence of dataframes; the first one is the reference.

    Returns:
        list: Names of the reference columns for which at least one other
        dataframe has a different dtype, in reference column order. Empty
        when ``dataframes`` is empty.
    """
    if not dataframes:
        return []

    reference = dataframes[0]
    others = dataframes[1:]

    def _has_mismatch(column_name):
        # True as soon as one other dataframe disagrees with the reference dtype.
        expected_type = reference[column_name].dtype
        return any(
            column_name in other.columns and other[column_name].dtype != expected_type
            for other in others
        )

    return [name for name in reference.columns if _has_mismatch(name)]

##################



In [7]:
# Display the columns whose dtype differs from the first dataset's in at least one other dataset.
check_consistent_column_types(datasets)

['neighbourhood']

## Convert columns with mismatched types to String

In [8]:
def convert_columns_to_string(dataframes, column_names):
    """Cast the given columns to string in every dataframe.

    Exactly one dataframe is returned per input dataframe, in the same order.
    Previously a frame was appended once per converted column, so an empty
    ``column_names`` returned nothing, several names duplicated each frame,
    and frames lacking the column were dropped.

    Args:
        dataframes: Iterable of polars dataframes.
        column_names: Names of the columns to cast to ``pl.Utf8``. Names that
            a dataframe does not have are skipped for that dataframe.

    Returns:
        list: New dataframes with the requested columns cast to string
        (non-strict cast). Converted columns are placed after the untouched
        ones, in ``column_names`` order. A dataframe that contains none of
        the requested columns is returned as is.
    """
    # Drop repeated names while keeping the caller's order.
    requested = list(dict.fromkeys(column_names))
    output = []

    for dataset in dataframes:
        present = [name for name in requested if name in dataset.columns]

        if present:
            dataset = dataset.with_columns(
                [pl.col(name).cast(pl.Utf8, strict=False) for name in present]
            )
            # The earlier drop-and-rename implementation left converted
            # columns at the end; keep that layout for existing consumers.
            untouched = [name for name in dataset.columns if name not in present]
            dataset = dataset.select(untouched + present)

        # Append once per dataframe, whether or not anything was converted.
        output.append(dataset)

    return output

In [9]:

# Cast the columns flagged as type-inconsistent to string in every dataset.
datasets_str = convert_columns_to_string(
    datasets,
    check_consistent_column_types(datasets),
)

## Concatenate datasets

In [10]:
def concatenate_datasets(dataset_list):
    """Stack a list of dataframes vertically into a single dataframe.

    Args:
        dataset_list: List of polars dataframes, assumed to share one schema.

    Returns:
        ``None`` when the list is empty, the only dataframe when the list has
        a single element, otherwise the concatenation of all dataframes in
        list order.
    """
    if not dataset_list:
        return None

    if len(dataset_list) == 1:
        return dataset_list[0]

    # One concat over the whole list avoids re-copying the accumulated frame
    # on every iteration, as the pairwise loop did.
    return pl.concat(dataset_list)



In [13]:
# Stack the string-normalised city datasets into a single dataframe.
df = concatenate_datasets(datasets_str)

# Create the dump folder if it is missing so the write below cannot fail on
# a missing directory.
os.makedirs(path_auto, exist_ok=True)
df.write_csv(os.path.join(path_auto, 'concat.csv'))

df.shape

(103817, 18)