In [14]:
import os
import pandas as pd

# Path to the folder with the CSV files
csv_folder = '../tubitakaiagentprojeleriiinverisetleri'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(csv_folder) if f.endswith('.csv')]

# Dictionary to store the DataFrames
dataframes = {}

# Import each CSV file as a DataFrame and merge duplicated columns
for file in csv_files:
    file_path = os.path.join(csv_folder, file)
    df_name = file.replace('.csv', '')
    df = pd.read_csv(file_path)
    # join the columns with the same base name
    base_cols = set([c.split('.')[0] for c in df.columns])
    for base in base_cols:
        same_cols = [c for c in df.columns if c.split('.')[0] == base]
        if len(same_cols) > 1:
            df[base] = df[same_cols].bfill(axis=1).iloc[:, 0]
            df = df.drop(columns=[c for c in same_cols if c != base])
    dataframes[df_name] = df

# Show the names of the imported DataFrames
list(dataframes.keys())

['AddressAdded',
 'BeforePurchaseDetailsScreen',
 'CatAdded',
 'CheckoutPageOpened',
 'CreditcardAdded',
 'DogAdded',
 'Purchase',
 'SignupCompleted']

In [None]:
# General analysis of all imported tables and their possible relations
from collections import Counter
import numpy as np


# Find possible relations (foreign keys) between tables
print('\nPossible relations between tables:')
all_columns = {name: set(df.columns) for name, df in dataframes.items()}
for name1, cols1 in all_columns.items():
    for name2, cols2 in all_columns.items():
        if name1 != name2:
            common = cols1 & cols2
            if common:
                print(f'- {name1} <-> {name2}: common columns: {list(common)}')


Possible relations between tables:
- AddressAdded <-> CatAdded: common columns: ['ownerid']
- AddressAdded <-> CreditcardAdded: common columns: ['ownerid']
- AddressAdded <-> DogAdded: common columns: ['ownerid']
- AddressAdded <-> Purchase: common columns: ['ownerid']
- BeforePurchaseDetailsScreen <-> CheckoutPageOpened: common columns: ['user_id', 'uuid', 'serviceType', 'event_time']
- CatAdded <-> AddressAdded: common columns: ['ownerid']
- CatAdded <-> CreditcardAdded: common columns: ['ownerid']
- CatAdded <-> DogAdded: common columns: ['gender', 'birthday', 'ownerid', 'breed', 'weight']
- CatAdded <-> Purchase: common columns: ['ownerid']
- CheckoutPageOpened <-> BeforePurchaseDetailsScreen: common columns: ['user_id', 'uuid', 'serviceType', 'event_time']
- CreditcardAdded <-> AddressAdded: common columns: ['ownerid']
- CreditcardAdded <-> CatAdded: common columns: ['ownerid']
- CreditcardAdded <-> DogAdded: common columns: ['ownerid']
- CreditcardAdded <-> Purchase: common colu

In [None]:
# Join all tables on 'owner_id' = 'user_id' and 'user_id' = 'user_id'
from functools import reduce

# Select only DataFrames that have 'user_id' or 'owner_id'
dfs_with_user = []
for name, df in dataframes.items():
    if 'user_id' in df.columns or 'ownerid' in df.columns or 'id' in df.columns:
        # Rename 'owner_id' and 'id' to 'user_id' for consistency
        temp = df.copy()
        if 'ownerid' in temp.columns:
            temp = temp.rename(columns={'ownerid': 'user_id'})
        if 'id' in temp.columns:
            temp = temp.rename(columns={'id': 'user_id'})
        dfs_with_user.append(temp)

# Join all DataFrames with 'user_id' using outer join
if dfs_with_user:
    joined_df = reduce(lambda left, right: pd.merge(left, right, on='user_id', how='outer', suffixes=('', '_dup')), dfs_with_user)
    print('Joined DataFrame shape:', joined_df.shape)
    display(joined_df.head())
else:
    print('No tables with user_id or owner_id found.')

Joined DataFrame shape: (1403189, 7)


Unnamed: 0,uuid,user_id,event_time,serviceType,uuid_dup,event_time_dup,serviceType_dup
0,7bf15790-ea0d-4bb8-bfb1-8c621c9a791d,00072d34-a056-461a-a26b-a7e2a19990ae,2025-06-20 21:40:25.706,Walking,,,
1,,0011ef23-7b3d-4cd0-8fb2-1b1fa90a6ac6,,,53aa1be4-f5a9-4582-b351-5533a3fed538,2025-07-02 11:16:45.598,Sitting
2,394857ac-4741-4b6e-8e19-91b76b6ad34b,0022c99c-41c2-4edf-a189-f2db73bc1da1,2025-06-07 00:21:30.633,Walking,,,
3,62fa94c5-022d-4274-8475-a6d7b5ac4c96,00564446-c7c5-4436-969a-7c2c9edf83ee,2025-07-09 23:48:40.079,Planned,,,
4,a80bf625-ecb4-4548-9817-23db1c02d945,00564446-c7c5-4436-969a-7c2c9edf83ee,2025-07-09 23:49:07.119,Planned,,,
