In [6]:
import os
import pandas as pd

# Path to the folder with the CSV files
csv_folder = '../tubitakaiagentprojeleriiinverisetleri/'

import pandas as pd

# Load the datasets
try:
    purchase_df = pd.read_csv(csv_folder+'Purchase.csv')
    before_purchase_df = pd.read_csv(csv_folder+'BeforePurchaseDetailsScreen.csv')
    dog_added_df = pd.read_csv(csv_folder+'DogAdded.csv')
    cat_added_df = pd.read_csv(csv_folder+'CatAdded.csv')
    checkout_opened_df = pd.read_csv(csv_folder+'CheckoutPageOpened.csv')
    address_added_df = pd.read_csv(csv_folder+'AddressAdded.csv')
    sign_up_df = pd.read_csv(csv_folder+'SignUpCompleted.csv')
    creditcard_added_df = pd.read_csv(csv_folder+'CreditcardAdded.csv')

    # Inspect each dataframe
    print("Purchase.csv info and head:")
    purchase_df.info()
    purchase_df.head()

    print("\nBeforePurchaseDetailsScreen.csv info and head:")
    before_purchase_df.info()
    before_purchase_df.head()

    print("\nDogAdded.csv info and head:")
    dog_added_df.info()
    dog_added_df.head()

    print("\nCatAdded.csv info and head:")
    cat_added_df.info()
    cat_added_df.head()

    print("\nCheckoutPageOpened.csv info and head:")
    checkout_opened_df.info()
    checkout_opened_df.head()
    
    print("\nAddressAdded.csv info and head:")
    address_added_df.info()
    address_added_df.head()
    
    print("\nSignUpCompleted.csv info and head:")
    sign_up_df.info()
    sign_up_df.head()
    
    print("\nCreditcardAdded.csv info and head:")
    creditcard_added_df.info()
    creditcard_added_df.head()

except FileNotFoundError as e:
    print(f"Error loading files: {e}. Please ensure all files are uploaded.")
except Exception as e:
    print(f"An error occurred: {e}")

Purchase.csv info and head:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28770 entries, 0 to 28769
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   serviceid         28770 non-null  object
 1   ownerid           28768 non-null  object
 2   ordercreatedtime  28770 non-null  object
 3   servicetype       28770 non-null  object
dtypes: object(4)
memory usage: 899.2+ KB

BeforePurchaseDetailsScreen.csv info and head:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36911 entries, 0 to 36910
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           15209 non-null  object
 1   user_id        15209 non-null  object
 2   event_time     15209 non-null  object
 3   serviceType    15209 non-null  object
 4   uuid.1         21702 non-null  object
 5   user_id.1      21702 non-null  object
 6   event_time.1   21702 non-null  object
 7 

In [None]:
# --- Data Cleaning and Preparation ---

# 1. Clean BeforePurchaseDetailsScreen.csv
# The file has a strange structure with duplicated columns. We need to merge them.
before_purchase_part1 = before_purchase_df[['uuid', 'user_id', 'event_time', 'serviceType']].dropna()
before_purchase_part2 = before_purchase_df[['uuid.1', 'user_id.1', 'event_time.1', 'serviceType.1']].dropna()
before_purchase_part2.columns = ['uuid', 'user_id', 'event_time', 'serviceType']
before_purchase_df_cleaned = pd.concat([before_purchase_part1, before_purchase_part2], ignore_index=True)
before_purchase_df_cleaned.rename(columns={'user_id': 'ownerid'}, inplace=True)

before_purchase_df_cleaned['event_time'] = pd.to_datetime(before_purchase_df_cleaned['event_time'], errors='coerce')

# 2. Clean CheckoutPageOpened.csv
checkout_opened_df.rename(columns={'user_id': 'ownerid'}, inplace=True)

# 3. Clean Purchase.csv
purchase_df.dropna(subset=['ownerid'], inplace=True)
purchase_df.rename(columns={'servicetype': 'serviceType'}, inplace=True)

# 4. Clean and merge DogAdded.csv and CatAdded.csv
dog_added_df['pet_type'] = 'dog'
dog_added_df.rename(columns={'dogid': 'petid'}, inplace=True)

cat_added_df['pet_type'] = 'cat'
cat_added_df.rename(columns={'catid': 'petid'}, inplace=True)

pets_df = pd.concat([dog_added_df, cat_added_df], ignore_index=True)

# 5. Clean AddressAdded.csv
address_added_df_cleaned = address_added_df.dropna(subset=['user_id', 'addressid'])
address_added_df_cleaned.rename(columns={'user_id': 'ownerid'}, inplace=True)

# 6. Clean SignUpCompleted.csv
sign_up_df_cleaned = sign_up_df.dropna(subset=['user_id'])
sign_up_df_cleaned.rename(columns={'user_id': 'ownerid'}, inplace=True)

# 7. Clean CreditcardAdded.csv
creditcard_added_df_cleaned = creditcard_added_df.dropna(subset=['user_id', 'creditcardid'])
creditcard_added_df_cleaned.rename(columns={'user_id': 'ownerid'}, inplace=True)

# --- Merging DataFrames ---

# Merge checkout and purchase data to create a conversion funnel
# We are considering a conversion if a user who opened the checkout page for a service type made a purchase for the same service type.
# To do that, first we will aggregate the checkout and purchase data by ownerid and serviceType
checkout_agg = checkout_opened_df.groupby(['ownerid', 'serviceType']).size().reset_index(name='checkout_count')
purchase_agg = purchase_df.groupby(['ownerid', 'serviceType']).size().reset_index(name='purchase_count')

# Now, we will merge these aggregated dataframes
conversion_df = pd.merge(checkout_agg, purchase_agg, on=['ownerid', 'serviceType'], how='left')
conversion_df['purchase_count'].fillna(0, inplace=True)
conversion_df['converted'] = conversion_df['purchase_count'] > 0

# Merge conversion data with pet data
merged_df = pd.merge(conversion_df, pets_df, on='ownerid', how='left')

# Merge with address, sign up e creditcard info
merged_df = pd.merge(merged_df, address_added_df_cleaned, on='ownerid', how='left', suffixes=('', '_address'))
merged_df = pd.merge(merged_df, sign_up_df_cleaned, on='ownerid', how='left', suffixes=('', '_signup'))
merged_df = pd.merge(merged_df, creditcard_added_df_cleaned, on='ownerid', how='left', suffixes=('', '_creditcard'))


Possible relations between tables:
- AddressAdded <-> CatAdded: common columns: ['ownerid']
- AddressAdded <-> CreditcardAdded: common columns: ['ownerid']
- AddressAdded <-> DogAdded: common columns: ['ownerid']
- AddressAdded <-> Purchase: common columns: ['ownerid']
- BeforePurchaseDetailsScreen <-> CheckoutPageOpened: common columns: ['user_id', 'uuid', 'serviceType', 'event_time']
- CatAdded <-> AddressAdded: common columns: ['ownerid']
- CatAdded <-> CreditcardAdded: common columns: ['ownerid']
- CatAdded <-> DogAdded: common columns: ['ownerid', 'gender', 'birthday', 'weight', 'breed']
- CatAdded <-> Purchase: common columns: ['ownerid']
- CheckoutPageOpened <-> BeforePurchaseDetailsScreen: common columns: ['user_id', 'uuid', 'serviceType', 'event_time']
- CreditcardAdded <-> AddressAdded: common columns: ['ownerid']
- CreditcardAdded <-> CatAdded: common columns: ['ownerid']
- CreditcardAdded <-> DogAdded: common columns: ['ownerid']
- CreditcardAdded <-> Purchase: common colu

MemoryError: Unable to allocate 9.95 GiB for an array with shape (11, 121370913) and data type object