In [1]:
import pandas as pd

In [2]:
def read_csv_files(file_paths):
    """
    Read multiple CSV files into a dictionary of DataFrames.

    Args:
        file_paths (dict): A dictionary containing file paths with keys as DataFrame names.

    Returns:
        dict: A dictionary containing DataFrames with keys as DataFrame names.
    """
    data_frames = {}
    for name, path in file_paths.items():
        data_frames[name] = pd.read_csv(path)
    return data_frames

In [56]:
file_paths = {
    "members": "data/members_v3.csv",
    "train": "data/train_v2.csv",
    "transactions": "data/transactions_v2.csv",
    "logs": "data/user_logs_v2.csv"
}

In [57]:
data_frames = read_csv_files(file_paths)

# Access individual DataFrames
members_v3 = data_frames["members"]
train_v2 = data_frames["train"]
transactions_v2 = data_frames["transactions"]
user_logs_v2 = data_frames["logs"]

## Compressing members_df

In [16]:
members_v3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6769473 entries, 0 to 6769472
Data columns (total 6 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   msno                    object
 1   city                    int64 
 2   bd                      int64 
 3   gender                  object
 4   registered_via          int64 
 5   registration_init_time  int64 
dtypes: int64(4), object(2)
memory usage: 309.9+ MB


In [17]:
members_v3.dropna(inplace=True)

In [18]:
members_v3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2339968 entries, 4 to 6769470
Data columns (total 6 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   msno                    object
 1   city                    int64 
 2   bd                      int64 
 3   gender                  object
 4   registered_via          int64 
 5   registration_init_time  int64 
dtypes: int64(4), object(2)
memory usage: 125.0+ MB


In [19]:
# Assuming members_df is your DataFrame
# Convert integer columns to float32
integer_cols = ['city', 'bd', 'registered_via', 'registration_init_time']
members_v3[integer_cols] = members_v3[integer_cols].astype('float32')

# Convert object columns to float32 if possible
for col in members_v3.select_dtypes(include=['object']).columns:
    try:
        members_v3[col] = members_v3[col].astype('float32')
    except ValueError:
        pass  # Cannot convert to float, leave it as object

# Check the data types after conversion
print(members_v3.dtypes)


msno                       object
city                      float32
bd                        float32
gender                     object
registered_via            float32
registration_init_time    float32
dtype: object


In [20]:
members_v3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2339968 entries, 4 to 6769470
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   msno                    object 
 1   city                    float32
 2   bd                      float32
 3   gender                  object 
 4   registered_via          float32
 5   registration_init_time  float32
dtypes: float32(4), object(2)
memory usage: 89.3+ MB


In [21]:
# Define the file path
file_path = 'data/members_v3.csv.gz'

# Export DataFrame to CSV file with gzip compression
members_v3.to_csv(file_path, index=False, compression='gzip')


## Compressing train_v2

In [22]:
train_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 970960 entries, 0 to 970959
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   msno      970960 non-null  object
 1   is_churn  970960 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 14.8+ MB


In [31]:
# Define the file path
file_path = 'data/train_v2.csv.gz'

# Export DataFrame to CSV file with gzip compression
train_v2.to_csv(file_path, index=False, compression='gzip')

## ## Compressing transactions_v2

In [24]:
transactions_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1431009 entries, 0 to 1431008
Data columns (total 9 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   msno                    1431009 non-null  object
 1   payment_method_id       1431009 non-null  int64 
 2   payment_plan_days       1431009 non-null  int64 
 3   plan_list_price         1431009 non-null  int64 
 4   actual_amount_paid      1431009 non-null  int64 
 5   is_auto_renew           1431009 non-null  int64 
 6   transaction_date        1431009 non-null  int64 
 7   membership_expire_date  1431009 non-null  int64 
 8   is_cancel               1431009 non-null  int64 
dtypes: int64(8), object(1)
memory usage: 98.3+ MB


In [27]:
transactions_v2.dropna(inplace=True)

In [28]:
transactions_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1431009 entries, 0 to 1431008
Data columns (total 9 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   msno                    1431009 non-null  object
 1   payment_method_id       1431009 non-null  int64 
 2   payment_plan_days       1431009 non-null  int64 
 3   plan_list_price         1431009 non-null  int64 
 4   actual_amount_paid      1431009 non-null  int64 
 5   is_auto_renew           1431009 non-null  int64 
 6   transaction_date        1431009 non-null  int64 
 7   membership_expire_date  1431009 non-null  int64 
 8   is_cancel               1431009 non-null  int64 
dtypes: int64(8), object(1)
memory usage: 98.3+ MB


In [29]:
# Assuming transactions_v2 is your DataFrame

# Convert integer columns to float32
integer_cols = transactions_v2.select_dtypes(include=['int64']).columns
transactions_v2[integer_cols] = transactions_v2[integer_cols].astype('float32')

# Convert other numerical columns to float32
numerical_cols = transactions_v2.select_dtypes(exclude=['object']).columns
for col in numerical_cols:
    if col not in integer_cols:  # Skip columns already converted to float32
        transactions_v2[col] = transactions_v2[col].astype('float32')

# Check the data types after conversion
print(transactions_v2.dtypes)


msno                       object
payment_method_id         float32
payment_plan_days         float32
plan_list_price           float32
actual_amount_paid        float32
is_auto_renew             float32
transaction_date          float32
membership_expire_date    float32
is_cancel                 float32
dtype: object


In [30]:
transactions_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1431009 entries, 0 to 1431008
Data columns (total 9 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   msno                    1431009 non-null  object 
 1   payment_method_id       1431009 non-null  float32
 2   payment_plan_days       1431009 non-null  float32
 3   plan_list_price         1431009 non-null  float32
 4   actual_amount_paid      1431009 non-null  float32
 5   is_auto_renew           1431009 non-null  float32
 6   transaction_date        1431009 non-null  float32
 7   membership_expire_date  1431009 non-null  float32
 8   is_cancel               1431009 non-null  float32
dtypes: float32(8), object(1)
memory usage: 54.6+ MB


In [32]:
# Define the file path
file_path = 'data/transactions_v2.csv.gz'

# Export DataFrame to CSV file with gzip compression
transactions_v2.to_csv(file_path, index=False, compression='gzip')

## ## Compressing user_logs_v2

In [58]:
user_logs_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18396362 entries, 0 to 18396361
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   msno        object 
 1   date        int64  
 2   num_25      int64  
 3   num_50      int64  
 4   num_75      int64  
 5   num_985     int64  
 6   num_100     int64  
 7   num_unq     int64  
 8   total_secs  float64
dtypes: float64(1), int64(7), object(1)
memory usage: 1.2+ GB


In [59]:
user_logs_v2.dropna(inplace=True)

In [60]:
user_logs_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18396362 entries, 0 to 18396361
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   msno        object 
 1   date        int64  
 2   num_25      int64  
 3   num_50      int64  
 4   num_75      int64  
 5   num_985     int64  
 6   num_100     int64  
 7   num_unq     int64  
 8   total_secs  float64
dtypes: float64(1), int64(7), object(1)
memory usage: 1.2+ GB


In [61]:
# Convert all numerical columns to float32
numerical_cols = user_logs_v2.select_dtypes(include=['int64', 'float64']).columns
user_logs_v2[numerical_cols] = user_logs_v2[numerical_cols].astype('float32')

# Check the data types after conversion
print(user_logs_v2.dtypes)


msno           object
date          float32
num_25        float32
num_50        float32
num_75        float32
num_985       float32
num_100       float32
num_unq       float32
total_secs    float32
dtype: object


In [62]:
user_logs_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18396362 entries, 0 to 18396361
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   msno        object 
 1   date        float32
 2   num_25      float32
 3   num_50      float32
 4   num_75      float32
 5   num_985     float32
 6   num_100     float32
 7   num_unq     float32
 8   total_secs  float32
dtypes: float32(8), object(1)
memory usage: 701.8+ MB


In [63]:
# Columns to drop
columns_to_drop = ['num_25', 'num_50', 'num_75', 'num_985', 'num_100']

# Drop columns
user_logs_v2 = user_logs_v2.drop(columns=columns_to_drop, axis=1)

In [64]:
user_logs_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18396362 entries, 0 to 18396361
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   msno        object 
 1   date        float32
 2   num_unq     float32
 3   total_secs  float32
dtypes: float32(3), object(1)
memory usage: 350.9+ MB


In [65]:
user_logs_v2 = train_v2.merge(user_logs_v2, on='msno', how='left')

In [66]:
user_logs_v2.dropna(inplace=True)

In [67]:
user_logs_v2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13532944 entries, 0 to 13749351
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   msno        object 
 1   is_churn    int64  
 2   date        float32
 3   num_unq     float32
 4   total_secs  float32
dtypes: float32(3), int64(1), object(1)
memory usage: 464.6+ MB


In [70]:
# Undersample the DataFrame by 70%
undersampled_user_logs_v2 = user_logs_v2.sample(frac=0.15, random_state=42)


In [71]:
undersampled_user_logs_v2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2029942 entries, 7056252 to 8241382
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   msno        object 
 1   is_churn    int64  
 2   date        float32
 3   num_unq     float32
 4   total_secs  float32
dtypes: float32(3), int64(1), object(1)
memory usage: 69.7+ MB


In [73]:
# Define the file path
file_path = 'data/user_logs_v2.csv.gz'

# Export DataFrame to CSV file with gzip compression
undersampled_user_logs_v2.to_csv(file_path, index=False, compression='gzip')