In [1]:
# Install necessary packages
!pip install PyAthena[SQLAlchemy]
!pip install CurrencyConverter

# Import necessary libraries
from sqlalchemy import create_engine
import pandas as pd
import boto3
from datetime import datetime, timedelta
from currency_converter import CurrencyConverter, ECB_URL
import numpy as np
import html
import json
import re
from functools import reduce
import pytz

# Run settings. Note that `today` is the current local timestamp shifted back
# by one day (i.e. "yesterday at this time"), not the current date.
prod = True
today = datetime.now() + timedelta(days=-1)

# S3 location handed to Athena through the `s3_staging_dir` URL parameter.
s3_staging_dir = "s3://ets-aws-plalab-dii-prod-analyticsbucket-1ktrlhzbrcbkb/athena_query_results/"

# SQLAlchemy URL for the PyAthena REST dialect. The credential part (`:@`) is
# left empty on purpose here.
# NOTE(review): presumably credentials are resolved from the ambient AWS
# environment (instance role / profile) -- confirm.
connection_string = (
    "awsathena+rest://:@athena.us-east-1.amazonaws.com:443/"
    "labsprodeventsdatabase-x806vjuzpbrd"
    "?s3_staging_dir=" + s3_staging_dir
)
engine = create_engine(connection_string)

# Function to pull Parquet files from S3
def pull_parquet_files_pandas_Target_features(file_name, bucket_name='ets-dii-testready-ds-analytics'):
    """Load every Parquet object under one feature prefix into a DataFrame.

    Objects are listed in ``bucket_name`` under the prefix
    ``Eben/Features/file_name=<file_name>/``; each key ending in ``.parquet``
    is read with the pyarrow engine and the rows are stacked together.

    Args:
        file_name: Value substituted into the ``file_name=`` segment of the
            S3 prefix.
        bucket_name: S3 bucket to list and read from.

    Returns:
        A DataFrame holding the rows of all matching files with a fresh
        0..n-1 index, or an empty DataFrame when no ``.parquet`` key exists
        under the prefix.
    """
    s3_client = boto3.client('s3')
    prefix = f'Eben/Features/file_name={file_name}/'

    def list_parquet_files(bucket, prefix):
        """Return the ``s3://`` URLs of all ``.parquet`` keys under *prefix*."""
        file_paths = []
        # list_objects_v2 returns results in pages, so walk all of them.
        paginator = s3_client.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # Pages for an empty prefix carry no 'Contents' entry.
            for content in page.get('Contents', []):
                key = content.get('Key')
                if key.endswith('.parquet'):
                    file_paths.append(f"s3://{bucket}/{key}")
        return file_paths

    file_paths = list_parquet_files(bucket_name, prefix)
    if not file_paths:
        # pd.concat raises on an empty list; keep the "no files" result an
        # empty frame, as callers received before.
        return pd.DataFrame()

    # Read everything first and concatenate once: concatenating inside the
    # loop re-copies all previously accumulated rows for every file.
    frames = [pd.read_parquet(file_path, engine='pyarrow') for file_path in file_paths]
    return pd.concat(frames, ignore_index=True)

# Pull each feature table (one S3 `file_name=` prefix per table).
free_test_section = pull_parquet_files_pandas_Target_features(file_name='free_test_section')
target = pull_parquet_files_pandas_Target_features(file_name='Target')
modifile_free_test = pull_parquet_files_pandas_Target_features(file_name='modifile_free_test')
modifile_max_streak = pull_parquet_files_pandas_Target_features(file_name='modifile_max_streak')
modifile_study_plan = pull_parquet_files_pandas_Target_features(file_name='modifile_study_plan')
page_view = pull_parquet_files_pandas_Target_features(file_name='page_view')
paid_prep = pull_parquet_files_pandas_Target_features(file_name='paid_prep')
view_feedback = pull_parquet_files_pandas_Target_features(file_name='view_feedback')
view_plan = pull_parquet_files_pandas_Target_features(file_name='view_plan')

# Report the row count of every table before anything is joined.
print("Sample size before merging:")
datasets = dict(
    free_test_section=free_test_section,
    target=target,
    modifile_free_test=modifile_free_test,
    modifile_max_streak=modifile_max_streak,
    modifile_study_plan=modifile_study_plan,
    page_view=page_view,
    paid_prep=paid_prep,
    view_feedback=view_feedback,
    view_plan=view_plan,
)

for name, df in datasets.items():
    print(f"{name}: {len(df)}")

# Left-join every feature table onto `target` by user_id, in a fixed order.
# Columns that clash keep their name on the left side and get a
# `_<table name>` suffix on the right side.
tables_to_join = (
    ('free_test_section', free_test_section),
    ('modifile_free_test', modifile_free_test),
    ('modifile_max_streak', modifile_max_streak),
    ('modifile_study_plan', modifile_study_plan),
    ('page_view', page_view),
    ('paid_prep', paid_prep),
    ('view_feedback', view_feedback),
    ('view_plan', view_plan),
)
merged_df = target
for suffix, table in tables_to_join:
    merged_df = merged_df.merge(table, on='user_id', how='left', suffixes=('', f'_{suffix}'))

# Remove every column whose name contains `country_code`, `first_date`, or
# `country` not directly followed by `_name`. The match is case-sensitive,
# so an upper-case COUNTRY column is kept.
drop_pattern = re.compile(r'country_code|first_date|country(?!_name)')
columns_to_drop = [col for col in merged_df.columns if drop_pattern.search(col)]
merged_df = merged_df.drop(columns=columns_to_drop)

# Row count after the joins.
print(f"Sample size after merging: {len(merged_df)}")

# Persist the merged frame as Parquet ...
final_parquet_path = 'merged_dataset_final.parquet'
merged_df.to_parquet(final_parquet_path, index=False)

# ... and as CSV (only the CSV path is reported below).
final_csv_path = 'merged_dataset_final.csv'
merged_df.to_csv(final_csv_path, index=False)

print(f"Final merged data saved to: {final_csv_path}")



  engine = create_engine(connection_string)


Sample size before merging:
free_test_section: 1802
target: 6000
modifile_free_test: 1921
modifile_max_streak: 4494
modifile_study_plan: 4336
page_view: 4953
paid_prep: 1948
view_feedback: 2061
view_plan: 1556
Sample size after merging: 6000
Final merged data saved to: merged_dataset_final.csv
