# myDataFetching1: STATISTICS_PLAYERS_ALL.JSON into CSV

In [None]:
import json
import pandas as pd
import boto3
from io import StringIO
from datetime import datetime
import time
import concurrent.futures

# S3 client created once at import time; every function in this cell uses it.
s3 = boto3.client('s3')

def lambda_handler(event, context):
    """Convert every ``statistics_players_all.json`` object to CSV concurrently.

    Keys in the source bucket that end with the pattern are collected, then one
    ``process_file_if_not_exists`` task per key is submitted to a thread pool.
    The CSV key is the part of the source key before the first ``' - '``
    (stripped) followed by ``' - statistics_players_all.csv'``.

    Args:
        event: Unused.
        context: Unused.

    Returns:
        dict: ``statusCode`` 200 with a JSON-encoded message in ``body``.
    """
    began = time.time()
    source_bucket_name = 'dwlprojectbucket'
    target_bucket_name = 'datalakepartition2'

    # Only keys ending with this suffix are converted.
    pattern = 'statistics_players_all.json'
    csv_stem = pattern.split(".")[0]

    matching_keys = list_objects_with_pagination(source_bucket_name, pattern)
    print(f"Time to list objects in source bucket: {time.time() - began} seconds")

    if not matching_keys:
        nothing_msg = 'No files to process that match the pattern.'
        print(nothing_msg)
        return {'statusCode': 200, 'body': json.dumps(nothing_msg)}

    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = []
        for source_key in matching_keys:
            # Everything before the first ' - ' is reused as the CSV name prefix.
            prefix = source_key.split(' - ')[0].strip()
            pending.append(
                pool.submit(
                    process_file_if_not_exists,
                    source_bucket_name,
                    target_bucket_name,
                    source_key,
                    transform_statistics_players_all,
                    f'{prefix} - {csv_stem}.csv',
                )
            )

        # Block until every task is done; result() would surface a task's exception.
        for finished in concurrent.futures.as_completed(pending):
            finished.result()

    print(f"Total execution time: {time.time() - began} seconds")
    return {'statusCode': 200, 'body': json.dumps('CSV files created successfully')}

def process_file_if_not_exists(source_bucket_name, target_bucket_name, key, transform_func, output_file_key):
    """Convert one source object to CSV, then log a preview of the parsed data.

    Despite the name, no existence check is made: the object is always
    processed. Any exception is caught and printed, never re-raised.

    Args:
        source_bucket_name: Bucket holding the JSON object.
        target_bucket_name: Bucket that receives the CSV.
        key: Key of the JSON object in the source bucket.
        transform_func: Callable mapping a list of text lines to a DataFrame.
        output_file_key: Key under which the CSV is stored.
    """
    try:
        print(f"Processing file: {key}")
        process_file(source_bucket_name, target_bucket_name, key, transform_func, output_file_key)

        # NOTE(review): process_file has already fetched and transformed this
        # object; the second download below exists only to print a preview.
        body_text = s3.get_object(Bucket=source_bucket_name, Key=key)['Body'].read().decode('utf-8')
        preview_df = transform_func(body_text.splitlines())

        # First rows only, for inspection in the logs.
        print(f"DataFrame loaded for file {key}:\n", preview_df.head())
    except Exception as err:
        print(f"Error processing file {key}: {err}")

def check_file_exists(bucket_name, key):
    """Return whether ``key`` exists in ``bucket_name``.

    Uses a HEAD request. Only a "not found" response is reported as ``False``;
    any other client error (for example access denied or throttling) is
    re-raised so that it is not mistaken for a missing object.

    Args:
        bucket_name: Bucket to look in.
        key: Object key to test.

    Returns:
        bool: ``True`` when the HEAD request succeeds, ``False`` when the
        service reports the object as not found.

    Raises:
        botocore ClientError: For any error code other than not-found.
    """
    try:
        s3.head_object(Bucket=bucket_name, Key=key)
    except s3.exceptions.ClientError as err:
        error_code = str(err.response.get('Error', {}).get('Code', ''))
        if error_code in ('404', 'NoSuchKey', 'NotFound'):
            return False
        raise
    return True

def list_objects_with_pagination(bucket_name, pattern):
    """Return every key in ``bucket_name`` that ends with ``pattern``.

    Follows continuation tokens until the listing is no longer truncated.

    Args:
        bucket_name: Bucket to list.
        pattern: Suffix a key must end with to be included.

    Returns:
        list: Matching keys in listing order.
    """
    matches = []
    next_token = None

    while True:
        request = {'Bucket': bucket_name}
        if next_token:
            request['ContinuationToken'] = next_token
        page = s3.list_objects_v2(**request)

        # An empty page has no 'Contents' entry at all.
        matches.extend(
            entry['Key']
            for entry in page.get('Contents', [])
            if entry['Key'].endswith(pattern)
        )

        if not page.get('IsTruncated'):
            return matches
        next_token = page.get('NextContinuationToken')

def process_file(source_bucket_name, target_bucket_name, key, transform_func, output_file_key):
    """Download one object, transform it to a DataFrame and upload it as CSV.

    Each stage is timed and the duration printed. Nothing is uploaded when the
    transform yields an empty DataFrame. Any exception is caught and printed.

    Args:
        source_bucket_name: Bucket holding the input object.
        target_bucket_name: Bucket that receives the CSV.
        key: Key of the input object.
        transform_func: Callable mapping a list of text lines to a DataFrame.
        output_file_key: Key under which the CSV is stored.
    """
    try:
        tick = time.time()
        s3_object = s3.get_object(Bucket=source_bucket_name, Key=key)
        print(f"Time to fetch file {key}: {time.time() - tick} seconds")

        tick = time.time()
        text_lines = s3_object['Body'].read().decode('utf-8').splitlines()
        print(f"Time to read and decode file {key}: {time.time() - tick} seconds")

        tick = time.time()
        frame = transform_func(text_lines)
        print(f"Time to transform file {key}: {time.time() - tick} seconds")

        # Skip the upload entirely when there is nothing to write.
        if frame.empty:
            print(f"No data extracted from file {key}")
            return

        sink = StringIO()
        tick = time.time()
        frame.to_csv(sink, index=False)
        print(f"Time to convert DataFrame to CSV for file {key}: {time.time() - tick} seconds")

        tick = time.time()
        s3.put_object(Bucket=target_bucket_name, Key=output_file_key, Body=sink.getvalue())
        print(f"Time to upload CSV for file {key}: {time.time() - tick} seconds")

    except Exception as err:
        print(f"Error processing file {key}: {err}")

# Transformation function for statistics_players_all.json

def transform_statistics_players_all(raw_data):
    """Flatten per-player fixture statistics into one DataFrame row per player.

    Each non-blank line of ``raw_data`` must be a JSON array whose elements are
    JSON-encoded strings, one per fixture. A fixture provides
    ``parameters.fixture`` and a ``response`` list of teams, each with a
    ``team`` dict and a ``players`` list.

    Per-player statistic sections (``games``, ``shots``, ``goals``, ...) are
    read leniently: a missing or null section, or an empty ``statistics``
    list, yields ``None`` for the affected columns instead of aborting the
    whole file. Structural keys (``parameters``, ``response``, ``team``,
    ``players``, ``player``) are still required and raise ``KeyError`` when
    absent.

    Args:
        raw_data: Iterable of text lines.

    Returns:
        pandas.DataFrame: One row per player; empty when nothing was parsed.
    """
    def section(stats, name):
        """Return the ``name`` sub-dict of ``stats``, or ``{}`` if absent or null."""
        value = stats.get(name)
        return value if isinstance(value, dict) else {}

    extracted_data = []

    for line in raw_data:
        # Blank lines carry no data; skip them without logging a decode error.
        if not line.strip():
            continue
        try:
            # The line is a JSON list whose items are JSON strings (double-encoded).
            outer_list = json.loads(line)
            for json_str in outer_list:
                fixture = json.loads(json_str)
                fixture_id = fixture['parameters']['fixture']
                for team_data in fixture['response']:
                    team = team_data['team']
                    team_id = team['id']
                    team_name = team['name']

                    for player_data in team_data['players']:
                        player = player_data['player']

                        # Only the first statistics entry is used; tolerate none at all.
                        stats_entries = player_data.get('statistics') or [{}]
                        statistics = stats_entries[0] or {}

                        games = section(statistics, 'games')
                        shots = section(statistics, 'shots')
                        goals = section(statistics, 'goals')
                        passes = section(statistics, 'passes')
                        tackles = section(statistics, 'tackles')
                        duels = section(statistics, 'duels')
                        dribbles = section(statistics, 'dribbles')
                        fouls = section(statistics, 'fouls')
                        cards = section(statistics, 'cards')
                        penalty = section(statistics, 'penalty')

                        extracted_data.append({
                            'fixtureID': fixture_id,
                            'team_Id': team_id,
                            'team_name': team_name,
                            'player_id': player['id'],
                            'player_name': player['name'],
                            'minutes': games.get('minutes'),
                            'number': games.get('number'),
                            'position': games.get('position'),
                            'rating': games.get('rating'),
                            'captain': games.get('captain'),
                            'substitute': games.get('substitute'),
                            'offsides': statistics.get('offsides'),
                            'shots_total': shots.get('total'),
                            'shots_on': shots.get('on'),
                            'goals_total': goals.get('total'),
                            'goals_conceded': goals.get('conceded'),
                            'goals_assists': goals.get('assists'),
                            'saves': goals.get('saves'),
                            'passes_total': passes.get('total'),
                            'passes_key': passes.get('key'),
                            'passes_accuracy': passes.get('accuracy'),
                            'tackles_total': tackles.get('total'),
                            'tackles_blocks': tackles.get('blocks'),
                            'tackles_interceptions': tackles.get('interceptions'),
                            'duels_total': duels.get('total'),
                            'duels_won': duels.get('won'),
                            'dribbles_attempts': dribbles.get('attempts'),
                            'dribbles_success': dribbles.get('success'),
                            'dribbles_past': dribbles.get('past'),
                            'fouls_drawn': fouls.get('drawn'),
                            'fouls_committed': fouls.get('committed'),
                            'yellow_cards': cards.get('yellow'),
                            'red_cards': cards.get('red'),
                            'penalty_won': penalty.get('won'),
                            # NOTE(review): 'commited' (single t) presumably mirrors the
                            # upstream payload's own spelling; confirm before changing.
                            'penalty_committed': penalty.get('commited'),
                            'penalty_scored': penalty.get('scored'),
                            'penalty_missed': penalty.get('missed'),
                            'penalty_saved': penalty.get('saved'),
                        })
        except json.JSONDecodeError as e:
            # A malformed line is logged and skipped; the remaining lines are still parsed.
            print(f"Error decoding JSON: {e}")
            continue

    return pd.DataFrame(extracted_data)


# myDataFetching2: PREDICTIONS16DAYS.JSON into CSV

In [None]:
import json
import pandas as pd
import boto3
from io import StringIO
from datetime import datetime

# S3 client created once at import time; every function in this cell uses it.
s3 = boto3.client('s3')

def lambda_handler(event, context):
    """Convert new ``predictions16days.json`` objects in the source bucket to CSV.

    A source key is processed when it ends with ``predictions16days.json`` and
    the derived CSV key (text before the first ``' - '`` plus
    ``' - predictions16days.csv'``) is not already in the target bucket.

    The source listing is paginated so that keys beyond the first page of
    results are not silently ignored.

    Args:
        event: Unused.
        context: Unused.

    Returns:
        dict: ``statusCode`` 200 with a JSON-encoded message in ``body``.
    """
    source_bucket_name = 'dwlprojectbucket'
    target_bucket_name = 'datalakepartition2'
    suffix = 'predictions16days.json'

    # Keys already present in the target bucket are never regenerated.
    existing_keys = get_existing_keys(target_bucket_name)

    # A single list_objects_v2 call returns one page only; walk all of them.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=source_bucket_name):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if not key.endswith(suffix):
                continue
            # The text before the first ' - ' is reused as the CSV name prefix.
            date_time = key.split(' - ')[0]
            output_file_key = f'{date_time} - predictions16days.csv'
            if output_file_key not in existing_keys:
                process_file(source_bucket_name, target_bucket_name, key, transform_predictions_16days, output_file_key)

    return {
        'statusCode': 200,
        'body': json.dumps('CSV file created successfully for predictions16days.json')
    }

def get_existing_keys(bucket_name):
    """Return the set of all object keys in ``bucket_name``.

    Follows continuation tokens until the listing is no longer truncated.

    Args:
        bucket_name: Bucket to list.

    Returns:
        set: Every key found.
    """
    seen = set()
    next_token = None
    more_pages = True
    while more_pages:
        request = {'Bucket': bucket_name}
        if next_token:
            request['ContinuationToken'] = next_token
        page = s3.list_objects_v2(**request)
        # An empty page has no 'Contents' entry at all.
        seen.update(entry['Key'] for entry in page.get('Contents', []))
        more_pages = bool(page.get('IsTruncated'))
        if more_pages:
            next_token = page.get('NextContinuationToken')
    return seen

def process_file(source_bucket_name, target_bucket_name, key, transform_func, output_file_key):
    """Download one object, transform it to a DataFrame and upload it as CSV.

    Any exception is caught and printed; nothing is re-raised.

    Args:
        source_bucket_name: Bucket holding the input object.
        target_bucket_name: Bucket that receives the CSV.
        key: Key of the input object.
        transform_func: Callable mapping a list of text lines to a DataFrame.
        output_file_key: Key under which the CSV is stored.
    """
    try:
        payload = s3.get_object(Bucket=source_bucket_name, Key=key)['Body'].read()
        frame = transform_func(payload.decode('utf-8').splitlines())

        # Serialise in memory, then upload the text in one request.
        sink = StringIO()
        frame.to_csv(sink, index=False)
        s3.put_object(Bucket=target_bucket_name, Key=output_file_key, Body=sink.getvalue())
    except Exception as err:
        print(f"Error processing file {key}: {err}")

def transform_predictions_16days(raw_data):
    """Flatten a 16-day forecast JSON document into one row per city and entry.

    ``raw_data`` is the document split into lines; the lines are re-joined and
    parsed as a JSON list of city records, each with a ``city`` dict and a
    ``list`` of forecast entries. Every field is read leniently: a missing
    value becomes ``None`` (``0`` for ``rain``) instead of aborting the file.

    Temperatures are converted with ``kelvin - 273.15`` and ``dt`` is split
    into UTC ``date`` and ``time`` ISO strings.

    Args:
        raw_data: List of text lines forming one JSON document.

    Returns:
        pandas.DataFrame: One row per (city, forecast entry).

    Raises:
        json.JSONDecodeError: If the joined text is not valid JSON.
    """
    # Function-scope import: the file's import block only brings in `datetime`.
    from datetime import timezone

    def safe_get(d, keys, default=None):
        """Walk ``keys`` through nested dicts/lists; return ``default`` on any miss."""
        for key in keys:
            try:
                d = d[key]
            except (KeyError, IndexError, TypeError):
                return default
        return d

    def kelvin_to_celsius(kelvin):
        return kelvin - 273.15

    def split_utc(timestamp):
        """Return (date, time) ISO strings in UTC, or (None, None) for ``None``."""
        if timestamp is None:
            return None, None
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp;
        # tzinfo is dropped so the output strings stay naive.
        moment = datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)
        return moment.date().isoformat(), moment.time().isoformat()

    data = json.loads('\n'.join(raw_data))  # Combine lines and load JSON
    extracted_data = []

    for city_data in data:
        # City metadata is repeated on every row of that city.
        city_columns = {
            'city_id': safe_get(city_data, ['city', 'id']),
            'city_name': safe_get(city_data, ['city', 'name']),
            'coord_lon': safe_get(city_data, ['city', 'coord', 'lon']),
            'coord_lat': safe_get(city_data, ['city', 'coord', 'lat']),
            'country': safe_get(city_data, ['city', 'country']),
            'population': safe_get(city_data, ['city', 'population']),
            'timezone': safe_get(city_data, ['city', 'timezone']),
        }

        for weather in safe_get(city_data, ['list']) or []:
            temp_day_kelvin = safe_get(weather, ['temp', 'day'])
            date_str, time_str = split_utc(safe_get(weather, ['dt']))

            extracted_data.append({
                **city_columns,
                'date': date_str,
                'time': time_str,
                'temp_day': kelvin_to_celsius(temp_day_kelvin) if temp_day_kelvin is not None else None,
                'pressure': safe_get(weather, ['pressure']),
                'humidity': safe_get(weather, ['humidity']),
                'weather_description': safe_get(weather, ['weather', 0, 'description']),
                'wind_speed': safe_get(weather, ['speed']),
                'wind_deg': safe_get(weather, ['deg']),
                'rain': safe_get(weather, ['rain'], 0),
            })

    return pd.DataFrame(extracted_data)


#  myDataFetching3: PREDICTIONS4DAYS.JSON into CSV 

In [None]:
import json
import pandas as pd
import boto3
from io import StringIO
from datetime import datetime

# S3 client created once at import time; every function in this cell uses it.
s3 = boto3.client('s3')

def lambda_handler(event, context):
    """Convert new ``predictions4days.json`` objects in the source bucket to CSV.

    A source key is processed when it ends with ``predictions4days.json`` and
    the derived CSV key (text before the first ``' - '`` plus
    ``' - predictions4days.csv'``) is not already in the target bucket.

    The source listing is paginated so that keys beyond the first page of
    results are not silently ignored.

    Args:
        event: Unused.
        context: Unused.

    Returns:
        dict: ``statusCode`` 200 with a JSON-encoded message in ``body``.
    """
    source_bucket_name = 'dwlprojectbucket'
    target_bucket_name = 'datalakepartition2'
    suffix = 'predictions4days.json'

    # Keys already present in the target bucket are never regenerated.
    existing_keys = get_existing_keys(target_bucket_name)
    print(f"Existing keys in target bucket: {existing_keys}")

    # A single list_objects_v2 call returns one page only; collect all of them.
    paginator = s3.get_paginator('list_objects_v2')
    source_keys = [
        obj['Key']
        for page in paginator.paginate(Bucket=source_bucket_name)
        for obj in page.get('Contents', [])
    ]

    if not source_keys:
        print("No files found in the source bucket.")
        return {
            'statusCode': 200,
            'body': json.dumps('No files found in the source bucket.')
        }

    for key in source_keys:
        print(f"Found file in source bucket: {key}")
        if not key.endswith(suffix):
            continue
        # The text before the first ' - ' is reused as the CSV name prefix.
        date_time = key.split(' - ')[0]
        output_file_key = f'{date_time} - predictions4days.csv'
        if output_file_key in existing_keys:
            print(f"Skipping already existing file: {output_file_key}")
            continue
        print(f"Processing file: {key}")
        process_file(source_bucket_name, target_bucket_name, key, transform_predictions_4d, output_file_key)

    return {
        'statusCode': 200,
        'body': json.dumps('CSV file created successfully for predictions4days.json')
    }

def get_existing_keys(bucket_name):
    """Return the set of all object keys in ``bucket_name``.

    Follows continuation tokens until the listing is no longer truncated.

    Args:
        bucket_name: Bucket to list.

    Returns:
        set: Every key found.
    """
    seen = set()
    next_token = None
    more_pages = True
    while more_pages:
        request = {'Bucket': bucket_name}
        if next_token:
            request['ContinuationToken'] = next_token
        page = s3.list_objects_v2(**request)
        # An empty page has no 'Contents' entry at all.
        seen.update(entry['Key'] for entry in page.get('Contents', []))
        more_pages = bool(page.get('IsTruncated'))
        if more_pages:
            next_token = page.get('NextContinuationToken')
    return seen

def process_file(source_bucket_name, target_bucket_name, key, transform_func, output_file_key):
    """Download one object, transform it to a DataFrame and upload it as CSV.

    Progress is printed at each stage. Any exception is caught and printed;
    nothing is re-raised.

    Args:
        source_bucket_name: Bucket holding the input object.
        target_bucket_name: Bucket that receives the CSV.
        key: Key of the input object.
        transform_func: Callable mapping a list of text lines to a DataFrame.
        output_file_key: Key under which the CSV is stored.
    """
    try:
        print(f"Downloading file: {key} from bucket: {source_bucket_name}")
        s3_object = s3.get_object(Bucket=source_bucket_name, Key=key)
        body_text = s3_object['Body'].read().decode('utf-8')

        frame = transform_func(body_text.splitlines())
        print(f"DataFrame created with {len(frame)} records")

        # Serialise in memory, then upload the text in one request.
        sink = StringIO()
        frame.to_csv(sink, index=False)

        print(f"Uploading transformed CSV to bucket: {target_bucket_name} with key: {output_file_key}")
        s3.put_object(Bucket=target_bucket_name, Key=output_file_key, Body=sink.getvalue())
        print(f"File {output_file_key} uploaded to {target_bucket_name}")

    except Exception as err:
        print(f"Error processing file {key}: {err}")

def transform_predictions_4d(raw_data):
    """Flatten a 4-day forecast JSON document into one row per city and entry.

    ``raw_data`` is the document split into lines; the lines are re-joined and
    parsed as a JSON list of city records, each with a ``city`` dict and a
    ``list`` of forecast entries. Every field is read leniently: a missing
    value becomes ``None`` (``0`` for ``rain``) instead of aborting the file.

    Temperatures (``main.temp``) are converted with ``kelvin - 273.15`` and
    ``dt`` is split into UTC ``date`` and ``time`` ISO strings.

    Args:
        raw_data: List of text lines forming one JSON document.

    Returns:
        pandas.DataFrame: One row per (city, forecast entry).

    Raises:
        json.JSONDecodeError: If the joined text is not valid JSON.
    """
    # Function-scope import: the file's import block only brings in `datetime`.
    from datetime import timezone

    def safe_get(d, keys, default=None):
        """Walk ``keys`` through nested dicts/lists; return ``default`` on any miss."""
        for key in keys:
            try:
                d = d[key]
            except (KeyError, IndexError, TypeError):
                return default
        return d

    def kelvin_to_celsius(kelvin):
        return kelvin - 273.15

    def split_utc(timestamp):
        """Return (date, time) ISO strings in UTC, or (None, None) for ``None``."""
        if timestamp is None:
            return None, None
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp;
        # tzinfo is dropped so the output strings stay naive.
        moment = datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)
        return moment.date().isoformat(), moment.time().isoformat()

    data = json.loads('\n'.join(raw_data))  # Combine lines and load JSON
    extracted_data = []

    for city_data in data:
        # City metadata is repeated on every row of that city.
        city_columns = {
            'city_id': safe_get(city_data, ['city', 'id']),
            'city_name': safe_get(city_data, ['city', 'name']),
            'coord_lon': safe_get(city_data, ['city', 'coord', 'lon']),
            'coord_lat': safe_get(city_data, ['city', 'coord', 'lat']),
            'country': safe_get(city_data, ['city', 'country']),
            'population': safe_get(city_data, ['city', 'population']),
            'timezone': safe_get(city_data, ['city', 'timezone']),
        }

        for weather in safe_get(city_data, ['list']) or []:
            temp_kelvin = safe_get(weather, ['main', 'temp'])
            date_str, time_str = split_utc(safe_get(weather, ['dt']))

            extracted_data.append({
                **city_columns,
                'date': date_str,
                'time': time_str,
                'temp_day': kelvin_to_celsius(temp_kelvin) if temp_kelvin is not None else None,
                'pressure': safe_get(weather, ['main', 'pressure']),
                'humidity': safe_get(weather, ['main', 'humidity']),
                'weather_description': safe_get(weather, ['weather', 0, 'description']),
                'wind_speed': safe_get(weather, ['wind', 'speed']),
                'wind_deg': safe_get(weather, ['wind', 'deg']),
                'rain': safe_get(weather, ['rain', '1h'], 0),
            })

    return pd.DataFrame(extracted_data)


# myDataFetching4: HISTORIC_WEATHER.JSON into CSV

In [None]:
import json
import pandas as pd
import boto3
from io import StringIO
import time
from datetime import datetime  # Importing datetime module

# Initialize the S3 client once at import time; every function in this cell uses it.
s3 = boto3.client('s3')

def lambda_handler(event, context):
    """Find ``historic_weather.json`` in the source bucket and convert it to CSV.

    The source bucket is listed page by page, printing every key, until a key
    exactly equal to ``historic_weather.json`` is seen; that object is then
    converted and stored as ``historic_weather.csv`` in the target bucket.

    Args:
        event: Unused.
        context: Unused.

    Returns:
        dict: ``statusCode`` 200 on success or when the file is absent, 500
        when any exception is raised; ``body`` is a JSON-encoded message.
    """
    began = time.time()
    source_bucket_name = 'dwlprojectbucket'
    target_bucket_name = 'datalakepartition2'
    target_file_key = 'historic_weather.csv'  # Output file name in the target bucket

    file_to_process = 'historic_weather.json'

    try:
        next_token = None
        file_found = False
        keep_listing = True

        while keep_listing:
            request = {'Bucket': source_bucket_name}
            if next_token:
                request['ContinuationToken'] = next_token
            page = s3.list_objects_v2(**request)

            # Elapsed time is measured from handler start, not per page.
            print(f"Time to list objects in source bucket: {time.time() - began} seconds")

            if 'Contents' not in page:
                print("No files found in source bucket.")
                break

            print("Listing all files in the source bucket:")
            for entry in page['Contents']:
                found_key = entry['Key']
                print(f"Found file: {found_key}")

                if found_key != file_to_process:
                    continue

                # Flag is set before processing, so a failure inside
                # process_file is reported as an error, not as "not found".
                file_found = True
                print(f"Found file to process: {found_key}")
                process_file(source_bucket_name, target_bucket_name, found_key, target_file_key)
                break

            # Stop once the file was handled or the listing is exhausted.
            keep_listing = not file_found and bool(page.get('IsTruncated'))
            if keep_listing:
                next_token = page.get('NextContinuationToken')

        if not file_found:
            not_found_msg = f'File {file_to_process} not found in source bucket.'
            print(not_found_msg)
            return {'statusCode': 200, 'body': json.dumps(not_found_msg)}

    except Exception as err:
        failure_msg = f"Error during processing: {err}"
        print(failure_msg)
        return {'statusCode': 500, 'body': json.dumps(failure_msg)}

    print(f"Total execution time: {time.time() - began} seconds")
    return {
        'statusCode': 200,
        'body': json.dumps(f'CSV file created successfully for {file_to_process}')
    }

def process_file(source_bucket_name, target_bucket_name, key, output_file_key):
    """Download a JSON object, flatten it and upload the result as CSV.

    Unlike a log-and-continue helper, this one prints the error and then
    re-raises it so the caller can react.

    Args:
        source_bucket_name: Bucket holding the JSON object.
        target_bucket_name: Bucket that receives the CSV.
        key: Key of the JSON object.
        output_file_key: Key under which the CSV is stored.

    Raises:
        Exception: Whatever the download, parse, transform or upload raised.
    """
    try:
        print(f"Fetching file {key} from bucket {source_bucket_name}...")
        body_text = s3.get_object(Bucket=source_bucket_name, Key=key)['Body'].read().decode('utf-8')
        print(f"Fetched and read file {key}")

        # Parse the whole document at once, then flatten it.
        frame = transform_historic_weather(json.loads(body_text))
        print(f"Transformed data to DataFrame with shape {frame.shape}")

        sink = StringIO()
        frame.to_csv(sink, index=False)
        print("Converted DataFrame to CSV")

        s3.put_object(Bucket=target_bucket_name, Key=output_file_key, Body=sink.getvalue())
        print(f"Uploaded CSV to bucket {target_bucket_name} with key {output_file_key}")

    except Exception as err:
        print(f"Error processing file {key}: {err}")
        raise

def transform_historic_weather(data):
    """Flatten historic weather records into one DataFrame row per observation.

    ``data`` is an iterable of city records, each a dict with ``city_id`` and
    a ``list`` of observations. Every field is read leniently: a missing or
    null ``list``, ``main`` or ``wind`` section yields no rows / ``None``
    values rather than raising.

    Temperatures are converted with ``kelvin - 273.15`` and ``dt`` is rendered
    as UTC ``datetime``, ``date`` and ``time`` ISO strings without an offset.

    Args:
        data: Parsed JSON; an iterable of city dicts.

    Returns:
        pandas.DataFrame: One row per (city, observation).
    """
    # Function-scope import: the file's import block only brings in `datetime`.
    from datetime import timezone

    def safe_get(d, keys, default=None):
        """Walk ``keys`` through nested dicts/lists; return ``default`` on any miss."""
        for key in keys:
            try:
                d = d[key]
            except (KeyError, IndexError, TypeError):
                return default
        return d

    def as_dict(value):
        """Return ``value`` when it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    def kelvin_to_celsius(kelvin):
        return kelvin - 273.15

    extracted_data = []

    for city_data in data:
        city_id = city_data.get('city_id')

        # `or []` also covers an explicit null list.
        for weather in city_data.get('list') or []:
            dt = safe_get(weather, ['dt'])
            # A section may be absent or null; treat both as empty.
            main = as_dict(safe_get(weather, ['main']))
            wind = as_dict(safe_get(weather, ['wind']))

            temp_kelvin = main.get('temp')
            temp_celsius = kelvin_to_celsius(temp_kelvin) if temp_kelvin is not None else None

            if dt is not None:
                # Timezone-aware replacement for the deprecated
                # datetime.utcfromtimestamp; tzinfo is dropped so the ISO
                # strings keep their offset-free form.
                dt_obj = datetime.fromtimestamp(dt, tz=timezone.utc).replace(tzinfo=None)
                date_str = dt_obj.date().isoformat()
                time_str = dt_obj.time().isoformat()
                datetime_str = dt_obj.isoformat()
            else:
                date_str = time_str = datetime_str = None

            extracted_data.append({
                'city_id': city_id,
                'datetime': datetime_str,
                'date': date_str,
                'time': time_str,
                'temp': temp_celsius,
                'pressure': main.get('pressure'),
                'humidity': main.get('humidity'),
                'wind_speed': wind.get('speed'),
                'clouds': safe_get(weather, ['clouds', 'all'], None),
                'weather_description': safe_get(weather, ['weather', 0, 'description'], None),
            })

    return pd.DataFrame(extracted_data)


# myDataFetching5: FIXTURES.JSON into CSV

In [None]:
import json
import pandas as pd
import boto3
from io import StringIO
from datetime import datetime
import time

# S3 client created once at import time; every function in this cell uses it.
s3 = boto3.client('s3')

def lambda_handler(event, context):
    """Convert new ``fixtures.json`` objects in the source bucket to CSV.

    A source key is processed when it ends with a pattern from
    ``file_transformations`` and the derived CSV key (stripped text before the
    first ``' - '`` plus ``' - <pattern stem>.csv'``) is not already in the
    target bucket.

    The source listing is paginated so that keys beyond the first page of
    results are not silently ignored.

    Args:
        event: Unused.
        context: Unused.

    Returns:
        dict: ``statusCode`` 200 with a JSON-encoded message in ``body``.
    """
    start_time = time.time()
    source_bucket_name = 'dwlprojectbucket'
    target_bucket_name = 'datalakepartition2'

    # Suffix -> transform function; only fixtures.json is handled here.
    file_transformations = {
        'fixtures.json': transform_fixtures,
    }

    # Keys already present in the target bucket are never regenerated.
    existing_keys = get_existing_keys(target_bucket_name)
    print(f"Time to fetch existing keys: {time.time() - start_time} seconds")
    print(f"Existing keys in target bucket: {existing_keys}")

    # A single list_objects_v2 call returns one page only; collect all of them.
    paginator = s3.get_paginator('list_objects_v2')
    source_keys = [
        obj['Key']
        for page in paginator.paginate(Bucket=source_bucket_name)
        for obj in page.get('Contents', [])
    ]
    print(f"Time to list objects in source bucket: {time.time() - start_time} seconds")

    if not source_keys:
        print("No files found in source bucket.")
        return {
            'statusCode': 200,
            'body': json.dumps('No files found in source bucket.')
        }

    for key in source_keys:
        for pattern, transform_func in file_transformations.items():
            if not key.endswith(pattern):
                continue
            # The text before the first ' - ' is reused as the CSV name prefix.
            date_time = key.split(' - ')[0].strip()
            output_file_key = f'{date_time} - {pattern.split(".")[0]}.csv'
            if output_file_key not in existing_keys:
                print(f"Processing file: {key}")
                process_file(source_bucket_name, target_bucket_name, key, transform_func, output_file_key)

    print(f"Total execution time: {time.time() - start_time} seconds")
    return {
        'statusCode': 200,
        'body': json.dumps('CSV files created successfully')
    }

def get_existing_keys(bucket_name):
    """Return the set of all object keys in ``bucket_name``.

    Follows continuation tokens until the listing is no longer truncated.

    Args:
        bucket_name: Bucket to list.

    Returns:
        set: Every key found.
    """
    seen = set()
    next_token = None
    more_pages = True
    while more_pages:
        request = {'Bucket': bucket_name}
        if next_token:
            request['ContinuationToken'] = next_token
        page = s3.list_objects_v2(**request)
        # An empty page has no 'Contents' entry at all.
        seen.update(entry['Key'] for entry in page.get('Contents', []))
        more_pages = bool(page.get('IsTruncated'))
        if more_pages:
            next_token = page.get('NextContinuationToken')
    return seen

def process_file(source_bucket_name, target_bucket_name, key, transform_func, output_file_key):
    """Download one object, transform it to a DataFrame and upload it as CSV.

    Each stage is timed and the duration printed. Any exception is caught and
    printed; nothing is re-raised.

    Args:
        source_bucket_name: Bucket holding the input object.
        target_bucket_name: Bucket that receives the CSV.
        key: Key of the input object.
        transform_func: Callable mapping a list of text lines to a DataFrame.
        output_file_key: Key under which the CSV is stored.
    """
    try:
        tick = time.time()
        s3_object = s3.get_object(Bucket=source_bucket_name, Key=key)
        print(f"Time to fetch file {key}: {time.time() - tick} seconds")

        tick = time.time()
        text_lines = s3_object['Body'].read().decode('utf-8').splitlines()
        print(f"Time to read and decode file {key}: {time.time() - tick} seconds")

        tick = time.time()
        frame = transform_func(text_lines)
        print(f"Time to transform file {key}: {time.time() - tick} seconds")

        # Serialise in memory so the upload is a single request.
        sink = StringIO()
        tick = time.time()
        frame.to_csv(sink, index=False)
        print(f"Time to convert DataFrame to CSV for file {key}: {time.time() - tick} seconds")

        tick = time.time()
        s3.put_object(Bucket=target_bucket_name, Key=output_file_key, Body=sink.getvalue())
        print(f"Time to upload CSV for file {key}: {time.time() - tick} seconds")

    except Exception as err:
        print(f"Error processing file {key}: {err}")

# Transformation function for fixtures.json

def transform_fixtures(raw_data):
    """Flatten a fixtures JSON document into one DataFrame row per fixture.

    ``raw_data`` is the document split into lines; the lines are re-joined and
    parsed as a JSON list of fixture records. ``fixture`` and ``fixture.id``
    identify the row and are required. Every other field is read leniently, so
    a missing venue, status, team or goals entry yields ``None`` instead of
    aborting the whole file.

    ``fixture.date`` is parsed as an ISO-8601 string (a trailing ``Z`` is
    accepted) and split into ``date`` and ``time`` columns.

    Args:
        raw_data: List of text lines forming one JSON document.

    Returns:
        pandas.DataFrame: One row per fixture.

    Raises:
        json.JSONDecodeError: If the joined text is not valid JSON.
        KeyError: If a record has no ``fixture`` or the fixture has no ``id``.
    """
    def safe_get(d, keys, default=None):
        """Walk ``keys`` through nested dicts/lists; return ``default`` on any miss."""
        for key in keys:
            try:
                d = d[key]
            except (KeyError, IndexError, TypeError):
                return default
        return d

    def split_date_time(datetime_str):
        """Return (date, time) ISO strings, or (None, None) for a missing date."""
        if not datetime_str:
            return None, None
        # fromisoformat on older Pythons rejects a trailing 'Z'; normalise it.
        dt_obj = datetime.fromisoformat(datetime_str.replace('Z', '+00:00'))
        return dt_obj.date().isoformat(), dt_obj.time().isoformat()

    data = json.loads('\n'.join(raw_data))  # Combine lines and load JSON
    extracted_data = []

    for fixture_data in data:
        fixture = fixture_data['fixture']

        # Named so the module-level `time` import is not shadowed.
        match_date, kickoff_time = split_date_time(safe_get(fixture, ['date']))

        extracted_data.append({
            'fixture_id': fixture['id'],
            'referee': safe_get(fixture, ['referee']),
            'date': match_date,
            'time': kickoff_time,
            'timestamp': safe_get(fixture, ['timestamp']),
            'status_short': safe_get(fixture, ['status', 'short']),
            'venue_id': safe_get(fixture, ['venue', 'id']),
            'venue_name': safe_get(fixture, ['venue', 'name']),
            'venue_city': safe_get(fixture, ['venue', 'city']),
            'home_team_id': safe_get(fixture_data, ['teams', 'home', 'id']),
            'home_team_name': safe_get(fixture_data, ['teams', 'home', 'name']),
            'home_winner': safe_get(fixture_data, ['teams', 'home', 'winner']),
            'home_goals': safe_get(fixture_data, ['goals', 'home']),
            'away_team_id': safe_get(fixture_data, ['teams', 'away', 'id']),
            'away_team_name': safe_get(fixture_data, ['teams', 'away', 'name']),
            'away_winner': safe_get(fixture_data, ['teams', 'away', 'winner']),
            'away_goals': safe_get(fixture_data, ['goals', 'away']),
        })

    return pd.DataFrame(extracted_data)


# myDataFetching6: STATISTICS_FIXTURES_ALL.JSON into CSV

In [None]:
import json
import pandas as pd
import boto3
from io import StringIO
from datetime import datetime
import time

s3 = boto3.client('s3')

def lambda_handler(event, context):
    """Convert new ``statistics_fixtures_all.json`` objects into CSV files.

    Lists every object in the source bucket and, for each key ending in
    ``statistics_fixtures_all.json`` whose CSV counterpart is not already in
    the target bucket, calls ``process_file`` with
    ``transform_statistics_fixtures``.

    Args:
        event: Lambda invocation event (not read).
        context: Lambda context object (not read).

    Returns:
        dict: ``statusCode`` 200 and a JSON-encoded message in ``body``.
        ``process_file`` logs and swallows per-file errors, so the success
        message does not guarantee that every file was converted.
    """
    start_time = time.time()
    source_bucket_name = 'dwlprojectbucket'
    target_bucket_name = 'datalakepartition2'

    # Only process statistics_fixtures_all.json files
    pattern = 'statistics_fixtures_all.json'
    # Loop-invariant stem used to build each output key.
    output_stem = pattern.split(".")[0]

    # Get the list of existing keys in the target bucket
    existing_keys = get_existing_keys(target_bucket_name)
    print(f"Time to fetch existing keys: {time.time() - start_time} seconds")
    print(f"Existing keys in target bucket: {existing_keys}")

    # List all objects in the source bucket. A single list_objects_v2 call
    # returns at most 1000 keys, so keep following the continuation token;
    # otherwise files beyond the first page would never be processed.
    source_keys = []
    list_kwargs = {'Bucket': source_bucket_name}
    while True:
        response = s3.list_objects_v2(**list_kwargs)
        source_keys.extend(obj['Key'] for obj in response.get('Contents', []))
        if not response.get('IsTruncated'):
            break
        list_kwargs['ContinuationToken'] = response['NextContinuationToken']
    print(f"Time to list objects in source bucket: {time.time() - start_time} seconds")

    if not source_keys:
        print("No files found in source bucket.")
        return {
            'statusCode': 200,
            'body': json.dumps('No files found in source bucket.')
        }

    for key in source_keys:
        # Process only files that match the specified pattern
        if not key.endswith(pattern):
            continue
        # The output key reuses the part of the source key before the first ' - '.
        date_time = key.split(' - ')[0].strip()  # Adjust this split based on your actual filename pattern
        output_file_key = f'{date_time} - {output_stem}.csv'
        # Skip files whose CSV already exists in the target bucket
        if output_file_key not in existing_keys:
            print(f"Processing file: {key}")
            process_file(source_bucket_name, target_bucket_name, key, transform_statistics_fixtures, output_file_key)

    print(f"Total execution time: {time.time() - start_time} seconds")
    return {
        'statusCode': 200,
        'body': json.dumps('CSV files created successfully')
    }

def get_existing_keys(bucket_name):
    """Return the set of all object keys in ``bucket_name``.

    Calls ``s3.list_objects_v2`` repeatedly, passing the continuation token
    from the previous page, until a page reports it is not truncated.
    """
    found = set()
    token = None
    truncated = True
    while truncated:
        request = {'Bucket': bucket_name}
        if token:
            request['ContinuationToken'] = token
        page = s3.list_objects_v2(**request)
        # 'Contents' is absent from the response when the page has no objects.
        found.update(item['Key'] for item in page.get('Contents', []))
        truncated = bool(page.get('IsTruncated'))
        if truncated:
            token = page.get('NextContinuationToken')
    return found

def process_file(source_bucket_name, target_bucket_name, key, transform_func, output_file_key):
    """Download one source object, transform it, and upload the result as CSV.

    The object body is decoded as UTF-8 and split into lines, passed to
    ``transform_func`` (which must return an object with ``to_csv``), and the
    CSV text is written to ``output_file_key`` in the target bucket. The
    duration of each stage is printed.

    Any exception raised along the way is printed and swallowed; the function
    always returns ``None``.
    """
    try:
        # Stage 1: get the file from the source S3 bucket
        stage_began = time.time()
        s3_object = s3.get_object(Bucket=source_bucket_name, Key=key)
        print(f"Time to fetch file {key}: {time.time() - stage_began} seconds")

        # Stage 2: read the body and split it into text lines
        stage_began = time.time()
        body_bytes = s3_object['Body'].read()
        raw_lines = body_bytes.decode('utf-8').splitlines()
        print(f"Time to read and decode file {key}: {time.time() - stage_began} seconds")

        # Stage 3: apply the transformation function
        stage_began = time.time()
        frame = transform_func(raw_lines)
        print(f"Time to transform file {key}: {time.time() - stage_began} seconds")

        # Stage 4: serialise the DataFrame to CSV in memory
        out = StringIO()
        stage_began = time.time()
        frame.to_csv(out, index=False)
        print(f"Time to convert DataFrame to CSV for file {key}: {time.time() - stage_began} seconds")

        # Stage 5: upload the CSV to the target S3 bucket
        stage_began = time.time()
        s3.put_object(Bucket=target_bucket_name, Key=output_file_key, Body=out.getvalue())
        print(f"Time to upload CSV for file {key}: {time.time() - stage_began} seconds")

    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Error processing file {key}: {e}")

# Transformation function for statistics_fixtures_all.json

def transform_statistics_fixtures(raw_data):
    """Flatten per-team fixture statistics into a DataFrame.

    Each element of ``raw_data`` is parsed as a JSON list of JSON-encoded
    strings. Every decoded string must provide ``parameters.fixture`` and a
    ``response`` list of ``{'team': ..., 'statistics': [...]}`` entries; one
    row is produced per entry. Statistics absent from an entry become None.

    Lines that raise ``json.JSONDecodeError`` are reported and skipped; rows
    collected from that line before the error are kept. Other exceptions
    propagate to the caller.
    """
    # Statistic names looked up by 'type'; also the output column names, in order.
    stat_columns = (
        'Shots on Goal',
        'Shots off Goal',
        'Total Shots',
        'Blocked Shots',
        'Shots insidebox',
        'Shots outsidebox',
        'Fouls',
        'Corner Kicks',
        'Offsides',
        'Ball Possession',
        'Yellow Cards',
        'Red Cards',
        'Goalkeeper Saves',
        'Total passes',
        'Passes accurate',
        'Passes %',
        'expected_goals',
    )

    rows = []
    for raw_line in raw_data:
        try:
            for encoded_fixture in json.loads(raw_line):
                payload = json.loads(encoded_fixture)
                fixture_id = payload['parameters']['fixture']
                for entry in payload['response']:
                    team_info = entry['team']
                    stat_items = entry['statistics']

                    row = {
                        'fixtureID': fixture_id,
                        'team_Id': team_info['id'],
                        'team_name': team_info['name'],
                    }

                    # Index the statistics by type; a repeated type keeps the last value.
                    by_type = {}
                    for item in stat_items:
                        by_type[item['type']] = item['value']

                    row.update((name, by_type.get(name)) for name in stat_columns)
                    rows.append(row)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

    return pd.DataFrame(rows)


# 