In [10]:

import pandas as pd
import numpy as np
import boto3
import os
import time
import random
from datetime import datetime

# Project layout: one root folder holding the cleaned CSV (under data/) and a
# screenshots/ folder. Paths are only built here, nothing is created on disk.
base_path = r"C:\Users\ACER\Desktop\Amazon_Project"
screenshots_path = os.path.join(base_path, 'screenshots')
data_path = os.path.join(base_path, 'data', 'amazon_cleaned.csv')

# Echo the resolved locations so a wrong root folder is obvious immediately.
print(f"‚úÖ Project path: {base_path}")
print(f"‚úÖ Data file: {data_path}")
print(f"‚úÖ Screenshots: {screenshots_path}")

# Read the cleaned dataset written on Day 1 and show its size plus the first
# two rows as a quick sanity check.
df = pd.read_csv(data_path)
print(f"\n‚úÖ Loaded cleaned data: {len(df):,} rows")
preview = df.head(2)
print(preview)

‚úÖ Project path: C:\Users\ACER\Desktop\Amazon_Project
‚úÖ Data file: C:\Users\ACER\Desktop\Amazon_Project\data\amazon_cleaned.csv
‚úÖ Screenshots: C:\Users\ACER\Desktop\Amazon_Project\screenshots

‚úÖ Loaded cleaned data: 1,465 rows
   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ‚Çπ399   
1  Computers&Accessories|Accessories&Peripherals|...             ‚Çπ199   

   actual_price  discount_percentage  rating  rating_count  \
0        1099.0                   64     4.2       24269.0   
1         349.0                   43     4.0       43994.0   

                                       about_product  \
0  High Compatibility : Compatible With iPhone 12...   
1  Compatible with all Type C enabled 

In [11]:

# Check AWS configuration
#
# Calls STS GetCallerIdentity (a request that only succeeds with working
# credentials) and prints the account, the caller ARN and the default region of
# a fresh boto3 session. On any failure it prints the underlying error followed
# by the `aws configure` instructions.

print("üîç Checking AWS Configuration...")
print("="*50)

try:
    # Test AWS connection
    sts = boto3.client('sts')
    identity = sts.get_caller_identity()
    print(f"‚úÖ AWS Connected Successfully!")
    print(f"   Account: {identity['Account']}")
    print(f"   User ARN: {identity['Arn']}")

    # Check region
    session = boto3.session.Session()
    region = session.region_name
    print(f"   Region: {region}")

except Exception as e:
    print("‚ùå AWS Not Configured!")
    # Surface the real cause: missing credentials, an expired token and a
    # network failure all land here and need different remedies.
    print(f"   Error: {type(e).__name__}: {e}")
    print("\nüîß Run this in terminal:")
    print("   aws configure")
    print("\n   Enter your AWS Access Key ID")
    print("   Enter your AWS Secret Access Key")
    print("   Default region: ap-south-1")
    print("   Output format: json")

üîç Checking AWS Configuration...
‚úÖ AWS Connected Successfully!
   Account: 218504945626
   User ARN: arn:aws:iam::218504945626:user/data_analyst
   Region: ap-south-1


In [12]:
# Reminder for the Day 2 notebook: this is the line that reloads the cleaned
# dataset from disk.
df = pd.read_csv(
    r"C:\Users\ACER\Desktop\Amazon_Project\data\amazon_cleaned.csv"
)

In [13]:


# CELL 1: Imports and Setup
import pandas as pd
import numpy as np
import boto3
import os
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Paths
base_path = r"C:\Users\ACER\Desktop\Amazon_Project"
data_path = os.path.join(base_path, 'data', 'amazon_cleaned.csv')
screenshots_path = os.path.join(base_path, 'screenshots')

# Create screenshots folder if not exists
os.makedirs(screenshots_path, exist_ok=True)

print("‚úÖ Setup complete")

# CELL 2: Load Cleaned Data
df = pd.read_csv(data_path)
print(f"‚úÖ Loaded {len(df):,} products")
print(f"Columns: {df.columns.tolist()}")

# CELL 3: AWS Check
# Building a client makes no request to AWS, so on its own it cannot tell
# whether credentials work. The STS GetCallerIdentity call is the actual check.
try:
    s3 = boto3.client('s3')
    boto3.client('sts').get_caller_identity()
    print("‚úÖ AWS configured")
except Exception as e:
    # `Exception` instead of a bare `except:` so Ctrl-C / kernel interrupts are
    # not swallowed, and the cause is shown instead of being discarded.
    print("‚ùå AWS not configured")
    print(f"   Error: {type(e).__name__}: {e}")



‚úÖ Setup complete
‚úÖ Loaded 1,465 products
Columns: ['product_id', 'product_name', 'category', 'discounted_price', 'actual_price', 'discount_percentage', 'rating', 'rating_count', 'about_product', 'user_id', 'user_name', 'review_id', 'review_title', 'review_content', 'img_link', 'product_link', 'original_price', 'estimated_cost', 'profit_per_unit', 'margin_percent', 'estimated_revenue', 'estimated_profit']
‚úÖ AWS configured


In [14]:

import boto3

# Ask STS which identity the current credentials resolve to and report the
# account id from the response.
sts = boto3.client(service_name='sts')
identity = sts.get_caller_identity()
print(f"‚úÖ Connected! Account: {identity['Account']}")

‚úÖ Connected! Account: 218504945626


In [15]:

# Create S3 bucket


import boto3
import random

# The client is pinned to the same region that is sent as LocationConstraint
# below. Leaving the client on the profile's default region can make S3 reject
# the request when the two disagree.
bucket_region = 'ap-south-1'
bucket_name_file = 'bucket_name.txt'

s3 = boto3.client('s3', region_name=bucket_region)

# Re-running this cell used to create one more randomly named bucket each time.
# If an earlier run recorded a bucket name and that bucket is still reachable,
# reuse it instead.
try:
    with open(bucket_name_file, 'r') as f:
        saved_name = f.read().strip()
except FileNotFoundError:
    saved_name = ""

bucket_name = None
if saved_name:
    try:
        # head_bucket raises when the bucket is missing or not accessible.
        s3.head_bucket(Bucket=saved_name)
        bucket_name = saved_name
        print(f"‚úÖ Reusing existing bucket: {bucket_name}")
    except Exception as e:
        print(f"   Saved bucket '{saved_name}' is not usable ({e}); creating a new one")

if bucket_name is None:
    # NOTE(review): bucket names are global across all AWS accounts and a
    # 4-digit suffix leaves only 9000 candidates, so a name clash is possible;
    # it surfaces through the except branch below.
    bucket_name = f"amazon-profit-{random.randint(1000,9999)}"

    try:
        # Create bucket (for ap-south-1 region)
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': bucket_region}
        )
        print(f"‚úÖ Bucket created: {bucket_name}")

        # Save bucket name for later use (only after a successful create, so
        # the file never points at a bucket that does not exist)
        with open(bucket_name_file, 'w') as f:
            f.write(bucket_name)

    except Exception as e:
        print(f"‚ùå Error creating bucket: {e}")

‚úÖ Bucket created: amazon-profit-6663


In [16]:

# Upload cleaned data to S3

import os

# Recover the bucket name that was persisted to bucket_name.txt
with open('bucket_name.txt', 'r') as f:
    bucket_name = f.read().strip()

# Upload the cleaned dataset first, then the sample, each under its own file
# name at the bucket root; afterwards list the bucket to confirm what landed.
# The first failure skips everything that follows and is reported once.
try:
    for file_name in ('amazon_cleaned.csv', 'amazon_sample.csv'):
        local_file = os.path.join(base_path, 'data', file_name)
        s3.upload_file(local_file, bucket_name, file_name)
        print(f"‚úÖ Uploaded: {file_name} to s3://{bucket_name}/")

    # Verify upload
    listing = s3.list_objects_v2(Bucket=bucket_name)
    print(f"\nüìÇ Files in bucket:")
    for entry in listing.get('Contents', []):
        print(f"   üìÑ {entry['Key']} - {entry['Size'] / 1024:.1f} KB")

except Exception as e:
    print(f"‚ùå Upload failed: {e}")

‚úÖ Uploaded: amazon_cleaned.csv to s3://amazon-profit-6663/
‚úÖ Uploaded: amazon_sample.csv to s3://amazon-profit-6663/

üìÇ Files in bucket:
   üìÑ amazon_cleaned.csv - 4699.5 KB
   üìÑ amazon_sample.csv - 327.3 KB


In [9]:
# =============================================
# CELL 6: Create Athena table (FIXED)
# =============================================

import time

# Athena client with region specified
ATHENA_REGION = 'ap-south-1'
athena = boto3.client('athena', region_name=ATHENA_REGION)

# Get bucket name
with open('bucket_name.txt', 'r') as f:
    bucket_name = f.read().strip()

# Where Athena writes query results, and the dedicated prefix the table reads.
# The table must NOT point at the bucket root: Athena scans every object under
# LOCATION, which at the root would include amazon_sample.csv and Athena's own
# result files under athena-results/.
results_location = f's3://{bucket_name}/athena-results/'
source_key = 'amazon_cleaned.csv'
table_key = f'tables/products/{source_key}'
table_location = f's3://{bucket_name}/tables/products/'


def run_athena_query(client, query, output_location, database=None,
                     poll_seconds=1.0, timeout_seconds=120.0):
    """Run an Athena query and block until it reaches a final state.

    start_query_execution only queues the query, so success has to be read
    back from get_query_execution.

    Args:
        client: boto3 Athena client.
        query: SQL/DDL text to execute.
        output_location: s3:// URI for Athena's result files.
        database: optional database used as the query execution context.
        poll_seconds: pause between status checks.
        timeout_seconds: give up after this long without a final state.

    Returns:
        The QueryExecutionId of the succeeded query.

    Raises:
        RuntimeError: the query ended as FAILED or CANCELLED.
        TimeoutError: no final state within timeout_seconds.
    """
    request = {
        'QueryString': query,
        'ResultConfiguration': {'OutputLocation': output_location},
    }
    if database is not None:
        request['QueryExecutionContext'] = {'Database': database}
    execution_id = client.start_query_execution(**request)['QueryExecutionId']

    deadline = time.monotonic() + timeout_seconds
    while True:
        status = client.get_query_execution(
            QueryExecutionId=execution_id
        )['QueryExecution']['Status']
        state = status['State']
        if state == 'SUCCEEDED':
            return execution_id
        if state in ('FAILED', 'CANCELLED'):
            reason = status.get('StateChangeReason', 'no reason given')
            raise RuntimeError(f"Athena query {execution_id} {state}: {reason}")
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Athena query {execution_id} still {state} after {timeout_seconds}s"
            )
        time.sleep(poll_seconds)


# Create database
create_db_query = "CREATE DATABASE IF NOT EXISTS amazon_analysis"

try:
    run_athena_query(athena, create_db_query, results_location)
    print("‚úÖ Database created/verified")
except Exception as e:
    print(f"‚ùå Error creating database: {e}")

# Dropping an EXTERNAL table removes only its definition, never the S3 data.
# It is needed so that a table left behind by an earlier run with the old
# column list is replaced rather than kept by IF NOT EXISTS.
drop_table_query = "DROP TABLE IF EXISTS amazon_analysis.products"

# Create table
# Columns are matched to CSV fields by position, so all 22 columns of
# amazon_cleaned.csv are declared in file order.
# OpenCSVSerde is used because the free-text columns contain commas inside
# quoted fields, which LazySimpleSerDe would split.
# rating_count is DOUBLE because the CSV stores it with a decimal point
# (e.g. 24269.0); discounted_price stays STRING because it carries a currency
# symbol.
# NOTE(review): OpenCSVSerde does not support line breaks inside a field or
# empty values in numeric columns - confirm the cleaned CSV has neither.
create_table_query = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS amazon_analysis.products (
    product_id STRING,
    product_name STRING,
    category STRING,
    discounted_price STRING,
    actual_price DOUBLE,
    discount_percentage DOUBLE,
    rating DOUBLE,
    rating_count DOUBLE,
    about_product STRING,
    user_id STRING,
    user_name STRING,
    review_id STRING,
    review_title STRING,
    review_content STRING,
    img_link STRING,
    product_link STRING,
    original_price DOUBLE,
    estimated_cost DOUBLE,
    profit_per_unit DOUBLE,
    margin_percent DOUBLE,
    estimated_revenue DOUBLE,
    estimated_profit DOUBLE
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ('separatorChar' = ',', 'quoteChar' = '"')
LOCATION '{table_location}'
TBLPROPERTIES ('skip.header.line.count' = '1')
"""

try:
    # Place a copy of the cleaned CSV alone under the table prefix.
    boto3.client('s3', region_name=ATHENA_REGION).copy_object(
        Bucket=bucket_name,
        Key=table_key,
        CopySource={'Bucket': bucket_name, 'Key': source_key},
    )

    run_athena_query(athena, drop_table_query, results_location,
                     database='amazon_analysis')
    query_id = run_athena_query(athena, create_table_query, results_location,
                                database='amazon_analysis')
    print("‚úÖ Table created successfully!")

    # Get query execution ID
    print(f"   Query ID: {query_id}")

except Exception as e:
    print(f"‚ùå Error creating table: {e}")

‚úÖ Database created/verified
‚úÖ Table created successfully!
   Query ID: 42cb07f9-a092-4558-a936-d7fb34cfb805
