# _**First step: installing the required packages and setting up the notebook environment**_

In [1]:
# Install the Azure Blob Storage SDK; `%pip` installs into the environment of the running kernel.
# NOTE(review): pandas and python-dotenv are imported later but not installed here — presumably preinstalled; confirm.
%pip install azure-storage-blob

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Show which Python executable the shell resolves on its PATH.
# NOTE(review): `!` runs in a subshell, so this may differ from the kernel's interpreter — confirm if it matters.
!which python

/anaconda/envs/azureml_py38/bin//python


In [3]:
!pip show python-dotenv

Name: python-dotenv
Version: 1.0.1
Summary: Read key-value pairs from a .env file and set them as environment variables
Home-page: https://github.com/theskumar/python-dotenv
Author: Saurabh Kumar
Author-email: me+github@saurabh-kumar.com
License: BSD-3-Clause
Location: /anaconda/envs/azureml_py38/lib/python3.9/site-packages
Requires: 
Required-by: 


In [4]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io 
from dotenv import load_dotenv
import os

# Read storage settings from a .env file / the process environment so that
# no secrets are stored in the notebook itself.
load_dotenv()

account_name = os.getenv("account_name")
account_key = os.getenv("account_key")
container_name = os.getenv("container_name")
blob_name = "amazon.csv"

# Fail fast with a clear message: os.getenv returns None for unset variables,
# which would otherwise build the URL "https://None.blob.core.windows.net".
# NOTE(review): account_key is left optional here (credential=None is accepted
# by the SDK for anonymous access) — make it required if that is never intended.
missing_settings = [
    setting
    for setting, value in (("account_name", account_name), ("container_name", container_name))
    if not value
]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}. "
        "Define them in the .env file or in the process environment."
    )

# Build the client chain: storage account -> container -> blob.
blob_service_client = BlobServiceClient(account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key)
print(blob_service_client)
container_client = blob_service_client.get_container_client(container_name)
print(container_client)
blob_client = container_client.get_blob_client(blob_name)
print(blob_client.blob_name)

# exists() returns False only when the blob is really absent; authentication
# or network failures now surface as their own exceptions instead of being
# reported as "Blob not found".
if blob_client.exists():
    print("Blob exists!")
else:
    print(f"Blob not found: {blob_name!r} in container {container_name!r}")

<azure.storage.blob._blob_service_client.BlobServiceClient object at 0x7f2304d0f4f0>
<azure.storage.blob._container_client.ContainerClient object at 0x7f22e66ed810>
amazon.csv
Blob exists!


In [5]:
# Download the blob fully into memory and parse it as CSV.
# readall() replaces the deprecated content_as_text(); pandas decodes the
# raw bytes as UTF-8 by default, matching the previous text decoding.
download_stream = blob_client.download_blob()
df = pd.read_csv(io.BytesIO(download_stream.readall()))

print(df.head())

   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1         ₹349                 43%  

# **Exploratory Data Analysis: we will explore the data to see what can be used and which features can be engineered.**

In [6]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

### The data contains almost no null values: only `rating_count` has 2 missing entries (1463 of 1465 non-null), and these need to be handled before that column is used.

In [7]:
# Print the first rows of each column of interest, one Series at a time.
for column in ("category", "discounted_price", "actual_price", "discount_percentage", "rating"):
    print(df[column].head())

0    Computers&Accessories|Accessories&Peripherals|...
1    Computers&Accessories|Accessories&Peripherals|...
2    Computers&Accessories|Accessories&Peripherals|...
3    Computers&Accessories|Accessories&Peripherals|...
4    Computers&Accessories|Accessories&Peripherals|...
Name: category, dtype: object
0    ₹399
1    ₹199
2    ₹199
3    ₹329
4    ₹154
Name: discounted_price, dtype: object
0    ₹1,099
1      ₹349
2    ₹1,899
3      ₹699
4      ₹399
Name: actual_price, dtype: object
0    64%
1    43%
2    90%
3    53%
4    61%
Name: discount_percentage, dtype: object
0    4.2
1    4.0
2    3.9
3    4.2
4    4.2
Name: rating, dtype: object


### We need to analyze the product category names and, if possible, shorten them.