# _**First step: installing the packages needed to work in the notebook and setting up the environment**_

In [1]:
%pip install azure-storage-blob

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Show which `python` executable is first on the shell PATH.
# NOTE(review): this is not necessarily the interpreter the kernel runs — compare with sys.executable if in doubt.
!which python

/anaconda/envs/azureml_py38/bin//python


In [3]:
!pip show python-dotenv

Name: python-dotenv
Version: 1.0.1
Summary: Read key-value pairs from a .env file and set them as environment variables
Home-page: https://github.com/theskumar/python-dotenv
Author: Saurabh Kumar
Author-email: me+github@saurabh-kumar.com
License: BSD-3-Clause
Location: /anaconda/envs/azureml_py38/lib/python3.9/site-packages
Requires: 
Required-by: 


In [4]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import numpy as np
import io 
from dotenv import load_dotenv
import os

# Read the storage connection settings from the environment (load_dotenv() also
# picks them up from a local .env file, so no credentials live in the notebook).
load_dotenv()

account_name = os.getenv("account_name")
account_key = os.getenv("account_key")
container_name = os.getenv("container_name")
blob_name = "amazon.csv"

# Fail early with a clear message: without these values the client would be built
# against "https://None.blob.core.windows.net" and the problem would only surface
# later as a misleading "Blob not found".
# account_key is deliberately not required here — presumably a missing key means
# anonymous access to a public container; TODO confirm.
missing_settings = [
    setting
    for setting, value in (("account_name", account_name), ("container_name", container_name))
    if not value
]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}"
    )

print(f"account name is: {account_name}")
print(f"container name is: {container_name}")
print(f"blob name: {blob_name}")

# Build the client chain: storage account -> container -> blob.
blob_service_client = BlobServiceClient(account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key)
container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(blob_name)

# exists() returns False only when the blob is absent; authentication or network
# failures propagate instead of being reported as "Blob not found".
if blob_client.exists():
    print("Blob exists!")
else:
    print(f"Blob not found: {blob_name}")

account name is: productpredict6186226399
container name is: azureml-blobstore-988bc6ed-626e-41f2-8020-27b01da8ac63
blob name: amazon.csv
Blob exists!


In [5]:
# Download the blob into memory and parse it as CSV.
# readall() replaces the deprecated content_as_text(); requesting UTF-8 decoding
# on the download keeps the payload a str, exactly as content_as_text() returned it.
download_stream = blob_client.download_blob(encoding="UTF-8")
df = pd.read_csv(io.StringIO(download_stream.readall()))

print(df.head())

   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1         ₹349                 43%  

# **Exploratory data analysis: we will explore the data and see what can be used and what features can be engineered.**

In [6]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

### The data contains almost no null values: only `rating_count` has 2 missing entries (1463 of 1465 non-null), and these need to be handled when that column is cleaned.

In [7]:
# Preview the first five values of each column, one Series at a time,
# in the same order as they appear in the raw data.
preview_columns = [
    'product_id',
    'product_name',
    'category',
    'discounted_price',
    'actual_price',
    'discount_percentage',
    'rating',
    'rating_count',
    'about_product',
    'user_id',
    'user_name',
    'review_id',
    'review_title',
    'review_content',
    'img_link',
    'product_link',
]
for column_name in preview_columns:
    print(df[column_name].head())

0    B07JW9H4J1
1    B098NS6PVG
2    B096MSW6CT
3    B08HDJ86NZ
4    B08CF3B7N1
Name: product_id, dtype: object
0    Wayona Nylon Braided USB to Lightning Fast Cha...
1    Ambrane Unbreakable 60W / 3A Fast Charging 1.5...
2    Sounce Fast Phone Charging Cable & Data Sync U...
3    boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...
4    Portronics Konnect L 1.2M Fast Charging 3A 8 P...
Name: product_name, dtype: object
0    Computers&Accessories|Accessories&Peripherals|...
1    Computers&Accessories|Accessories&Peripherals|...
2    Computers&Accessories|Accessories&Peripherals|...
3    Computers&Accessories|Accessories&Peripherals|...
4    Computers&Accessories|Accessories&Peripherals|...
Name: category, dtype: object
0    ₹399
1    ₹199
2    ₹199
3    ₹329
4    ₹154
Name: discounted_price, dtype: object
0    ₹1,099
1      ₹349
2    ₹1,899
3      ₹699
4      ₹399
Name: actual_price, dtype: object
0    64%
1    43%
2    90%
3    53%
4    61%
Name: discount_percentage, dtype: object
0    4

### It seems we need to drop many columns, but before proceeding
we need to do some analysis:

    1. Let's find how the columns correlate with each other.
    2. Check how they can be useful.
    3. Customer-related fields, review-related fields, and links are unnecessary for this project, so I will drop them.

In [8]:
# Remove the customer-, review- and link-related columns that the analysis does not use.
df = df.drop(
    columns=[
        'about_product',
        'user_id',
        'user_name',
        'review_id',
        'review_title',
        'review_content',
        'img_link',
        'product_link',
    ]
)

In [9]:
# List every distinct category path, in order of first appearance.
print(df.loc[:, 'category'].unique())

['Computers&Accessories|Accessories&Peripherals|Cables&Accessories|Cables|USBCables'
 'Computers&Accessories|NetworkingDevices|NetworkAdapters|WirelessUSBAdapters'
 'Electronics|HomeTheater,TV&Video|Accessories|Cables|HDMICables'
 'Electronics|HomeTheater,TV&Video|Televisions|SmartTelevisions'
 'Electronics|HomeTheater,TV&Video|Accessories|RemoteControls'
 'Electronics|HomeTheater,TV&Video|Televisions|StandardTelevisions'
 'Electronics|HomeTheater,TV&Video|Accessories|TVMounts,Stands&Turntables|TVWall&CeilingMounts'
 'Electronics|HomeTheater,TV&Video|Accessories|Cables|RCACables'
 'Electronics|HomeAudio|Accessories|SpeakerAccessories|Mounts'
 'Electronics|HomeTheater,TV&Video|Accessories|Cables|OpticalCables'
 'Electronics|HomeTheater,TV&Video|Projectors'
 'Electronics|HomeAudio|Accessories|Adapters'
 'Electronics|HomeTheater,TV&Video|SatelliteEquipment|SatelliteReceivers'
 'Computers&Accessories|Accessories&Peripherals|Cables&Accessories|Cables|DVICables'
 'Electronics|HomeTheater,TV&

In [10]:
# Count how many rows share each full product name (most frequent first).
print(df.loc[:, 'product_name'].value_counts())

product_name
Fire-Boltt Ninja Call Pro Plus 1.83" Smart Watch with Bluetooth Calling, AI Voice Assistance, 100 Sports Modes IP67 Rating, 240*280 Pixel High Resolution                           5
Fire-Boltt Phoenix Smart Watch with Bluetooth Calling 1.3",120+ Sports Modes, 240*240 PX High Res with SpO2, Heart Rate Monitoring & IP67 Rating                                    4
Wayona Nylon Braided USB to Lightning Fast Charging and Data Sync Cable Compatible for iPhone 13, 12,11, X, 8, 7, 6, 5, iPad Air, Pro, Mini (3 FT Pack of 1, Grey)                  3
MI Braided USB Type-C Cable for Charging Adapter (Red)                                                                                                                              3
Amazonbasics Nylon Braided Usb-C To Lightning Cable, Fast Charging Mfi Certified Smartphone, Iphone Charger (6-Foot, Dark Grey)                                                     3
                                                                             

#### Let's convert the prices into numeric format and then convert them to US dollars.

In [11]:
# Strip the thousands separator and the rupee sign, then convert to float.
# astype(str) makes the cell safe to re-run: once the column is already float,
# the .str accessor would otherwise raise AttributeError.
# regex=False keeps both replacements literal regardless of the pandas version.
df['actual_price'] = (
    df['actual_price']
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace("₹", "", regex=False)
    .astype(float)
)
print(df['actual_price'].head())

0    1099.0
1     349.0
2    1899.0
3     699.0
4     399.0
Name: actual_price, dtype: float64


In [12]:
# Strip the thousands separator and the rupee sign, then convert to float.
# astype(str) makes the cell safe to re-run: once the column is already float,
# the .str accessor would otherwise raise AttributeError.
# regex=False keeps both replacements literal regardless of the pandas version.
df['discounted_price'] = (
    df['discounted_price']
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace("₹", "", regex=False)
    .astype(float)
)
print(df['discounted_price'].head())

0    399.0
1    199.0
2    199.0
3    329.0
4    154.0
Name: discounted_price, dtype: float64


#### Note: these data-preparation cells modify `df` in place, so re-running a cell after its data has already been transformed can raise an error or apply the change twice. Restart the kernel and run the cells in order to get a clean result.

In [13]:
# Sanity-check both cleaned price columns (actual price first, then discounted).
for price_column in ('actual_price', 'discounted_price'):
    print(df[price_column].head())

0    1099.0
1     349.0
2    1899.0
3     699.0
4     399.0
Name: actual_price, dtype: float64
0    399.0
1    199.0
2    199.0
3    329.0
4    154.0
Name: discounted_price, dtype: float64


#### 1 US dollar is equivalent to 83.88 Indian rupees, which I will round to 84 to keep the computation simple.

In [14]:
# Convert both price columns from rupees to dollars using the rounded rate of 84 INR per USD.
# Note: running this cell more than once divides the prices again.
df['actual_price'] = df['actual_price'].div(84)
df['discounted_price'] = df['discounted_price'].div(84)

In [15]:
# Keep four decimal places on the converted prices, then show the result.
df['actual_price'] = df['actual_price'].round(decimals=4)
df['discounted_price'] = df['discounted_price'].round(decimals=4)
for price_column in ('actual_price', 'discounted_price'):
    print(df[price_column].head())

0    13.0833
1     4.1548
2    22.6071
3     8.3214
4     4.7500
Name: actual_price, dtype: float64
0    4.7500
1    2.3690
2    2.3690
3    3.9167
4    1.8333
Name: discounted_price, dtype: float64


### Now let's clean the product name so that only the brand (the first word) is kept and the rest of the text is dropped

In [16]:
# Keep only the text before the first space of each product name (treated as the brand).
df['product_name'] = df['product_name'].map(lambda full_name: full_name.partition(" ")[0])
print(df['product_name'].head())

0        Wayona
1       Ambrane
2        Sounce
3          boAt
4    Portronics
Name: product_name, dtype: object


## It seems that taking the first and last parts of the category variable is sufficient

In [17]:
# The category is a "|"-separated path; keep its first segment as the general
# category and its last segment as the detailed category.
category_top = df['category'].map(lambda path: path.partition("|")[0])
category_bottom = df['category'].map(lambda path: path.rpartition("|")[2])
df['category_general'] = category_top
df['category_detailed'] = category_bottom
print(df.head())

   product_id product_name                                           category  \
0  B07JW9H4J1       Wayona  Computers&Accessories|Accessories&Peripherals|...   
1  B098NS6PVG      Ambrane  Computers&Accessories|Accessories&Peripherals|...   
2  B096MSW6CT       Sounce  Computers&Accessories|Accessories&Peripherals|...   
3  B08HDJ86NZ         boAt  Computers&Accessories|Accessories&Peripherals|...   
4  B08CF3B7N1   Portronics  Computers&Accessories|Accessories&Peripherals|...   

   discounted_price  actual_price discount_percentage rating rating_count  \
0            4.7500       13.0833                 64%    4.2       24,269   
1            2.3690        4.1548                 43%    4.0       43,994   
2            2.3690       22.6071                 90%    3.9        7,928   
3            3.9167        8.3214                 53%    4.2       94,363   
4            1.8333        4.7500                 61%    4.2       16,905   

        category_general category_detailed  
0  Co

#### I think the product ID and category columns are no longer needed in our data

In [18]:
# Remove the product identifier and the raw category path columns, then preview the result.
df = df.drop(columns=['product_id', 'category'])
df.head()

Unnamed: 0,product_name,discounted_price,actual_price,discount_percentage,rating,rating_count,category_general,category_detailed
0,Wayona,4.75,13.0833,64%,4.2,24269,Computers&Accessories,USBCables
1,Ambrane,2.369,4.1548,43%,4.0,43994,Computers&Accessories,USBCables
2,Sounce,2.369,22.6071,90%,3.9,7928,Computers&Accessories,USBCables
3,boAt,3.9167,8.3214,53%,4.2,94363,Computers&Accessories,USBCables
4,Portronics,1.8333,4.75,61%,4.2,16905,Computers&Accessories,USBCables


#### Now for the final step of data preparation I will create new features with the help of rating and rating count columns.

In [20]:
# rating: parse to float; values that are not numbers become NaN and are then set to 0.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df['rating'] = df['rating'].fillna(0)

# rating_count: treat missing counts as "0", strip the thousands separator, convert to int.
# astype(str) makes the cell safe to re-run: once the column is already int,
# the .str accessor would otherwise raise AttributeError.
# regex=False keeps the replacement literal regardless of the pandas version.
df['rating_count'] = df['rating_count'].fillna("0")
df['rating_count'] = df['rating_count'].astype(str).str.replace(",", "", regex=False)
df['rating_count'] = pd.to_numeric(df['rating_count'], errors='coerce').fillna(0).astype(int)
print(df['rating'].head())
print(df['rating_count'].head())

0    4.2
1    4.0
2    3.9
3    4.2
4    4.2
Name: rating, dtype: float64
0    24269
1    43994
2     7928
3    94363
4    16905
Name: rating_count, dtype: int64


In [21]:
# Derived rating features; the "+ 1" in the denominators avoids division by zero
# for products with no ratings.
df['weighted_average_rating'] = df['rating'].mul(df['rating_count']).div(df['rating_count'].add(1))
df['rating_to_rev_ratio'] = df['rating'].div(df['rating_count'].add(1))
# log1p(x) = log(1 + x), so a count of 0 maps to 0.
df['log_rev_count'] = np.log1p(df['rating_count'])

for feature_column in ('weighted_average_rating', 'rating_to_rev_ratio', 'log_rev_count'):
    print(df[feature_column].head())

0    4.199827
1    3.999909
2    3.899508
3    4.199955
4    4.199752
Name: weighted_average_rating, dtype: float64
0    0.000173
1    0.000091
2    0.000492
3    0.000045
4    0.000248
Name: rating_to_rev_ratio, dtype: float64
0    10.096996
1    10.691831
2     8.978282
3    11.454915
4     9.735424
Name: log_rev_count, dtype: float64
