# PROCESSING OF DATA

In [50]:
#importing libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Make a request to the website
url = 'https://books.toscrape.com/'
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Create an empty dataframe
columns = ['title', 'price']
df = pd.DataFrame(columns=columns)

# Extract the book information
books = soup.find_all('article', class_='product_pod')
for book in books:
    title = book.h3.a['title']
    price = book.select('.price_color')[0].get_text()
    df = df.append({'title': title, 'price': price}, ignore_index=True)

# Print the dataframe
print(df)


                                                title    price
0                                A Light in the Attic  Â£51.77
1                                  Tipping the Velvet  Â£53.74
2                                          Soumission  Â£50.10
3                                       Sharp Objects  Â£47.82
4               Sapiens: A Brief History of Humankind  Â£54.23
5                                     The Requiem Red  Â£22.65
6   The Dirty Little Secrets of Getting Your Dream...  Â£33.34
7   The Coming Woman: A Novel Based on the Life of...  Â£17.93
8   The Boys in the Boat: Nine Americans and Their...  Â£22.60
9                                     The Black Maria  Â£52.15
10     Starving Hearts (Triangular Trade Trilogy, #1)  Â£13.99
11                              Shakespeare's Sonnets  Â£20.66
12                                        Set Me Free  Â£17.46
13  Scott Pilgrim's Precious Little Life (Scott Pi...  Â£52.29
14                          Rip it Up and Start Again  

# CHECKING DATAFRAME

In [51]:
df.head()

Unnamed: 0,title,price
0,A Light in the Attic,Â£51.77
1,Tipping the Velvet,Â£53.74
2,Soumission,Â£50.10
3,Sharp Objects,Â£47.82
4,Sapiens: A Brief History of Humankind,Â£54.23


In [52]:
df.tail()

Unnamed: 0,title,price
15,Our Band Could Be Your Life: Scenes from the A...,Â£57.25
16,Olio,Â£23.88
17,Mesaerion: The Best Science Fiction Stories 18...,Â£37.59
18,Libertarianism for Beginners,Â£51.33
19,It's Only the Himalayas,Â£45.17


In [53]:
df.dtypes

title    object
price    object
dtype: object

In [54]:
#checking null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20 non-null     object
 1   price   20 non-null     object
dtypes: object(2)
memory usage: 448.0+ bytes


# CLEANING DATA

I need a new dataframe without the symbols.Then I can convert the price datatype object in to an integer using the int() function and create a new DataFrame with the cleaned price.

In [55]:
# Create an empty dataframe
columns = ['title', 'price']
df = pd.DataFrame(columns=columns)

for book in books:
    title = book.h3.a['title']
    price = book.select('.price_color')[0].get_text()
    price = price.replace("Â£", "")
    price = int(float(price))
    df = df.append({'title': title, 'price': price}, ignore_index=True)

# Create new DataFrame
df2 = pd.DataFrame(df, columns=['title', 'price'])


In [56]:
df2

Unnamed: 0,title,price
0,A Light in the Attic,51
1,Tipping the Velvet,53
2,Soumission,50
3,Sharp Objects,47
4,Sapiens: A Brief History of Humankind,54
5,The Requiem Red,22
6,The Dirty Little Secrets of Getting Your Dream...,33
7,The Coming Woman: A Novel Based on the Life of...,17
8,The Boys in the Boat: Nine Americans and Their...,22
9,The Black Maria,52


In [57]:
df2.dtypes

title    object
price    object
dtype: object

The price dtype is still object,it dosen't changed because the issue is that the .append() method is returning a new DataFrame with the same data type as the original, which is object.

So here,I am using the .astype() method to convert the data type of the 'price' column from 'object' to 'int':

In [58]:
df2['price'] = df2['price'].astype(int)

In [59]:
df2.dtypes

title    object
price     int64
dtype: object

In [60]:
df2.info

<bound method DataFrame.info of                                                 title  price
0                                A Light in the Attic     51
1                                  Tipping the Velvet     53
2                                          Soumission     50
3                                       Sharp Objects     47
4               Sapiens: A Brief History of Humankind     54
5                                     The Requiem Red     22
6   The Dirty Little Secrets of Getting Your Dream...     33
7   The Coming Woman: A Novel Based on the Life of...     17
8   The Boys in the Boat: Nine Americans and Their...     22
9                                     The Black Maria     52
10     Starving Hearts (Triangular Trade Trilogy, #1)     13
11                              Shakespeare's Sonnets     20
12                                        Set Me Free     17
13  Scott Pilgrim's Precious Little Life (Scott Pi...     52
14                          Rip it Up and Start Again

# SAVING THE FILE

In [61]:
df2.to_csv('books.csv', index=False)
