# **Web Scraping for Company Insights & Predicting Customer Buying Behaviours**

### **Import Libraries**

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### **Web Scraping**

In [None]:
base_url = "https://www.airlinequality.com/airline-reviews/air-india/"
pages = 10
page_size = 100

data = []
for i in range(1, pages + 1):

    print(f"Scraping page {i}")
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    response = requests.get(url)
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        data.append(para.get_text())

    print(f"   ---> {len(data)} total reviews")


Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews


In [None]:
df = pd.DataFrame()
df["data"] = data
df.head()

Unnamed: 0,data
0,✅ Trip Verified | Requested a gluten free mea...
1,Not Verified | Worst planes I have ever been i...
2,Not Verified | I have been flying for the past...
3,✅ Trip Verified | Sometime around 2017 I got t...
4,Not Verified | My flight was supposed to depa...


In [None]:
df.to_csv("data.csv")
df.head()

Unnamed: 0,data
0,✅ Trip Verified | Requested a gluten free mea...
1,Not Verified | Worst planes I have ever been i...
2,Not Verified | I have been flying for the past...
3,✅ Trip Verified | Sometime around 2017 I got t...
4,Not Verified | My flight was supposed to depa...


## **Data Preprocessing**

### **Data Cleaning**

**Remove sentences before '|' in data column**

In [None]:
df.data= df.data.str.split('|',expand=True)[1]

In [None]:
df

Unnamed: 0,data
0,Requested a gluten free meal and was given a...
1,Worst planes I have ever been in while taking...
2,I have been flying for the past 21 years and ...
3,Sometime around 2017 I got the patriotism bug...
4,My flight was supposed to depart New Delhi a...
...,...
995,
996,
997,
998,


**Remove all special characters**

In [None]:
# import request as re
import re

In [None]:
def clean(text):
    text = re.sub('[^A-Za-z]+]', ' ', str(text))
    return text

In [None]:
df['clean_data'] = df['data'].apply(clean)
df.head()

Unnamed: 0,data,clean_data
0,Requested a gluten free meal and was given a...,Requested a gluten free meal and was given a...
1,Worst planes I have ever been in while taking...,Worst planes I have ever been in while taking...
2,I have been flying for the past 21 years and ...,I have been flying for the past 21 years and ...
3,Sometime around 2017 I got the patriotism bug...,Sometime around 2017 I got the patriotism bug...
4,My flight was supposed to depart New Delhi a...,My flight was supposed to depart New Delhi a...


### **Tokenization**

#### **Import Libraries**

In [1]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
