# **Web Scraping for Company Insights & Predicting Customer Buying Behaviours**

### **Import Libraries**

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### **Web Scraping**

In [5]:
base_url = "https://www.airlinequality.com/airline-reviews/air-india/"
pages = 10
page_size = 100

data = []
for i in range(1, pages + 1):

    print(f"Scraping page {i}")
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    response = requests.get(url)
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        data.append(para.get_text())

    print(f"   ---> {len(data)} total reviews")


Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews


In [6]:
df = pd.DataFrame()
df["data"] = data
df.head()

Unnamed: 0,data
0,✅ Trip Verified | Requested a gluten free mea...
1,Not Verified | Worst planes I have ever been i...
2,Not Verified | I have been flying for the past...
3,✅ Trip Verified | Sometime around 2017 I got t...
4,Not Verified | My flight was supposed to depa...


In [7]:
df.to_csv("data.csv")
df.head()

Unnamed: 0,data
0,✅ Trip Verified | Requested a gluten free mea...
1,Not Verified | Worst planes I have ever been i...
2,Not Verified | I have been flying for the past...
3,✅ Trip Verified | Sometime around 2017 I got t...
4,Not Verified | My flight was supposed to depa...


## **Data Preprocessing**

### **Data Cleaning**

**Remove sentences before '|' in data column**

In [8]:
df.data= df.data.str.split('|',expand=True)[1]

In [9]:
df

Unnamed: 0,data
0,Requested a gluten free meal and was given a...
1,Worst planes I have ever been in while taking...
2,I have been flying for the past 21 years and ...
3,Sometime around 2017 I got the patriotism bug...
4,My flight was supposed to depart New Delhi a...
...,...
995,
996,
997,
998,


**Remove all special characters**

In [10]:
# import request as re
import re

In [11]:
def clean(text):
    text = re.sub('[^A-Za-z]+]', ' ', str(text))
    return text

In [12]:
df['clean_data'] = df['data'].apply(clean)
df.head()

Unnamed: 0,data,clean_data
0,Requested a gluten free meal and was given a...,Requested a gluten free meal and was given a...
1,Worst planes I have ever been in while taking...,Worst planes I have ever been in while taking...
2,I have been flying for the past 21 years and ...,I have been flying for the past 21 years and ...
3,Sometime around 2017 I got the patriotism bug...,Sometime around 2017 I got the patriotism bug...
4,My flight was supposed to depart New Delhi a...,My flight was supposed to depart New Delhi a...


### **Tokenization**

#### **Import Libraries**

In [16]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

#### **POS Tagging**

In [23]:

pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

In [24]:
# converting each token into tuple
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
          newlist.append(tuple([word, pos_dict.get(tag[0])]))
    return newlist

In [25]:
df['POS tagged'] = df['clean_data'].apply(token_stop_pos)
df.head()

Unnamed: 0,data,clean_data,POS tagged,Lemma
0,Requested a gluten free meal and was given a...,Requested a gluten free meal and was given a...,"[(Requested, v), (gluten, a), (free, a), (meal...",Requested gluten free meal give regular meal...
1,Worst planes I have ever been in while taking...,Worst planes I have ever been in while taking...,"[(Worst, n), (planes, n), (ever, r), (taking, ...",Worst plane ever take international flight ....
2,I have been flying for the past 21 years and ...,I have been flying for the past 21 years and ...,"[(flying, v), (past, a), (21, None), (years, n...",fly past 21 year never see poorly condition ...
3,Sometime around 2017 I got the patriotism bug...,Sometime around 2017 I got the patriotism bug...,"[(Sometime, r), (around, None), (2017, None), ...",Sometime around 2017 get patriotism bug . av...
4,My flight was supposed to depart New Delhi a...,My flight was supposed to depart New Delhi a...,"[(flight, n), (supposed, v), (depart, v), (New...",flight suppose depart New Delhi 2300hrs . fi...


#### **Lemmatization (Lemma)**

In [26]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

In [27]:
# grouping the words
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
     if not pos:
        lemma = word
        lemma_rew = lemma_rew + " " + lemma
     else:
        lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
        lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

In [28]:
df['Lemma'] = df['POS tagged'].apply(lemmatize)
df.head()

Unnamed: 0,data,clean_data,POS tagged,Lemma
0,Requested a gluten free meal and was given a...,Requested a gluten free meal and was given a...,"[(Requested, v), (gluten, a), (free, a), (meal...",Requested gluten free meal give regular meal...
1,Worst planes I have ever been in while taking...,Worst planes I have ever been in while taking...,"[(Worst, n), (planes, n), (ever, r), (taking, ...",Worst plane ever take international flight ....
2,I have been flying for the past 21 years and ...,I have been flying for the past 21 years and ...,"[(flying, v), (past, a), (21, None), (years, n...",fly past 21 year never see poorly condition ...
3,Sometime around 2017 I got the patriotism bug...,Sometime around 2017 I got the patriotism bug...,"[(Sometime, r), (around, None), (2017, None), ...",Sometime around 2017 get patriotism bug . av...
4,My flight was supposed to depart New Delhi a...,My flight was supposed to depart New Delhi a...,"[(flight, n), (supposed, v), (depart, v), (New...",flight suppose depart New Delhi 2300hrs . fi...


In [29]:
df[['data','Lemma']]

Unnamed: 0,data,Lemma
0,Requested a gluten free meal and was given a...,Requested gluten free meal give regular meal...
1,Worst planes I have ever been in while taking...,Worst plane ever take international flight ....
2,I have been flying for the past 21 years and ...,fly past 21 year never see poorly condition ...
3,Sometime around 2017 I got the patriotism bug...,Sometime around 2017 get patriotism bug . av...
4,My flight was supposed to depart New Delhi a...,flight suppose depart New Delhi 2300hrs . fi...
...,...,...
995,,
996,,
997,,
998,,
