#### Data intake and libraries setup

In [1]:
#import libraries

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import re
import string
from time import sleep

In [2]:
#import input dataset
input_data = pd.read_csv("Input.xlsx - Sheet1.csv")


#import positive and negative words
pos_words = pd.read_csv('positive-words.txt', header=None, names=['positive_words'])
neg_words = pd.read_csv('negative-words.txt', header=None, names=['negative_words'])


#convert postive and negative dataframe into list
positive_words = list(pos_words['positive_words'])
negative_words = list(neg_words['negative_words'])


#import stopwrods
auditor = pd.read_csv('StopWords_Auditor.txt',header=None, names=['words'])
currencies = pd.read_csv('StopWords_Currencies.txt',header=None, names=['words'])
dates_numbers = pd.read_csv('StopWords_DatesandNumbers.txt',header=None, names=['words'])
generic = pd.read_csv('StopWords_Generic.txt',header=None, names=['words'])
genericlong = pd.read_csv('StopWords_GenericLong.txt',header=None, names=['words'])
geog = pd.read_csv('StopWords_Geographic.txt',header=None, names=['words'])
names = pd.read_csv('StopWords_Names.txt',header=None, names=['words'])


#combine all stopwrds in one dataset
df_stopwords = pd.concat([auditor,currencies,dates_numbers,generic,genericlong,geog,names], axis=0)

#create a list of stopwords from dataframe
stopwords = list(df_stopwords['words'].str.lower())

In [3]:
#check the dimension of the dataset

input_data.shape

(114, 2)

In [4]:
#take a look at first 5 rows from data

input_data.head()

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


#### Data Extraction

In [5]:
#create an empty list to store all data
scraped_data = []

#go through each links save data in another dataset

for link in input_data['URL']:
    url = link
    req = requests.get(url)
    soup = bs(req.content, 'html.parser')
    
    #extract, clean and store title
    try:
        title = soup.find('h1').get_text()
    except AttributeError:
        pass
    
    #extract, clean and store paragraphs into one single text
    try:
        paragraph = ''
        for i in soup.find_all('div', class_="td-post-content tagdiv-type"):
            paragraph = paragraph + i.get_text()
        paragraph = paragraph.replace('\n','')
    except AttributeError:
        pass
    
       
    #store data in a list arranged in the form of dictionary and convert to dataframe later
    scraped_data.append({
                            'link':link,
                            'title':title,
                            'paragraph':paragraph
                })


In [6]:
#covnvert scraped data list into pandas dataframe

output_data = pd.DataFrame(scraped_data)
output_data.head()

Unnamed: 0,link,title,paragraph
0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"Telemedicine, the use of technology to diagnos..."
1,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,"The rise of e-health, or the use of electronic..."
2,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,
3,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"“More gains on quality, affordability and acce..."
4,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"“More gains on quality, affordability and acce..."


In [7]:
#check the shape of output dataset

output_data.shape

(114, 3)

#### Data Cleaning

In [8]:
#convert the text in paragraph into lower case

output_data['paragraph_lower'] = output_data['paragraph'].str.lower()

In [9]:
#check the result

output_data['paragraph_lower']

0      telemedicine, the use of technology to diagnos...
1      the rise of e-health, or the use of electronic...
2                                                       
3      “more gains on quality, affordability and acce...
4      “more gains on quality, affordability and acce...
                             ...                        
109    before jumping on the topic i would like to gi...
110    as the coronavirus spreads around the world an...
111    from alibaba to ping an and google to ford, co...
112    whenthe british ruled india, many indiansaccep...
113    the business of business is no longer to do ju...
Name: paragraph_lower, Length: 114, dtype: object

In [10]:
#remove punctuations and store in a different field

output_data['paragraph_cleaned'] = output_data['paragraph_lower'].str.translate(str.maketrans('','',string.punctuation))

In [11]:
#check the result

output_data['paragraph_cleaned']

0      telemedicine the use of technology to diagnose...
1      the rise of ehealth or the use of electronic m...
2                                                       
3      “more gains on quality affordability and acces...
4      “more gains on quality affordability and acces...
                             ...                        
109    before jumping on the topic i would like to gi...
110    as the coronavirus spreads around the world an...
111    from alibaba to ping an and google to ford com...
112    whenthe british ruled india many indiansaccept...
113    the business of business is no longer to do ju...
Name: paragraph_cleaned, Length: 114, dtype: object

In [12]:
#in above output there is a '“' symbol apperaing in row 3 and 4, and maybe in more rows down the line. We need to clean that...

output_data['paragraph_cleaned'] = output_data['paragraph_cleaned'].str.replace('“','').str.replace('”',' ').str.replace(',','')

In [13]:
#check the result --- looks good

output_data['paragraph_cleaned']

0      telemedicine the use of technology to diagnose...
1      the rise of ehealth or the use of electronic m...
2                                                       
3      more gains on quality affordability and access...
4      more gains on quality affordability and access...
                             ...                        
109    before jumping on the topic i would like to gi...
110    as the coronavirus spreads around the world an...
111    from alibaba to ping an and google to ford com...
112    whenthe british ruled india many indiansaccept...
113    the business of business is no longer to do ju...
Name: paragraph_cleaned, Length: 114, dtype: object

In [14]:
#create a field to store tokenized paragrapgh
output_data['paragraph_tokenized'] = np.nan

#tokenize the paragraph
for para in range(len(output_data)):
    output_data['paragraph_tokenized'][para] = word_tokenize(output_data['paragraph_cleaned'][para],"english")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['paragraph_tokenized'][para] = word_tokenize(output_data['paragraph_cleaned'][para],"english")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [15]:
#create a field which will store words from tokenized column after removing stopwords
output_data['paragraph_no_stopwords'] = ''


#tokenize the paragraph
for para in range(len(output_data)):
    
    #creaing a empty list to store all words after removing stopwrods
    no_stopwords = []
    for word in output_data['paragraph_tokenized'][para]:
        if word not in stopwords:
            no_stopwords.append(word)
    
    #store list in a separate field
    output_data['paragraph_no_stopwords'][para] = no_stopwords       
            

In [16]:
#check the output dataset after all the cleaning

output_data.tail()

Unnamed: 0,link,title,paragraph,paragraph_lower,paragraph_cleaned,paragraph_tokenized,paragraph_no_stopwords
109,https://insights.blackcoffer.com/coronavirus-i...,Coronavirus: Impact on the Hospitality Industry,Before jumping on the topic I would like to gi...,before jumping on the topic i would like to gi...,before jumping on the topic i would like to gi...,"[before, jumping, on, the, topic, i, would, li...","[jumping, topic, give, overview, coronavirus, ..."
110,https://insights.blackcoffer.com/coronavirus-i...,Coronavirus impact on energy markets,As the coronavirus spreads around the world an...,as the coronavirus spreads around the world an...,as the coronavirus spreads around the world an...,"[as, the, coronavirus, spreads, around, the, w...","[coronavirus, spreads, world, countries, imple..."
111,https://insights.blackcoffer.com/what-are-the-...,What are the key policies that will mitigate t...,"From Alibaba to Ping An and Google to Ford, co...","from alibaba to ping an and google to ford, co...",from alibaba to ping an and google to ford com...,"[from, alibaba, to, ping, an, and, google, to,...","[alibaba, google, companies, globe, telling, s..."
112,https://insights.blackcoffer.com/marketing-dri...,Marketing Drives Results With A Focus On Problems,"Whenthe British ruled India, many Indiansaccep...","whenthe british ruled india, many indiansaccep...",whenthe british ruled india many indiansaccept...,"[whenthe, british, ruled, india, many, indians...","[whenthe, ruled, indiansaccepted, work, policy..."
113,https://insights.blackcoffer.com/continued-dem...,Continued Demand for Sustainability,The business of business is no longer to do ju...,the business of business is no longer to do ju...,the business of business is no longer to do ju...,"[the, business, of, business, is, no, longer, ...","[business, business, longer, businessor, incre..."


#### Data Analysis

1. Positive Score

In [17]:
#first we will create postive score column
output_data['Positive_Score'] = np.nan

#calculate values for postive score
for row in range(len(output_data)):
    
    #create a varibale with 0 value to store value of positive words count
    score = 0
    
    #iterate over all words to find number of postive words
    for word in output_data['paragraph_no_stopwords'][row]:
        if word in positive_words:
            score += 1
    
    #store the value of score in following field
    output_data['Positive_Score'][row] = score
    
            
            
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['Positive_Score'][row] = score


2. negative Score

In [18]:
#create postive score column
output_data['Negative_Score'] = np.nan

#calculate values for postive score
for row in range(len(output_data)):
    
    #create a varibale with 0 value to store value of positive words count
    score = 0
    
    #iterate over all words to find number of postive words
    for word in output_data['paragraph_no_stopwords'][row]:
        if word in negative_words:
            score += 1
    
    #store the value of score in following field
    output_data['Negative_Score'][row] = score
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['Negative_Score'][row] = score


3. Polarity Score

In [19]:
#calculate polarity score by the formula provided

output_data['Polarity_Score'] = (output_data['Positive_Score'] - output_data['Negative_Score']) / ((output_data['Positive_Score'] + output_data['Negative_Score']) + 0.000001)

4. Subjectivity Score

In [20]:
#calculate subjectivity score by the formula provided

output_data['Subjectivity_Score'] = np.nan

for row in range(len(output_data)):
    
    output_data['Subjectivity_Score'][row] = (output_data['Positive_Score'][row] + output_data['Negative_Score'][row]) / (len(output_data['paragraph_no_stopwords'][row]) + 0.000001)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['Subjectivity_Score'][row] = (output_data['Positive_Score'][row] + output_data['Negative_Score'][row]) / (len(output_data['paragraph_no_stopwords'][row]) + 0.000001)


5. Average Sentence Length

In [21]:
#calculationg avarage length by given formula

output_data['Average_Sentence_Length'] = np.nan

for row in range(len(output_data)):
    
    output_data['Average_Sentence_Length'][row] = len(output_data['paragraph_no_stopwords'][row]) / len(output_data['paragraph_lower'][row].split('.'))
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['Average_Sentence_Length'][row] = len(output_data['paragraph_no_stopwords'][row]) / len(output_data['paragraph_lower'][row].split('.'))


6. Complex Words Count

In [22]:
#create a column for storing count of complex words
output_data['Complex_Words_Count'] = np.nan


#count number of complex words for each paragraph
for para in range(len(output_data)):
    
    mylist = output_data['paragraph_no_stopwords'][para]

    count = 0

    for myword in mylist:
        d = {}.fromkeys('aeiou',0)
        haslotsvowels = False
        for x in myword.lower():
            if x in d:
                d[x] += 1
        for q in d.values():
            if q > 2:
                haslotsvowels = True
        if haslotsvowels:
            count += 1

    output_data['Complex_Words_Count'][para] = count

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['Complex_Words_Count'][para] = count


7. Number of Words

In [23]:
#create a coulm to store number of words
output_data['Word_Count'] = np.nan


#count number of number of words for each paragraph
for para in range(len(output_data)):
    output_data['Word_Count'][para] = len(output_data['paragraph_no_stopwords'][para])
    


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['Word_Count'][para] = len(output_data['paragraph_no_stopwords'][para])


8. Percentage of Complex Words

In [24]:
#percecntage of complex words is calculated by following formula

output_data['Percentage_of_Complex_Words'] = output_data['Complex_Words_Count'] / output_data['Word_Count']

9. Fog Index

In [25]:
#fog index is calculated by following formula

output_data['Fog_Index'] = 0.4 * (output_data['Percentage_of_Complex_Words'] + output_data['Average_Sentence_Length'])

10. Syllable Count Per Word

In [26]:
#create a column for storing count of syllable counts per word
output_data['Syllable_Count_per_Word'] = np.nan


#count number of syllable for each paragraph
for para in range(len(output_data)):
    
    mylist = output_data['paragraph_no_stopwords'][para]

    count = 0

    for myword in mylist:
        for char in myword:
            if char in ['a','e','i','o','u']:
                count += 1
                
    try:
        output_data['Syllable_Count_per_Word'][para] = count / len(output_data['paragraph_no_stopwords'][para])
    except ZeroDivisionError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['Syllable_Count_per_Word'][para] = count / len(output_data['paragraph_no_stopwords'][para])


11. Personal Pronouns

In [27]:
#we will first define all the pronouns using regex
pronounRegex = re.compile(r'\b(i|we|my|you|your|his|her|he|she|it|they|them|him|their|ours|(?-i:us))\b',re.I)

#creata a column to store count of all personal pronouns
output_data['Personal_Pronouns'] = np.nan

#search for all pronouns in each paragraph
for para in range(len(output_data)):
    
    pronouns = pronounRegex.findall(output_data['paragraph_cleaned'][para])
    output_data['Personal_Pronouns'][para] = len(pronouns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['Personal_Pronouns'][para] = len(pronouns)


12. Average Word Length

In [28]:
#create a column to store average word length
output_data['Average_Word_Length'] = np.nan

#average word length is calculated by following given formula
for para in range(len(output_data)):
    
    try:
        output_data['Average_Word_Length'][para] = len(output_data['paragraph_cleaned'][para].replace(' ','')) / len(output_data['paragraph_tokenized'][para])
    except ZeroDivisionError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['Average_Word_Length'][para] = len(output_data['paragraph_cleaned'][para].replace(' ','')) / len(output_data['paragraph_tokenized'][para])


In [29]:
#### Lets check the final output before preparing the final dataset for submission
output_data.head()

Unnamed: 0,link,title,paragraph,paragraph_lower,paragraph_cleaned,paragraph_tokenized,paragraph_no_stopwords,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score,Average_Sentence_Length,Complex_Words_Count,Word_Count,Percentage_of_Complex_Words,Fog_Index,Syllable_Count_per_Word,Personal_Pronouns,Average_Word_Length
0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"Telemedicine, the use of technology to diagnos...","telemedicine, the use of technology to diagnos...",telemedicine the use of technology to diagnose...,"[telemedicine, the, use, of, technology, to, d...","[telemedicine, technology, diagnose, patients,...",74.0,21.0,0.557895,0.116137,9.402299,118.0,818.0,0.144254,3.818621,3.262836,44.0,5.677223
1,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,"The rise of e-health, or the use of electronic...","the rise of e-health, or the use of electronic...",the rise of ehealth or the use of electronic m...,"[the, rise, of, ehealth, or, the, use, of, ele...","[rise, ehealth, electronic, facilitate, health...",38.0,13.0,0.490196,0.180212,11.32,25.0,283.0,0.088339,4.563336,3.229682,17.0,5.50165
2,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,,,,[],[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,
3,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"“More gains on quality, affordability and acce...","“more gains on quality, affordability and acce...",more gains on quality affordability and access...,"[more, gains, on, quality, affordability, and,...","[gains, quality, affordability, accessibility,...",34.0,26.0,0.133333,0.09539,10.661017,61.0,629.0,0.096979,4.303199,2.877583,24.0,5.558101
4,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"“More gains on quality, affordability and acce...","“more gains on quality, affordability and acce...",more gains on quality affordability and access...,"[more, gains, on, quality, affordability, and,...","[gains, quality, affordability, accessibility,...",34.0,26.0,0.133333,0.09539,10.661017,61.0,629.0,0.096979,4.303199,2.877583,24.0,5.558101


#### Preparation of output dataset

In [30]:
#concat input and ouput dataset in order to get URL_ID and URL in output dataset

output_data = pd.concat([input_data,output_data], axis=1)

In [31]:
#keep only required columns and store in output dataset

Output_Data_Structure = output_data[['URL_ID', 'URL', 'title', 'paragraph',
                                     'Positive_Score','Negative_Score', 'Polarity_Score', 'Subjectivity_Score',
                                     'Average_Sentence_Length', 'Percentage_of_Complex_Words', 'Fog_Index',
                                     'Complex_Words_Count', 'Word_Count', 'Syllable_Count_per_Word',
                                     'Personal_Pronouns', 'Average_Word_Length']]

In [32]:
#export the file in excel sheet

Output_Data_Structure.to_excel('Output_Data_Structure.xlsx')

### Done!!