# Text Classification and Analysis On Extracted Data from website articles.

# Importing Data

In [1]:
import pandas as pd

input_data = pd.read_excel('/kaggle/input/website-urls-for-web-scraping-and-data-analysis/blackcoffer websites link dataset.xlsx')
print(input_data)

     URL_ID                                                URL
0        37  https://insights.blackcoffer.com/ai-in-healthc...
1        38  https://insights.blackcoffer.com/what-if-the-c...
2        39  https://insights.blackcoffer.com/what-jobs-wil...
3        40  https://insights.blackcoffer.com/will-machine-...
4        41  https://insights.blackcoffer.com/will-ai-repla...
..      ...                                                ...
109     146  https://insights.blackcoffer.com/blockchain-fo...
110     147  https://insights.blackcoffer.com/the-future-of...
111     148  https://insights.blackcoffer.com/big-data-anal...
112     149  https://insights.blackcoffer.com/business-anal...
113     150  https://insights.blackcoffer.com/challenges-an...

[114 rows x 2 columns]


# Web Scraping

Creating a function to extract article title and text content using url of article

In [2]:
import requests
from bs4 import BeautifulSoup

def extract_data_in_txt(url, url_id):

  # Making a GET request for the given article link
  r = requests.get(url)

  # Parsing the HTML
  soup = BeautifulSoup(r.content, 'html.parser')
  
  # Finding the title and all div elements to get article contents (text only)
  title = soup.find('title')
  div = soup.find_all('div',  class_ = 'td-post-content tagdiv-type') 

  # Creating a text file with particular url's id as file name
  filename = str(url_id)
  file =  open(filename + ".txt", "w")  #Opening file in write mode

  
  file.write(title.text.strip())  # Removing extra spaces from the title and convertion in lower case before writing in file
  file.write(".")

  for i in div:
    lines = i.find_all('p') #finding all 'p: paragraph' type attributes of article

    # extracting text data line by line and writing into a file
    for line in lines:
      file.write(line.text.strip())  # Removing extra spaces from the text before writing in file
      file.write(".")

  file.close() # Close the file after writing all the text content of an article

#example
extract_data_in_txt(input_data.URL[0], input_data.URL_ID[0])
filename = str(input_data.URL_ID[0])
file =  open(filename + ".txt", "r") 
print(file.read())

AI in healthcare to Improve Patient Outcomes - Blackcoffer Insights.


# Creating text files for each article 
Each article's title and content is stored in a text file as url_id its filename.

In [3]:
def create_files():
  for i in range(len(input_data.URL)):
    extract_data_in_txt(input_data.URL[i], input_data.URL_ID[i])

create_files()

# Sentiment Analysis

# Creating a list of stop words

In [4]:
import re

#filenames for all files containing stop words
path = "/kaggle/input/website-urls-for-web-scraping-and-data-analysis/StopWords_"

auditor = path + "Auditor.txt"
currency = path + "Currencies.txt"
numbers = path + "DatesandNumbers.txt"
generic = path + "Generic.txt"
generic_long = path + "GenericLong.txt"
geographic = path + "Geographic.txt"
names = path + "Names.txt"

filenames = [auditor, currency, numbers, generic, generic_long, geographic, names]

# Create a list to contain all the stop words
stop_words = []

for fname in filenames:
    df = pd.read_csv(fname, sep = "|", encoding='latin-1', header = None)
    word_list = [str(i).lower().strip() for i in df[0]]
    stop_words.extend(word_list)
    
# printing all the stop words
print(stop_words)

['ernst', 'young', 'deloitte', 'touche', 'kpmg', 'pricewaterhousecoopers', 'pricewaterhouse', 'coopers', 'afghani', 'ariary', 'baht', 'balboa', 'birr', 'bolivar', 'boliviano', 'cedi', 'colon', 'córdoba', 'dalasi', 'denar', 'dinar', 'dirham', 'dobra', 'dong', 'dram', 'escudo', 'euro', 'florin', 'forint', 'gourde', 'guarani', 'gulden', 'hryvnia', 'kina', 'kip', 'konvertibilna marka', 'koruna', 'krona', 'krone', 'kroon', 'kuna', 'kwacha', 'kwanza', 'kyat', 'lari', 'lats', 'lek', 'lempira', 'leone', 'leu', 'lev', 'lilangeni', 'lira', 'litas', 'loti', 'manat', 'metical', 'naira', 'nakfa', 'new lira', 'new sheqel', 'ngultrum', 'nuevo sol', 'ouguiya', 'pataca', 'peso', 'pound', 'pula', 'quetzal', 'rand', 'real', 'renminbi', 'rial', 'riel', 'ringgit', 'riyal', 'ruble', 'rufiyaa', 'rupee', 'rupee', 'rupiah', 'shilling', 'som', 'somoni', 'special drawing rights', 'taka', 'tala', 'tenge', 'tugrik', 'vatu', 'won', 'yen', 'zloty', 'hundred', 'thousand', 'million', 'billion', 'trillion', 'date', 'an

# Creating a dictionary of positive and negative words

In [5]:
path = "/kaggle/input/website-urls-for-web-scraping-and-data-analysis/"

positive = path + "positive-words.txt"
negative = path + "negative-words.txt"

# Create lists for positive and negative words
pos_words = []
neg_words = []

df = pd.read_csv(positive, sep = " ", encoding='latin-1', header = None)
pos_words = [str(i).lower().strip() for i in df[0] if(str(i).lower().strip() not in stop_words)]
    

df = pd.read_csv(negative, sep = " ", encoding='latin-1', header = None)
neg_words = [str(i).lower().strip() for i in df[0] if(str(i).lower().strip() not in stop_words)]

dictionary = {'positive': pos_words,
                      'negative': neg_words}

# printing all the positive and negative words
print(dictionary)



# Data Cleaning

In [6]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
[0m

# Expanding contractions

In [7]:
import contractions

#Expanding contractions such as I'll to I will etc.
def expand_contractions(s):
    return contractions.fix(s)

# Lemmatizing sentence: converting words in their lemma(root) form

In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')
 
# Create a Doc object
def lemmatize_sentence(sentence):
    doc = nlp(sentence)
    # Create list of tokens from given string
    tokens = []
    for token in doc:
        tokens.append(token)
 
    lemmatized_sentence = " ".join([token.lemma_ for token in doc])
    return lemmatized_sentence 



In [9]:
def clean_data(clean_sentence):
  
  clean_sentence = expand_contractions(clean_sentence)

  clean_sentence = re.sub(r'[^\w\s]', '', clean_sentence)  #removing punctuations like "/", ";" "[", "]" "=", "#" etc.

  clean_sentence = lemmatize_sentence(clean_sentence)

  return clean_sentence

# Extracting Derived variables

We convert the text into a list of tokens using the nltk tokenize module and use these tokens to calculate the 4 variables described below:

1. **Positive Score**: This score is calculated by assigning the value of +1 for each word if found in the Positive Dictionary and then adding up all the values.
2. **Negative Score**: This score is calculated by assigning the value of -1 for each word if found in the Negative Dictionary and then adding up all the values. We multiply the score with -1 so that the score is a positive number.
3. **Polarity Score**: This is the score that determines if a given text is positive or negative in nature. It is calculated by using the formula: 
Polarity Score = (Positive Score – Negative Score)/ ((Positive Score + Negative Score) + 0.000001)
Range is from -1 to +1
4. **Subjectivity Score**: This is the score that determines if a given text is objective or subjective. It is calculated by using the formula:Subjectivity Score = (Positive Score + Negative Score)/ ((Total Words after cleaning) + 0.000001). Range is from 0 to +1 


In [10]:
from nltk import word_tokenize

def extracting_derived_scores(clean_sentence):

  clean_sentence = clean_sentence.lower() 

  clean_sentence = clean_data(clean_sentence)

  clean_sentence = clean_sentence.split(" ")  #tokenization: splitting sentence into words

  clean_lst = []  #removing stop words like if, but, or etc. and removing characters of length 1

  for word in clean_sentence:
    if (word not in stop_words):
      clean_lst.append(word)

  positivity_score = 0
  negativity_score = 0

  for word in clean_lst:
    if(word in pos_words):
      positivity_score += 1
    if(word in neg_words):
      negativity_score += 1
    else:
      continue

  polarity_score = (positivity_score - negativity_score) / ((positivity_score + negativity_score) + 0.000001)
  subjectivity_score = (positivity_score + negativity_score) / ((len(clean_lst)) + 0.000001)

  scores = [positivity_score, negativity_score, polarity_score, subjectivity_score]

  return scores


In [11]:
#Creating a list filenames, containing filename of every article

path = "/kaggle/working/"

filenames = [path+str(i)+".txt" for i in input_data.URL_ID]

In [12]:
#storing these 4 scores in seperate lists
#each list contains that particular score for all the text files.

positivity_score = [] 
negativity_score = [] 
polarity_score = [] 
subjectivity_score = []

for fname in filenames:
  file = open(fname, 'r')
  text = file.read()
  score = extracting_derived_scores(text)
  positivity_score.append(score[0])
  negativity_score.append(score[1])
  polarity_score.append(score[2])
  subjectivity_score.append(score[3])


# Average Sentence Length

Sum of sentence lengths / Total no. of sentences

In [13]:
avg_sentence_len = []  #list to score avg sentence length of each document

for fname in filenames:

  file = open(fname, 'r')
  text = file.read() #extracting all the file content in a string

  sentence_list = text.split(".") #split function will return a list containting sentences seperated by full-stop
  
  sentence_lengths = [len(i) for i in sentence_list]

  sentence_count = len(sentence_list)

  avg_sentence_len.append(sum(sentence_lengths) / sentence_count)


# Average No. of Words per Sentence

Sum of no. of words in a sentence / Total no. of sentences

In [14]:
avg_words_per_sentence = [] #list to score avg no. of words per sentence for each document

for fname in filenames:

  words_in_sentence_count = []

  file = open(fname, 'r')
  text = file.read() #extracting all the file content in a string

  sentence_list = text.split(".") #split function will return a list containting sentences seperated by full-stop
  
  for i in sentence_list:
    word_count = 0
    for word in i.split(" "):
      word_count += 1
    words_in_sentence_count.append(word_count)
  
  sentence_count = len(sentence_list)

  avg_words_per_sentence.append(sum(words_in_sentence_count) / sentence_count)


# Average Word Length

Sum of lengths of words / total words in a document

In [15]:
avg_words_length = [] #list to score avg word length of each document

for fname in filenames:
  
  word_lengths = []

  file = open(fname, 'r')
  text = file.read()

  text = re.sub(r'\.', '', text)
  
  text = text.split(" ")

  for i in text:
    word_lengths.append(len(i))
    
  avg_words_length.append(sum(word_lengths) / len(text))

# Word Count after Data Cleaning

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords
import re

word_count = []

for fname in filenames:

  file = open(fname, 'r')
  text = file.read()
    
  text = re.sub(r'[^\w\s]', '', text)  #removing punctuations like "/", ";" "[", "]" "=", "#" etc.       
  text = text.split()  #tokenization: splitting sentence into words
    
  stop_words = set(stopwords.words('english'))
  clean_lst = []  #removing stop words like if, but, or etc. and removing characters of length 1
  for word in text:
      if (word not in stop_words):
          if(len(word) != 1):
              clean_lst.append(word)
              
  word_count.append(len(clean_lst))

# Personal Pronouns

In [18]:
personal_pronouns = re.compile(r'\b(I|we|ours|my|mine|(?-i:us))\b', re.I)
pronoun_count = [] #list to contain the no. of personal pronouns in each document

for fname in filenames:

  file = open(fname, 'r')
  text = file.read()
    
  pronouns = personal_pronouns.findall(text) 

  pronoun_count.append(len(pronouns))
 

# Average Syllable Count per Word

Sum of syllable count in each word / total no. of words

In [19]:
avg_syllable_count = []

for fname in filenames:

  file = open(fname, 'r')
  text = file.read()
  
  sentence_list = text.split(".") #split function will return a list containting sentences seperated by full-stop
  vowels = 'aeiou'

  text = re.sub(r'\.', '', text)
  
  text = text.split(" ")

  syl_count = 0  
  
  for word in text:
    syllables = re.findall(f'(?!e$)(?!es$)(?!ed$)[{vowels}]', word, re.I)
    syl_count += len(syllables)

  avg_syllable_count.append(syl_count / len(text))
  
 

# Complex Word Count: Word containing two or more syllables

In [20]:
complex_word_count = []
word_count = []

for fname in filenames:

  file = open(fname, 'r')
  text = file.read()
  
  sentence_list = text.split(".") #split function will return a list containting sentences seperated by full-stop

  text = re.sub(r'\.', '', text)
  
  words_list = text.split(" ")
  word_count.append(len(words_list))

  c = 0
  for word in words_list:
    l = re.findall('(?!e$)[aeiou]+', word, re.I)+re.findall('^[aeiouy]*e$', word, re.I)
    if len(l) > 2:
      c += 1
  complex_word_count.append(c)


# Percentage of Complex words:
 (the number of complex words / the number of words) *100

In [21]:
complex_words_percent = []
for i in range(len(complex_word_count)):
  complex_words_percent.append((complex_word_count[i]*100)/word_count[i])


# Fog Index:
0.4 * (Average Sentence Length + Percentage of Complex words)

In [22]:
fog_index = []

for i in range(len(avg_sentence_len)):
  fog_index.append((complex_words_percent[i] + avg_sentence_len[i])*0.4)


# Creating Output DataFrame

In [23]:
output_data = input_data.copy()

output_data['Positive Score'] = pd.Series(positivity_score)
output_data['Negative Score'] = pd.Series(negativity_score)
output_data['Polarity Score'] = pd.Series(polarity_score)
output_data['Subjectivity Score'] = pd.Series(subjectivity_score)

output_data['Avg Sentence Length'] = pd.Series(avg_sentence_len)
output_data['Avg No of Words per Sentence'] = pd.Series(avg_words_per_sentence)
output_data['Avg Word Length'] = pd.Series(avg_words_length)

output_data['Personal Pronoun Count'] = pd.Series(pronoun_count)
output_data['Word Count'] = pd.Series(word_count)
output_data['Average Syllable Count'] = pd.Series(avg_syllable_count)
output_data['Complex Word Count'] = pd.Series(complex_word_count)

output_data['Percentage of Complex Words'] = pd.Series(complex_words_percent)

output_data['Fog Index'] = pd.Series(fog_index)

print(output_data)
  


     URL_ID                                                URL  \
0        37  https://insights.blackcoffer.com/ai-in-healthc...   
1        38  https://insights.blackcoffer.com/what-if-the-c...   
2        39  https://insights.blackcoffer.com/what-jobs-wil...   
3        40  https://insights.blackcoffer.com/will-machine-...   
4        41  https://insights.blackcoffer.com/will-ai-repla...   
..      ...                                                ...   
109     146  https://insights.blackcoffer.com/blockchain-fo...   
110     147  https://insights.blackcoffer.com/the-future-of...   
111     148  https://insights.blackcoffer.com/big-data-anal...   
112     149  https://insights.blackcoffer.com/business-anal...   
113     150  https://insights.blackcoffer.com/challenges-an...   

     Positive Score  Negative Score  Polarity Score  Subjectivity Score  \
0                 2               0        1.000000            0.250000   
1                70              38        0.296296      

# Output data frame to excel sheet

In [24]:
output_data.to_excel('Output Data Structure.xlsx', index = False)

In [25]:
output_data.head(10)

Unnamed: 0,URL_ID,URL,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Avg Sentence Length,Avg No of Words per Sentence,Avg Word Length,Personal Pronoun Count,Word Count,Average Syllable Count,Complex Word Count,Percentage of Complex Words,Fog Index
0,37,https://insights.blackcoffer.com/ai-in-healthc...,2,0,1.0,0.25,33.5,5.5,5.8,0,10,2.0,2,20.0,21.4
1,38,https://insights.blackcoffer.com/what-if-the-c...,70,38,0.296296,0.201117,96.402299,17.057471,5.0,7,1398,1.645923,240,17.167382,45.427872
2,39,https://insights.blackcoffer.com/what-jobs-wil...,67,40,0.252336,0.135615,97.857143,15.910714,5.559545,3,1671,1.878516,405,24.236984,48.837651
3,40,https://insights.blackcoffer.com/will-machine-...,54,27,0.333333,0.13799,82.79646,14.911504,4.948506,17,1573,1.678322,241,15.321043,39.247001
4,41,https://insights.blackcoffer.com/will-ai-repla...,56,25,0.382716,0.111264,94.243243,16.054054,5.257177,16,1672,1.719498,321,19.198565,45.376723
5,42,https://insights.blackcoffer.com/man-and-machi...,44,26,0.257143,0.133588,106.366197,18.295775,5.145647,21,1229,1.698129,210,17.087063,49.381304
6,43,https://insights.blackcoffer.com/in-future-or-...,27,10,0.459459,0.115265,83.907407,14.537037,5.191257,7,732,1.684426,157,21.448087,42.142198
7,44,https://insights.blackcoffer.com/how-neural-ne...,0,0,0.0,0.0,18.5,3.5,5.333333,0,6,1.5,1,16.666667,14.066667
8,45,https://insights.blackcoffer.com/how-machine-l...,31,14,0.377778,0.15625,94.704545,16.909091,4.945792,2,701,1.599144,103,14.693295,43.759136
9,46,https://insights.blackcoffer.com/deep-learning...,69,32,0.366337,0.111849,113.570175,19.684211,5.076021,11,2131,1.755514,401,18.817457,52.955053
