## Required packages and libraries

In [None]:
!pip install beautifulsoup4
!pip install requests
!pip install urllib

In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from tqdm import tqdm
import json
import re
import pandas as pd

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Text Extraction

Run `!python text_extraction.py -h` to see the script's help.

In [None]:
!python /content/drive/MyDrive/Text_EA/Code/text_extraction.py -p /content/drive/MyDrive/Text_EA/Input.xlsx -d /content/drive/MyDrive/Text_EA/extracted_data

Extracting...:   6% 7/114 [00:07<01:51,  1.04s/it]
Maybe the site with url_id: 44.0 and url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ is empty or broken
Extracting...:  18% 20/114 [00:20<01:35,  1.01s/it]
Maybe the site with url_id: 57.0 and url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ is empty or broken
Extracting...:  94% 107/114 [01:47<00:06,  1.01it/s]
Maybe the site with url_id: 144.0 and url: https://insights.blackcoffer.com/ensuring-growth-through-insurance-technology/ is empty or broken
Extracting...: 100% 114/114 [01:54<00:00,  1.00s/it]

Extraction done from all the given links and the text files have been saved at /content/drive/MyDrive/Text_EA/extracted_data


### Getting the url_ids of the non-working sites and deleting their rows from the Excel file

In [None]:
# Load the ids of the URLs that were extracted successfully. The context
# manager closes the file even if json.load raises (the original never
# closed it).
with open('/content/drive/MyDrive/Text_EA/working_url_id.json', 'r') as file1:
  working_urls = json.load(file1)

# Every id in the hard-coded range 37..150 (inclusive) that is missing from
# the working list is treated as a site that could not be extracted.
not_working_url_id = [url_id for url_id in range(37, 151)
                      if url_id not in working_urls]

In [None]:
# Load the output template sheet; its URL_ID column becomes the index in the next cell.
df = pd.read_excel('/content/drive/MyDrive/Text_EA/Output Data Structure.xlsx')

In [None]:
# Index the sheet by URL_ID so rows can be dropped by id in the next cell.
# NOTE(review): in-place and not idempotent - re-running this cell raises
# KeyError because URL_ID is then the index and no longer a column.
df.set_index(['URL_ID'], inplace = True)

In [None]:
# Remove the rows whose URL_ID belongs to a site that could not be extracted,
# then show the remaining sheet.
df.drop(index=not_working_url_id, inplace=True)
display(df)

In [None]:
# Save the reduced sheet under a new file name; the original template file is left untouched.
df.to_excel('/content/drive/MyDrive/Text_EA/new Output Data Structure.xlsx')

## Text Analysis

### Removing stopwords

In [None]:
# Switch the working directory so the next cell can call stopwords_remover.py by its bare name.
# NOTE(review): this folder is under bc_assessment/, whereas every other path
# in this notebook is under Text_EA/ (text_extraction.py is run from
# Text_EA/Code) - confirm this is the intended location.
os.chdir('/content/drive/MyDrive/bc_assessment/Code')

In [None]:
!python stopwords_remover.py -s /content/drive/MyDrive/Text_EA/stopwords -e /content/drive/MyDrive/Text_EA/extracted_data -d /content/drive/MyDrive/Text_EA/filtered_data

----Storing all stopwords file in a single file----

Loading:   0% 0/9 [00:00<?, ?it/s]Loading: 100% 9/9 [00:00<00:00, 362.21it/s]

----Filtering the extracted files----

Loading: 100% 111/111 [00:08<00:00, 13.11it/s]

----Text files are filtered and stored at /content/drive/MyDrive/Text_EA/filtered_data----


### Creating positive and negative words dictionary

In [None]:
def add_values_in_dict(word_dict, key, list_of_words):
    ''' Append one entry to the list stored under ``key``.

        The list is created first when ``key`` is missing. ``list_of_words``
        is appended as a single element, so a list argument is stored nested
        rather than flattened. The same dictionary object is mutated and
        returned. '''
    word_dict.setdefault(key, list()).append(list_of_words)
    return word_dict

In [None]:
# Create the dictionary that collects the sentiment words found in the
# filtered texts: one list for positive words and one for negative words.

word_dict = {'Positive':[], 'Negative':[]}

In [None]:
# path: folder that holds positive-words.txt and negative-words.txt.
# path_filtered: folder that holds the stop-word-filtered text files.
path = '/content/drive/MyDrive/Text_EA/MasterDictionary'
path_filtered = '/content/drive/MyDrive/Text_EA/filtered_data'

In [None]:
# Read the master word lists. Every file in the MasterDictionary folder is
# opened (latin-1), but only the two sentiment files are read; each becomes a
# list of whitespace-separated words.

positive = []
negative = []
for filename in os.listdir(path):
  with open(os.path.join(path, filename), 'r', encoding='latin-1') as word_file:
    if filename == 'positive-words.txt':
      positive = word_file.read().split()
    elif filename == 'negative-words.txt':
      negative = word_file.read().split()

In [None]:
# adding positive and negative words to the dictionary

# Build the lookup sets once: membership tests on a set are O(1), whereas the
# original scanned the full word lists for every token. Results are unchanged.
positive_lookup = set(positive)
negative_lookup = set(negative)

for filename in tqdm(os.listdir(path_filtered)):
  f = os.path.join(path_filtered, filename)
  if not os.path.isfile(f):
    continue  # skip anything that is not a regular file
  with open(f, 'r') as file1:
    words = file1.read().split()
  # Every occurrence is recorded (duplicates included). A word present in
  # both lists is filed as positive, exactly as before.
  for r in words:
    if r in positive_lookup:
      word_dict = add_values_in_dict(word_dict, 'Positive', r)
    elif r in negative_lookup:
      word_dict = add_values_in_dict(word_dict, 'Negative', r)

100%|██████████| 111/111 [00:07<00:00, 13.91it/s]


In [None]:
# Persist the collected words as JSON. The context manager closes the file
# even if serialization fails (the original leaked the handle in that case).
with open('/content/drive/MyDrive/Text_EA/dictionary/dictionary.json', 'w') as file1:
  json.dump(word_dict, file1)

### Word and Sentence tokenization

In [None]:
# path_filtered: input folder with the stop-word-filtered texts.
# path_dest: parent folder; the tokenization loop writes into its
# word_tokens/ and sent_tokens/ sub-folders.
path_filtered = '/content/drive/MyDrive/Text_EA/filtered_data'
path_dest = '/content/drive/MyDrive/Text_EA/tokens'

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Tokenize every filtered text into words and into sentences, and store each
# token list as JSON: word_tokens/wt_<name>.json and sent_tokens/st_<name>.json.
for filename in tqdm(os.listdir(path_filtered)):
  with open(os.path.join(path_filtered, filename), 'r') as source:
    text = source.read()
  base = os.path.splitext(filename)[0]

  # Both token lists are computed before any output file is opened.
  outputs = (
      (path_dest + '/word_tokens/wt_' + base + '.json', word_tokenize(text)),
      (path_dest + '/sent_tokens/st_' + base + '.json', sent_tokenize(text)),
  )
  for destination, tokens in outputs:
    with open(destination, 'w') as a:
      json.dump(tokens, a)

100%|██████████| 111/111 [01:05<00:00,  1.69it/s]


### Extracting derived variables:
**Positive Score**: This score is calculated by assigning the value of +1 for each word if found in the Positive Dictionary and then adding up all the values.


**Negative Score**: This score is calculated by assigning the value of -1 for each word if found in the Negative Dictionary and then adding up all the values. We multiply the score with -1 so that the score is a positive number.


**Polarity Score**: This is the score that determines if a given text is positive or negative in nature. It is calculated by using the formula:

Polarity Score = (Positive Score – Negative Score)/ ((Positive Score + Negative Score) + 0.000001)
Range is from -1 to +1


**Subjectivity Score**: This is the score that determines if a given text is objective or subjective. It is calculated by using the formula:

Subjectivity Score = (Positive Score + Negative Score)/ ((Total Words after cleaning) + 0.000001)
Range is from 0 to +1


In [None]:
# JSON file with the 'Positive' / 'Negative' word lists written by the dictionary step above.
path_dict = '/content/drive/MyDrive/Text_EA/dictionary/dictionary.json'

In [None]:
# Load the positive/negative word dictionary. json.load parses the file
# content into a dict; the context manager closes the file even when parsing
# fails.

with open(path_dict, 'r') as dictionary:
  word_dict = json.load(dictionary)

In [None]:
# Folder with the per-article word-token JSON files (wt_<name>.json).
path_wt = '/content/drive/MyDrive/Text_EA/tokens/word_tokens'

In [None]:
p_score = []
n_score = []
polarity_score = []
subjectivity_score = []
url_ids = []

# Sets give O(1) membership tests. The stored lists hold one entry per
# occurrence, so scanning them for every token is slow; the counts are
# unchanged.
positive_words = set(word_dict['Positive'])
negative_words = set(word_dict['Negative'])

for filename in tqdm(os.listdir(path_wt)):
  f = os.path.join(path_wt, filename)
  # Context manager so the handle is closed (the original never closed it).
  with open(f, 'r') as file1:
    words = json.load(file1)
  no_of_words = len(words)

  # p / n: number of tokens found in the positive / negative word sets.
  p = 0
  n = 0
  for word in words:
    if word in positive_words:
      p += 1
    elif word in negative_words:
      n += 1

  # re.findall returns a list, so url_ids is a list of lists that a later cell
  # flattens. The raw string avoids the invalid-escape-sequence warning.
  url_ids.append(re.findall(r"\d+\.\d+", filename))
  p_score.append(p)
  n_score.append(n)
  # The small constant keeps both denominators non-zero.
  polarity_score.append((p - n)/((p + n) + 0.000001))
  subjectivity_score.append((p + n)/((no_of_words) + 0.000001))

100%|██████████| 111/111 [00:06<00:00, 16.35it/s]


In [None]:
def flat_list(list_2d):
  '''Flatten one level of nesting.

  Returns a new list holding the items of every sub-list of list_2d, in order.
  '''
  return [item for sublist in list_2d for item in sublist]

# Flatten the per-file id lists collected by the scoring loop into one list of ids.
flat_list1 = flat_list(url_ids)

In [None]:
# 'url_id' must hold the flattened id list (flat_list1). The original stored
# flat_list, which is the helper function itself and cannot be serialized by
# json.dump.
score = {'url_id': flat_list1, 'p_score': p_score, 'n_score': n_score,
         'polarity_score': polarity_score,
         'subjectivity_score': subjectivity_score}
# The context manager closes the file even if serialization fails.
with open('/content/drive/MyDrive/Text_EA/result/scores.json', 'w') as file1:
  json.dump(score, file1)

### Analysis of Readability

It is calculated using the Gunning Fog index formula described below:

Average Sentence Length = the number of words / the number of sentences

Percentage of Complex words = the number of complex words / the number of words

Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

In [None]:
# Folders with the word-token (wt_*.json) and sentence-token (st_*.json) files
# written by the tokenization step.
word_path = '/content/drive/MyDrive/Text_EA/tokens/word_tokens'
sent_path = '/content/drive/MyDrive/Text_EA/tokens/sent_tokens'

In [None]:
# files[0]: word-token file names, files[1]: sentence-token file names.
# The readability loop pairs the two lists by index, but os.listdir() returns
# entries in arbitrary order, so both listings are sorted to keep
# wt_<id>.json and st_<id>.json of the same article at the same position.
files = [sorted(os.listdir(word_path)), sorted(os.listdir(sent_path))]

In [None]:
def count_complex_words(words_list):
  '''Count the words in words_list that have more than two syllable groups.

  A word's syllable groups are its runs of consecutive vowels (a, e, i, o, u,
  case-insensitive), where a run may not start at a word-final "e", plus one
  extra group for a word made up only of vowels/"y" that ends in "e".
  '''
  def _syllable_groups(token):
    groups = re.findall('(?!e$)[aeiou]+', token, re.I)
    groups = groups + re.findall('^[aeiouy]*e$', token, re.I)
    return len(groups)

  return sum(1 for token in words_list if _syllable_groups(token) > 2)

In [None]:
avg_sent_length = []
percent_of_complex_words = []
fog_index = []
url_id = []

for i in tqdm(range(len(files[0]))):
  word_file = files[0][i]
  # Token files are written as wt_<name>.json / st_<name>.json, so the
  # sentence file is derived from the word file's name. Pairing files[0][i]
  # with files[1][i] is unsafe because os.listdir() order is arbitrary and the
  # two listings need not line up. The index pairing is kept only as a
  # fallback for unexpectedly named files.
  if word_file.startswith('wt_'):
    sent_file = 'st_' + word_file[len('wt_'):]
  else:
    sent_file = files[1][i]

  with open(os.path.join(word_path, word_file), 'r') as file1:
    words = json.load(file1)
  with open(os.path.join(sent_path, sent_file), 'r') as file2:
    sent = json.load(file2)
  no_of_words = len(words)

  # re.findall returns a list; url_id is flattened in the next cell.
  url_id.append(re.findall(r"\d+\.\d+", word_file))

  # Guard both divisions so an empty token file yields zeros instead of
  # raising ZeroDivisionError. The average is truncated to int as before.
  avg_length = int(no_of_words/len(sent)) if sent else 0
  complex_share = count_complex_words(words)/no_of_words if no_of_words else 0.0

  avg_sent_length.append(avg_length)
  percent_of_complex_words.append(complex_share)
  fog_index.append(0.4*(avg_length + complex_share))

100%|██████████| 111/111 [00:00<00:00, 169.43it/s]


In [None]:
# Flatten the ids collected by the readability loop.
# Note: this overwrites the flat_list1 produced earlier for the sentiment scores.
flat_list1 = flat_list(url_id)

In [None]:
# 'url_id' must hold the flattened id list (flat_list1). The original stored
# flat_list, which is the helper function itself and cannot be serialized by
# json.dump.
readability = {'url_id': flat_list1, 'avg_sent_length': avg_sent_length,
               'percent_of_complex_words': percent_of_complex_words,
               'fog_index': fog_index}

In [None]:
# Save the readability metrics; the context manager closes the file even if
# serialization fails.
with open('/content/drive/MyDrive/Text_EA/result/readability_analysis.json', 'w') as file1:
  json.dump(readability, file1)

### Average Number of Words Per Sentence
The formula for calculating is:

Average Number of Words Per Sentence = the total number of words / the total number of sentences

### Complex Word Count
Complex words are words in the text that contain more than two syllables.


In [None]:
# Folder with the raw extracted article texts (before stop-word filtering).
path_extracted = '/content/drive/MyDrive/Text_EA/extracted_data'

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

avg_no_words_per_sent = []
complex_word_count = []
url_id = []

# Work on the raw extracted text: tokenize each file, then record the file's
# base name, its words-per-sentence ratio (truncated to int) and its number of
# complex words.
for filename in tqdm(os.listdir(path_extracted)):
  with open(os.path.join(path_extracted, filename), 'r') as file1:
    text = file1.read()
  base = os.path.splitext(filename)[0]
  word_tokens = word_tokenize(text)
  sent_tokens = sent_tokenize(text)

  url_id.append(base)
  words_per_sentence = len(word_tokens)/len(sent_tokens)
  avg_no_words_per_sent.append(int(words_per_sentence))
  complex_word_count.append(count_complex_words(word_tokens))

100%|██████████| 111/111 [00:01<00:00, 56.60it/s]


In [None]:
# Collect the per-file results. The key was misspelled 'compplex_word_count';
# the final sheet renames columns by position, so correcting the spelling does
# not affect the later cells.
avg_words_and_complex_words = {'url_id': url_id, 'avg_no_words_per_sent': avg_no_words_per_sent,
                               'complex_word_count': complex_word_count}

In [None]:
# Save the results; the context manager closes the file even if serialization
# fails.
with open('/content/drive/MyDrive/Text_EA/result/avg_and_complex_words.json', 'w') as file1:
  json.dump(avg_words_and_complex_words, file1)

### Word Count
We count the total cleaned words present in the text by 
1. removing the stop words (using stopwords class of nltk package).
2. removing any punctuations like ? ! , . from the word before counting.

### Syllable Count Per Word
We count the number of Syllables in each word of the text by counting the vowels present in each word. We also handle some exceptions like words ending with "es","ed" by not counting them as a syllable.


In [None]:
def count_syllables(word):
  '''Count the vowels (a, e, i, o, u; case-insensitive) in word.

  An "e" is not counted when it is the last letter of the word or when it
  starts a trailing "es" or "ed".
  '''
  vowels = 'aeiou'
  pattern = re.compile(f'(?!e$)(?!es$)(?!ed$)[{vowels}]', re.I)
  return sum(1 for _ in pattern.finditer(word))

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# using RegexpTokenizer to remove punctuations

tokenizer = RegexpTokenizer(r'\w+')
# Loop-invariant, so the stop-word set is built once instead of once per file.
stop_words = set(stopwords.words('english'))

url_id = []
word_count = []
syllable_count = []

for filename in tqdm(os.listdir(path_extracted)):
  f = os.path.join(path_extracted, filename)
  base, ext = os.path.splitext(filename)
  with open(f, 'r') as file1:
    text = file1.read()

  # NLTK's English stop words are lowercase, so the comparison is made on the
  # lowercased token; otherwise capitalised stop words ("The", "I", ...) are
  # counted as content words.
  filtered_words = [word for word in tokenizer.tokenize(text)
                    if word.lower() not in stop_words]
  # [word, syllable count] pairs, kept as lists so the JSON layout is unchanged.
  syllable_per_word = [[word, count_syllables(word)] for word in filtered_words]

  # A dedicated variable name: the original reused `path`, which overwrote the
  # MasterDictionary folder path defined earlier in the notebook.
  syllable_file = '/content/drive/MyDrive/Text_EA/result/syllable_per_word' + '/' + base + '_syllable_count_per_word.json'
  with open(syllable_file, 'w') as file1:
    json.dump(syllable_per_word, file1)

  url_id.append(base)
  word_count.append(len(filtered_words))
  syllable_sum = sum(count for _, count in syllable_per_word)
  # Average syllables per word, truncated to int as before; 0 when no words
  # remain after filtering (the original raised ZeroDivisionError).
  if syllable_per_word:
    syllable_count.append(int(syllable_sum/len(syllable_per_word)))
  else:
    syllable_count.append(0)

100%|██████████| 111/111 [00:01<00:00, 66.16it/s]


In [None]:
# Bundle the per-file results; syllable_count holds each file's average syllables per word (truncated to int).
word_and_syllable_count = {'url_id': url_id, 'word_count': word_count, 'syllable_count': syllable_count}

In [None]:
# Save the results; the context manager closes the file even if serialization
# fails.
with open('/content/drive/MyDrive/Text_EA/result/word_and_syllable_count.json','w') as file1:
  json.dump(word_and_syllable_count, file1)

### Personal Pronouns
To calculate the Personal Pronouns mentioned in the text, we use a regex to count the words “I,” “we,” “my,” “mine,” “ours,” and “us”. The match for “us” is case-sensitive so that the country name US is not counted.

### Average Word Length
Average Word Length is calculated by the formula:
Sum of the total number of characters in each word/Total number of words


In [None]:
def count_personal_pronouns(text):
  '''Return how many whole-word personal pronouns occur in text.

  "I", "we", "ours", "my" and "mine" are matched case-insensitively; "us" is
  matched only in lowercase, so the upper-case "US" is not counted.
  '''
  pattern = r'\b(I|we|ours|my|mine|(?-i:us))\b'
  return sum(1 for _ in re.finditer(pattern, text, re.I))

In [None]:
url_id = []
personal_pronouns_count = []
word_avg_length = []

# For every extracted text: count the personal pronouns and compute the
# average length of the whitespace-separated words, rounded to an integer.
for filename in tqdm(os.listdir(path_extracted)):
  base = os.path.splitext(filename)[0]
  with open(os.path.join(path_extracted, filename), 'r') as file1:
    text = file1.read()
  words = text.split()

  total_chars = sum(len(word) for word in words)

  url_id.append(base)
  personal_pronouns_count.append(count_personal_pronouns(text))
  word_avg_length.append(round(total_chars/len(words)))

In [None]:
# Bundle the per-file pronoun counts and rounded average word lengths, keyed by file base name.
pronouns_and_word_length = {'url_id': url_id, 'personal_pronouns': personal_pronouns_count,
                            'word_avg_length': word_avg_length}

In [None]:
# Save the results. The with-block closes the file on exit, so no explicit
# close() is needed.
result_file = '/content/drive/MyDrive/Text_EA/result/pronouns_and_word_length.json'
with open(result_file, 'w') as file1:
  json.dump(pronouns_and_word_length, file1)

## Collecting all the results and saving them in the Excel sheet

In [2]:
# Folder into which the earlier sections wrote their JSON result files.
path_result = '/content/drive/MyDrive/Text_EA/result'

In [7]:
# Load every JSON result file into results, keyed by the file's base name.
results = {}
for filename in tqdm(os.listdir(path_result)):
  f = os.path.join(path_result, filename)
  base, ext = os.path.splitext(filename)
  # Only regular .json files hold results. Skipping everything else up front
  # (the syllable_per_word sub-folder, a spreadsheet saved in this folder)
  # replaces the bare `except: pass`, which hid every kind of error.
  if ext.lower() != '.json' or not os.path.isfile(f):
    continue
  try:
    with open(f, 'r') as file1:
      results[base] = json.load(file1)
  except (OSError, ValueError) as error:
    # Still best-effort, but the unreadable or invalid file is reported.
    # JSON and Unicode decoding errors are ValueError subclasses.
    print(f'Skipping {filename}: {error}')

100%|██████████| 6/6 [00:00<00:00,  9.85it/s]


In [45]:
# Build one DataFrame per loaded result file, keyed by the file's base name.
# Each name is printed as it is converted.
df = {}

for key, payload in results.items():
  print(key)
  df[key] = pd.DataFrame.from_dict(payload)

scores
readability_analysis
avg_and_complex_words
word_and_syllable_count
pronouns_and_word_length


In [26]:
# Outer-join the sentiment scores with the readability metrics on url_id.
all_df = df['scores'].merge(df['readability_analysis'], on='url_id', how='outer')

In [29]:
# Outer-join the words-per-sentence / complex-word results with the
# word-count / syllable results on url_id.
all_df1 = df['avg_and_complex_words'].merge(
    df['word_and_syllable_count'], on='url_id', how='outer')

In [30]:
# Add the pronoun counts and average word lengths to the second partial table.
all_df1 = all_df1.merge(df['pronouns_and_word_length'], on='url_id', how='outer')

In [32]:
# Combine both partial tables into one frame holding every metric per url_id.
all_df = all_df.merge(all_df1, on='url_id', how='outer')

In [58]:
# url_id values come from file names, so convert them to float and order the
# rows by id.
all_df = all_df.astype({'url_id': float}).sort_values('url_id')

In [83]:
# Column names of the computed results, in their current order.
column1 = all_df.columns.tolist()

## Importing output structure sheet and saving final result.

In [103]:
# Load the output template; its column names become the headers of the final sheet.
main_df = pd.read_excel('/content/drive/MyDrive/Text_EA/Output Data Structure.xlsx')

In [74]:
# Column names of the output template, in sheet order.
columns = main_df.columns.tolist()

In [104]:
# list.remove() accepts exactly one value, so remove('URL', 'URL_ID') raised
# TypeError. Only 'URL' is removed: 'URL_ID' has to stay in the list so that
# the positional mapping built below renames url_id to URL_ID, which is the
# key of the final merge.
# NOTE(review): assumes URL_ID comes before the metric columns in the template
# and that the metric columns are in the same order as all_df's - confirm.
columns.remove('URL')

In [84]:
# Pair the computed column names (index 0) with the template column names (index 1);
# the next cell maps them onto each other position by position.
column_list = [column1, columns]

In [87]:
# Map each computed column name to the template column at the same position.
# Indexing raises IndexError if the template list is shorter.
column_dict = {computed_name: column_list[1][position]
               for position, computed_name in enumerate(column_list[0])}

In [90]:
# Give the computed columns their template names.
all_df = all_df.rename(columns=column_dict)

In [106]:
# Drop only the metric columns from the template; they are re-added, filled
# in, by the merge in the next cell. The identifier columns must survive:
# URL_ID is that merge's key, and dropping it would make the merge fail.
main_df.drop([name for name in columns if name not in ('URL_ID', 'URL')],
             axis=1, inplace=True)

In [107]:
# Attach the computed metrics to the template's identifier columns via URL_ID.
data = main_df.merge(all_df, on='URL_ID', how='outer')

In [110]:
# Write the final sheet into the result folder.
# NOTE(review): index=False is not passed, so the DataFrame index is written as an extra first column - confirm that is wanted.
data.to_excel('/content/drive/MyDrive/Text_EA/result/Final_Result.xlsx')

## Comments:
1. The syllable-per-word files in the result folder contain the exact syllable count of each word in a text file, whereas the syllable count in the Excel sheet is the average per word for that text file.
2. Three websites with URL_ID 44, 57 and 144 are broken.
3. Final sheet is saved in the result folder.