In [1]:
import numpy as np
import pandas as pd
import requests
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer

In [2]:
# 1. Extract list of urls from input file
input_files_url='https://docs.google.com/spreadsheets/d/1fBx_dmkWVias5UVBIJw_zGdh46GcHcEB/edit#gid=959784854'
input_files_url= input_files_url.replace('/edit#gid=', '/export?format=csv&gid=')
df=pd.read_csv(input_files_url)
url_list = df[df.columns[1]].values.tolist()

In [3]:
# 2. Selenium Setup
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
from selenium.webdriver.common.by import By

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver',options=chrome_options)

# # Loop to be added for every url
# driver.get(url_list[0])
# article_title=driver.find_element(By.TAG_NAME,"h1").text
# article_body=driver.find_element(By.CLASS_NAME, "td-post-content").text

In [4]:
## 3. Uncomment to create article files for further analysis if not exist inside an articles folder

# for i in df.index:
#     driver.get(df['URL'][i])
#     article_title=driver.find_element(By.TAG_NAME,"h1").text
#     article_body=driver.find_element(By.CLASS_NAME, "td-post-content").text
#     filename='articles/'+str(df['URL_ID'][i])+'.txt'
#     with open(filename,"w+") as text_file:
#         print('Writing File '+str(df['URL_ID'][i]))
#         text_file.write(article_title+'\n'+article_body)
driver.quit()

## Textual Analysis

In [5]:
## 4. Creating list of StopWords and combining them all
swlist=[]
with open('StopWords/StopWords_Names.txt') as file: # Names
    sw_names=word_tokenize(file.read())
    for i in sw_names:
        if i.isupper():
            swlist.append(i)
            
with open('StopWords/StopWords_Geographic.txt') as file: # Geographic
    sw_geo=word_tokenize(file.read())
    for i in sw_geo:
        if i.isupper():
            swlist.append(i)
            
with open('StopWords/StopWords_GenericLong.txt') as file: # Generic Long
    sw_genl=word_tokenize(file.read())
    swlist.extend(sw_genl)

with open('StopWords/StopWords_Generic.txt') as file: # Generic
    sw_gen=word_tokenize(file.read())
    swlist.extend(sw_gen)

with open('StopWords/StopWords_DatesandNumbers.txt') as file: # Dates
    sw_dn=word_tokenize(file.read())
    for i in sw_dn:
        if i.isupper():
            swlist.append(i)

# For currencies need to use iso encoding to not run into unicodedecode error
with open('StopWords/StopWords_Currencies.txt',encoding='iso-8859-15') as file: # Currencies
    tokenizer = RegexpTokenizer('\w+')
    sw_curr=tokenizer.tokenize(file.read())
    for i in sw_curr:
        if i.isalnum():
            swlist.append(i)
            
with open('StopWords/StopWords_Auditor.txt') as file: # Auditor
    sw_auditor=word_tokenize(file.read())
    for i in sw_auditor:
        if i.isupper():
            swlist.append(i)
            
sw_lower=[x.lower() for x in swlist]
# print((sw_lower[:10]))

In [6]:
# 5. Fetching Master Dictionary
pos_words=[]
with open('MasterDictionary/positive-words.txt') as file: # positive Words
    pos_words=word_tokenize(file.read())

neg_words=[]
with open('MasterDictionary/negative-words.txt',encoding='iso-8859-15') as file: # negative Words
    neg_words=word_tokenize(file.read())


### Helper Functions

In [7]:
def get_article_from_id(url_id):    
    with open('articles/'+str(url_id)+".txt") as file:
        article_text=file.read()
    return article_text

In [8]:
def standard_tokenizer(article):
    tk = WhitespaceTokenizer()
    punc="!#$%&'()*+,-./:;<=>?@[\]^_`{|}~ "
    tokens=tk.tokenize(article)
    std_tokens=[word.strip(punc) for word in tokens]
    return std_tokens

In [9]:
def clean_text_tokenizer(article):
    article_tokens=standard_tokenizer(article)
    clean_text=[]
    for i in article_tokens:
        target = i.lower()
        if target in sw_lower:
            pass
        else:
            clean_text.append(i)
    return clean_text

In [10]:
# def clean_text_tokenizer2(article):
#     article_tokens=word_tokenize(article)
#     clean_text=[]
#     for i in article_tokens:
#         target = i.lower()
#         if target in sw_lower:
#             pass
#         else:
#             if target.isalnum():
#                 clean_text.append(i)
#     return clean_text

In [11]:
def scores_calc(clean_text,pos_words,neg_words):
    # Positive and Negative Scores
    pos_score,neg_score=0,0
    for i in clean_text:
        if i.lower() in pos_words:
            pos_score+=1
        if i.lower() in neg_words:
            neg_score+=1
            
    # Polarity     
    polarity=(pos_score-neg_score)/(neg_score+pos_score+0.000001)
    
    # Subjectivity Score
    num_words=len(clean_text)
    subj_score=(pos_score+neg_score)/(num_words+0.000001)
    
    return pos_score, neg_score, polarity, subj_score

### 2. Analysis of Readability

In [12]:
def avg_sen_len(article,std_text):
    num_sent=len(sent_tokenize(article))
    num_words=len(std_text)
    asl=num_words/num_sent
    return asl

In [13]:
def per_comp_words(std_text):
    vowels = "AaEeIiOoUu"
    complex_count=0
    for word in std_text:
        target=word
        vcount=0
        if word.endswith('es') or word.endswith('ed'):
            target=word[:-2]
        for alphabet in target:
            # If alphabet is a vowel
            if alphabet in vowels:
                vcount += 1
        if vcount>2:
            complex_count+=1
    #         print(target)
    num_words=len(std_text)
    perc=complex_count/num_words*100
    return perc,complex_count

In [14]:
def fog_index(article,std_text):
    asl=avg_sen_len(article,std_text)
    perc,complex_count=per_comp_words(std_text)
    fogi=0.4*(asl+perc)
    return fogi

In [15]:
def avg_num_words(article,std_text):
    return avg_sen_len(article,std_text)

In [16]:
def clean_word_count(clean_text):
    return len(clean_text)

In [17]:
def avg_syl_count(std_text):
    vowels = "AaEeIiOoUu"
    total_vcount=0
    for word in std_text:
        target=word
        vcount=0
        if word.endswith('es') or word.endswith('ed'):
            target=word[:-2]
        for alphabet in target:
            # If alphabet is a vowel
            if alphabet in vowels:
                vcount += 1
        total_vcount+=vcount
    #         print(target)
    num_words=len(std_text)
    asc=total_vcount/num_words
    return asc

In [18]:
def pers_pro_count(std_text):
    p_count=0
    pronouns=['i','we','my','ours','us']
    for i in std_text:
        if i=='US':
            continue
        target=i.lower()
        if target in pronouns:
            p_count+=1
    return p_count

In [19]:
def avg_word_len(std_text):
    num_char=0
    for i in std_text:
        num_char+=len(i)
    num_words=len(std_text)
    awl=num_char/num_words
    return awl

## Analysis of all the 150 articles

In [35]:
output_files_url='https://docs.google.com/spreadsheets/d/1GQ_akhFuLyDYob1y7m_wAEmR9-hSr89q/edit#gid=2010199760'
output_files_url= output_files_url.replace('/edit#gid=', '/export?format=csv&gid=')
out_df=pd.read_csv(output_files_url)


In [36]:
for i in out_df.index:
    
    url_id=out_df['URL_ID'][i]
    print('Processing article: '+str(url_id))
    article=get_article_from_id(url_id)
    std_text=standard_tokenizer(article)
    clean_text=clean_text_tokenizer(article)
    
    out_df[out_df.columns[2]][i], out_df[out_df.columns[3]][i], out_df[out_df.columns[4]][i], out_df[out_df.columns[5]][i]=scores_calc(clean_text,pos_words,neg_words)
    out_df[out_df.columns[6]][i] = avg_sen_len(article,std_text)
    out_df[out_df.columns[7]][i], out_df[out_df.columns[10]][i] = per_comp_words(std_text)
    out_df[out_df.columns[8]][i] = fog_index(article,std_text)
    out_df[out_df.columns[9]][i] = avg_num_words(article,std_text)
    out_df[out_df.columns[11]][i] = clean_word_count(clean_text)
    out_df[out_df.columns[12]][i] = avg_syl_count(std_text)
    out_df[out_df.columns[13]][i] = pers_pro_count(std_text)
    out_df[out_df.columns[14]][i] = avg_word_len(std_text)

Processing article: 1
Processing article: 2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_df[out_df.columns[2]][i], out_df[out_df.columns[3]][i], out_df[out_df.columns[4]][i], out_df[out_df.columns[5]][i]=scores_calc(clean_text,pos_words,neg_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_df[out_df.columns[6]][i] = avg_sen_len(article,std_text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_df[out_df.columns[7]][i], out_df[out_df.columns[10]][i] = per_comp_words(std_text)
A value is trying to be set on a copy of a slice from a DataFrame

Processing article: 3
Processing article: 4
Processing article: 5
Processing article: 6
Processing article: 7
Processing article: 8
Processing article: 9
Processing article: 10
Processing article: 11
Processing article: 12
Processing article: 13
Processing article: 14
Processing article: 15
Processing article: 16
Processing article: 17
Processing article: 18
Processing article: 19
Processing article: 20
Processing article: 21
Processing article: 22
Processing article: 23
Processing article: 24
Processing article: 25
Processing article: 26
Processing article: 27
Processing article: 28
Processing article: 29
Processing article: 30
Processing article: 31
Processing article: 32
Processing article: 33
Processing article: 34
Processing article: 35
Processing article: 36
Processing article: 37
Processing article: 38
Processing article: 39
Processing article: 40
Processing article: 41
Processing article: 42
Processing article: 43
Processing article: 44
Processing article: 45
Processing article

In [31]:
out_df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,1,https://insights.blackcoffer.com/is-telehealth...,30.0,8.0,0.578947,0.10951,26.692308,33.14121,23.933407,26.692308,230.0,347.0,2.04755,0.0,5.5317
1,2,https://insights.blackcoffer.com/how-telehealt...,33.0,16.0,0.346939,0.124365,17.944444,26.21259,17.662814,17.944444,254.0,394.0,1.868937,0.0,5.024768
2,3,https://insights.blackcoffer.com/is-telemedici...,82.0,28.0,0.490909,0.122631,18.755319,33.635848,20.956467,18.755319,593.0,897.0,2.167896,1.0,5.667045
3,4,https://insights.blackcoffer.com/is-telehealth...,54.0,22.0,0.421053,0.079581,20.482759,34.792368,22.110051,20.482759,620.0,955.0,2.159933,0.0,5.748036
4,5,https://insights.blackcoffer.com/how-people-di...,76.0,33.0,0.394495,0.120843,22.654321,31.06267,21.486797,22.654321,570.0,902.0,2.088283,1.0,5.580926


In [30]:
out_df.tail()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
145,146,https://insights.blackcoffer.com/blockchain-fo...,22.0,26.0,-0.083333,0.111111,18.693878,26.310044,18.001568,18.693878,241.0,432.0,1.909389,9.0,5.398472
146,147,https://insights.blackcoffer.com/the-future-of...,37.0,12.0,0.510204,0.065596,22.652174,26.359565,19.604696,22.652174,412.0,747.0,1.887396,2.0,5.150992
147,148,https://insights.blackcoffer.com/big-data-anal...,28.0,45.0,-0.232877,0.122896,17.590909,29.371232,18.784856,17.590909,341.0,594.0,1.932817,2.0,5.118863
148,149,https://insights.blackcoffer.com/business-anal...,32.0,4.0,0.777778,0.094737,24.166667,33.103448,22.908046,24.166667,240.0,380.0,2.088276,0.0,5.702069
149,150,https://insights.blackcoffer.com/challenges-an...,31.0,38.0,-0.101449,0.138833,15.969697,23.529412,15.799643,15.969697,248.0,497.0,1.889943,8.0,5.044592


In [32]:
out_df.to_csv('output.csv',index=False)

In [34]:
pd_object = pd.ExcelWriter('output_excel.xlsx') # requires openpyxl
out_df.to_excel(pd_object, index=False)
 
pd_object.save()