In [5]:
import glob
import os

import numpy
import pandas
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk import word_tokenize


In [6]:
# Load the input sheet; later cells read its 'URL_ID' and 'URL' columns.
data = pandas.read_excel(r'Input.xlsx')


In [10]:
# Display the loaded frame to eyeball the URL_ID / URL columns.
data

Unnamed: 0,URL_ID,URL
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...
...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...


In [7]:
def scrape_articles(urls, timeout=30):
    """Download each URL and extract the article title and main content.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds to wait on a request before giving up. ``requests``
            waits indefinitely when no timeout is passed, so without it a
            single unresponsive host would stall the whole scraping run.

    Returns:
        A list with one dict per input URL, in input order, holding the keys
        ``'title'``, ``'content'`` and ``'url'``. When fetching or parsing a
        URL fails, its entry has ``'title'`` set to ``"Error"`` and
        ``'content'`` set to the exception message.
    """
    articles = []
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # The title lives in <h1 class="entry-title">.
            title = soup.find('h1', class_='entry-title')
            title_text = title.get_text().strip() if title else "No title found"

            # The article body lives in <div class="td-main-content">.
            content = soup.find('div', class_='td-main-content')
            content_text = content.get_text().strip() if content else "No content found"

            articles.append({
                'title': title_text,
                'content': content_text,
                'url': url
            })

        except Exception as e:
            # Best effort: report the failure, record a placeholder entry and
            # carry on so the output stays aligned with the input URLs.
            print(f"Error scraping {url}: {e}")
            articles.append({
                'title': "Error",
                'content': str(e),
                'url': url
            })

    return articles




In [None]:
# Scrape every URL listed in the input sheet.
urls = data['URL'].tolist()
articles = scrape_articles(urls)

# Write all articles into a single text file; each article is followed by a
# ruler line made of 80 '=' characters.
separator = "\n" + "=" * 80 + "\n"
with open('scraped_articles.txt', 'w', encoding='utf-8') as out_file:
    for entry in articles:
        out_file.write(
            f"\nURL: {entry['url']}\n"
            f"TITLE: {entry['title']}\n"
            f"CONTENT:\n{entry['content']}\n"
            + separator
        )

In [10]:

# Directory receiving one text file per article, named after its URL_ID.
# exist_ok=True makes the cell safe to re-run without a separate exists() check.
os.makedirs('extracted_articles', exist_ok=True)


for url_id, url in zip(data['URL_ID'], data['URL']):
    try:
        # requests waits indefinitely unless a timeout is given; bound the wait
        # so one unresponsive page cannot stall the whole loop.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title
        title = soup.find('h1', class_='entry-title')
        title_text = title.get_text().strip() if title else "No title found"

        # Extract main content
        content = soup.find('div', class_='td-main-content')
        content_text = content.get_text().strip() if content else "No content found"

        # Save to individual file: title line, blank line, then the body.
        filename = os.path.join('extracted_articles', f"{url_id}.txt")
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f"Title: {title_text}\n\n")
            file.write(content_text)

    except Exception as e:
        # Best effort: report the failing article and continue with the rest.
        print(f"Error processing {url_id}: {e}")

In [13]:
import os
import re

def clean_article_text(text):
    """Strip contact details, URLs and redundant whitespace from article text.

    Args:
        text: Raw article text.

    Returns:
        The text with e-mail addresses, phone-number-like digit runs and
        http(s) URLs removed, everything from the first contact-section marker
        onwards dropped, and every run of whitespace (newlines included)
        collapsed to a single space. Leading/trailing whitespace is stripped.
    """
    # Remove email addresses
    text = re.sub(r'\S+@\S+\.\S+', '', text)

    # Remove phone numbers (various formats). The match has to start and end
    # on a digit: without that, a run of ten or more blank lines, spaces or
    # dashes counts as a "phone number", and deleting it glues the words on
    # either side together.
    text = re.sub(r'\+?\d[\d\s-]{8,}\d', '', text)

    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Remove common contact details sections
    contact_patterns = [
        r'Contact Details.*',
        r'Here are my contact details:.*',
        r'Firm Name:.*',
        r'Firm Website:.*',
        r'Firm Address:.*',
        r'Email:.*',
        r'Skype:.*',
        r'WhatsApp:.*',
        r'Telegram:.*'
    ]

    # re.DOTALL lets '.*' cross newlines, so each match removes everything
    # from the marker to the END of the text, not just the rest of the line.
    for pattern in contact_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)

    # Collapse every whitespace run (including blank lines) to one space.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()




Finished cleaning all articles


In [None]:

articles_dir = 'extracted_articles'
for filename in os.listdir(articles_dir):
    # Only the .txt article files are cleaned.
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(articles_dir, filename)

    with open(file_path, 'r', encoding='utf-8') as source:
        content = source.read()

    cleaned_content = clean_article_text(content)

    # The cleaned text is written back over the same file.
    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(cleaned_content)

print("Finished cleaning all articles")

In [16]:

# Destination for the cleaned copies; created on first run.
output_dir = 'cleaned_articles'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

articles_dir = 'extracted_articles'
txt_names = [name for name in os.listdir(articles_dir) if name.endswith('.txt')]
for filename in txt_names:
    input_path = os.path.join(articles_dir, filename)
    output_path = os.path.join(output_dir, filename)

    # Read the extracted article.
    with open(input_path, 'r', encoding='utf-8') as source:
        content = source.read()

    cleaned_content = clean_article_text(content)

    # Store the cleaned text under the same file name in the output directory.
    with open(output_path, 'w', encoding='utf-8') as target:
        target.write(cleaned_content)

print(f"Finished cleaning all articles. Cleaned files saved in {output_dir}/")


Finished cleaning all articles. Cleaned files saved in cleaned_articles/


### Sentiment Analysis

### 1.1 Using Stop Words

In [20]:
# Define paths
cleaned_articles_dir = 'cleaned_articles'
stop_words_dir = 'StopWords'

# Load all stop words from the StopWords directory
stop_words = set()

# List of stop word files to process
stop_word_files = [
    'StopWords_Auditor.txt',
    'StopWords_DatesandNumbers.txt', 
    'StopWords_Generic.txt',
    'StopWords_GenericLong.txt',
    'StopWords_Geographic.txt',
    'StopWords_Names.txt'
]


for filename in stop_word_files:
    try:
        file_path = os.path.join(stop_words_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            # Add each word from the file to the stop words set
            # Strip whitespace and remove any text after | character
            for line in file:
                word = line.split('|')[0].strip()
                if word:  # Only add non-empty strings
                    stop_words.add(word.lower())  # Convert to lowercase for case-insensitive matching
    except FileNotFoundError:
        print(f"Warning: Stop words file {filename} not found")
        continue
    except UnicodeDecodeError:
        # Try with a different encoding if utf-8 fails
        try:
            with open(file_path, 'r', encoding='latin-1') as file:
                for line in file:
                    word = line.split('|')[0].strip()
                    if word:
                        stop_words.add(word.lower())
        except:
            print(f"Warning: Could not read {filename} with either utf-8 or latin-1 encoding")
            continue

print(f"Loaded {len(stop_words)} stop words")



Loaded 12676 stop words
Finished removing stop words from all articles


In [None]:
def remove_stop_words(text):
    """Return ``text`` without its stop words.

    The text is split on whitespace; a token is dropped when its lower-cased
    form is in the module-level ``stop_words`` set. Surviving tokens keep
    their original casing and are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Strip stop words from every .txt file in cleaned_articles_dir, writing the
# result back over the same file.
for filename in os.listdir(cleaned_articles_dir):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(cleaned_articles_dir, filename)

    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            content = source.read()

        filtered_content = remove_stop_words(content)

        with open(file_path, 'w', encoding='utf-8') as target:
            target.write(filtered_content)
    except Exception as e:
        # Report the failing file and move on to the next one.
        print(f"Error processing {filename}: {str(e)}")

print("Finished removing stop words from all articles")


In [21]:

# Destination for the stop-word-free copies; created on first run.
cleaned_no_stopwords_dir = 'cleaned_articles_no_stopwords'
if not os.path.exists(cleaned_no_stopwords_dir):
    os.makedirs(cleaned_no_stopwords_dir)


txt_names = [name for name in os.listdir(cleaned_articles_dir) if name.endswith('.txt')]
for filename in txt_names:
    src_path = os.path.join(cleaned_articles_dir, filename)
    dst_path = os.path.join(cleaned_no_stopwords_dir, filename)

    try:
        with open(src_path, 'r', encoding='utf-8') as source:
            content = source.read()

        filtered_content = remove_stop_words(content)

        with open(dst_path, 'w', encoding='utf-8') as target:
            target.write(filtered_content)
    except Exception as e:
        # Report the failing file and move on to the next one.
        print(f"Error processing {filename}: {str(e)}")

print(f"Finished saving cleaned articles without stop words to {cleaned_no_stopwords_dir}")


Finished saving cleaned articles without stop words to cleaned_articles_no_stopwords


### 1.2 Master Dictionary

### Scores: Positive, Negative, Polarity, Subjectivity

In [20]:
import nltk
from nltk.tokenize import word_tokenize
import os
nltk.download('punkt_tab')


def read_word_list(file_path):
    """Load a word list file as a set of lower-cased words.

    Blank lines and lines whose first non-blank character is ';' are skipped.
    Words are lower-cased because calculate_sentiment_scores lower-cases the
    text before looking tokens up, so an entry containing capitals could
    otherwise never match.

    Args:
        file_path: Path of the word list, one word per line.

    Returns:
        The set of words. The file is decoded as UTF-8 first; on a decoding
        error it is re-read as latin-1, and if that attempt fails the error is
        printed and an empty set is returned. Any non-decoding error from the
        UTF-8 attempt (e.g. a missing file) propagates to the caller.
    """
    def parse(handle):
        stripped_lines = (line.strip() for line in handle)
        return {
            word.lower()
            for word in stripped_lines
            if word and not word.startswith(';')
        }

    # Try UTF-8 first
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return parse(file)
    except UnicodeDecodeError:
        # Fall back to latin-1, which can decode any byte sequence.
        try:
            with open(file_path, 'r', encoding='latin-1') as file:
                return parse(file)
        except Exception as e:
            print(f"Error reading file {file_path}: {str(e)}")
            return set()

# Load the positive and negative word sets once at module level;
# calculate_sentiment_scores looks tokens up in these two sets.
positive_words = read_word_list('MasterDictionary/positive-words.txt')
negative_words = read_word_list('MasterDictionary/negative-words.txt')

def calculate_sentiment_scores(text):
    """Score ``text`` against the positive and negative word sets.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; every
    token found in ``positive_words`` / ``negative_words`` counts as one hit.

    Returns:
        A dict with
        ``'Positive_Score'``: number of positive hits,
        ``'Negative_Score'``: number of negative hits (non-negative),
        ``'Polarity_Score'``: (positive - negative) / (positive + negative + 0.000001),
        ``'Subjectivity_Score'``: (positive + negative) / (token count + 0.000001).
        The 0.000001 term keeps both ratios defined when a denominator is zero.
    """
    tokens = nltk.word_tokenize(text.lower())

    positive_hits = 0
    negative_hits = 0
    for token in tokens:
        # Checked independently: a token may appear in both sets.
        if token in positive_words:
            positive_hits += 1
        if token in negative_words:
            negative_hits += 1

    matched = positive_hits + negative_hits
    polarity = (positive_hits - negative_hits) / (matched + 0.000001)
    subjectivity = matched / (len(tokens) + 0.000001)

    return {
        'Positive_Score': positive_hits,
        'Negative_Score': negative_hits,
        'Polarity_Score': polarity,
        'Subjectivity_Score': subjectivity
    }


# Maps each article file name (extension included) to its sentiment scores.
sentiment_scores = {}
cleaned_dir = 'cleaned_articles_no_stopwords'

for filename in os.listdir(cleaned_dir):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(cleaned_dir, filename)

    try:
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                text = handle.read()
        except UnicodeDecodeError:
            # If UTF-8 fails, try latin-1
            with open(file_path, 'r', encoding='latin-1') as handle:
                text = handle.read()

        scores = calculate_sentiment_scores(text)
        sentiment_scores[filename] = scores

    except Exception as e:
        # Report the failing file and move on to the next one.
        print(f"Error processing {filename}: {str(e)}")

print("Finished calculating sentiment scores for all articles")

# Print example scores


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jayantkrishnasingh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Finished calculating sentiment scores for all articles

Scores for Netclan20241024.txt:
Positive Score: 10
Negative Score: 3
Polarity Score: 0.538
Subjectivity Score: 0.045

Scores for Netclan20241030.txt:
Positive Score: 13
Negative Score: 6
Polarity Score: 0.368
Subjectivity Score: 0.059

Scores for Netclan20241018.txt:
Positive Score: 8
Negative Score: 7
Polarity Score: 0.067
Subjectivity Score: 0.032

Scores for Netclan20241150.txt:
Positive Score: 4
Negative Score: 4
Polarity Score: 0.000
Subjectivity Score: 0.041

Scores for Netclan20241144.txt:
Positive Score: 7
Negative Score: 3
Polarity Score: 0.400
Subjectivity Score: 0.050


In [25]:

# Build the per-article score table from the in-memory results so this cell
# runs on a fresh kernel (scores_df was not defined anywhere in the notebook).
# sentiment_scores is keyed by file name (e.g. 'Netclan20241024.txt'); the
# extension is dropped to obtain the URL_ID used in Input.xlsx.
scores_df = pd.DataFrame.from_dict(sentiment_scores, orient='index')
scores_df.index = [os.path.splitext(name)[0] for name in scores_df.index]
scores_df = scores_df.reset_index().rename(columns={'index': 'URL_ID'})

urls_df = pd.read_excel('Input.xlsx')

# Left merge: keep every scored article, attach its URL where the ID matches.
merged_df = pd.merge(scores_df, urls_df[['URL_ID', 'URL']], on='URL_ID', how='left')


merged_df.to_csv('sentiment_scores.csv', index=False)

print("Added URLs to sentiment scores CSV file")


Added URLs to sentiment scores CSV file


In [21]:
# Preview the scores of (at most) the first 50 articles.
preview = list(sentiment_scores.items())[:50]
for filename, scores in preview:
    print(
        f"\nScores for {filename}:\n"
        f"Positive Score: {scores['Positive_Score']}\n"
        f"Negative Score: {scores['Negative_Score']}\n"
        f"Polarity Score: {scores['Polarity_Score']:.3f}\n"
        f"Subjectivity Score: {scores['Subjectivity_Score']:.3f}"
    )


Scores for Netclan20241024.txt:
Positive Score: 10
Negative Score: 3
Polarity Score: 0.538
Subjectivity Score: 0.045

Scores for Netclan20241030.txt:
Positive Score: 13
Negative Score: 6
Polarity Score: 0.368
Subjectivity Score: 0.059

Scores for Netclan20241018.txt:
Positive Score: 8
Negative Score: 7
Polarity Score: 0.067
Subjectivity Score: 0.032

Scores for Netclan20241150.txt:
Positive Score: 4
Negative Score: 4
Polarity Score: 0.000
Subjectivity Score: 0.041

Scores for Netclan20241144.txt:
Positive Score: 7
Negative Score: 3
Polarity Score: 0.400
Subjectivity Score: 0.050

Scores for Netclan20241145.txt:
Positive Score: 14
Negative Score: 2
Polarity Score: 0.750
Subjectivity Score: 0.054

Scores for Netclan20241151.txt:
Positive Score: 4
Negative Score: 1
Polarity Score: 0.600
Subjectivity Score: 0.030

Scores for Netclan20241019.txt:
Positive Score: 10
Negative Score: 3
Polarity Score: 0.538
Subjectivity Score: 0.045

Scores for Netclan20241031.txt:
Positive Score: 56
Negative

## Analysis of Readability

In [28]:
def _count_syllables(word):
    """Estimate the syllables of a lower-case word by counting vowel groups.

    Every maximal run of the vowels a/e/i/o/u counts as one syllable. A
    trailing 'es' or 'ed' is treated as silent and discounted, but only when
    that final 'e' started a vowel group of its own (i.e. it follows a
    consonant). In words such as 'continues' or 'agreed' the 'e' belongs to
    the preceding vowel run and was never counted, so subtracting it would
    undercount the word.
    """
    vowels = 'aeiou'
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    if word.endswith(('es', 'ed')) and (len(word) < 3 or word[-3] not in vowels):
        count -= 1
    return count


def calculate_readability_scores(text):
    """Compute sentence-length and complex-word readability measures.

    Sentences come from ``nltk.sent_tokenize``; words are the lower-cased
    ``nltk.word_tokenize`` tokens that are purely alphanumeric. A word is
    "complex" when its estimated syllable count is 3 or more.

    Returns:
        A dict with
        ``'avg_sentence_length'``: words per sentence (0 if no sentences),
        ``'percent_complex_words'``: complex words as a percentage of all
        words (0 if no words),
        ``'fog_index'``: 0.4 * (avg_sentence_length + percent_complex_words).
    """
    sentences = nltk.sent_tokenize(text)

    words = [token.lower() for token in nltk.word_tokenize(text)
             if token.isalnum()]

    num_words = len(words)
    num_sentences = len(sentences)

    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    complex_words = sum(1 for word in words if _count_syllables(word) >= 3)

    percent_complex = (complex_words / num_words * 100) if num_words > 0 else 0

    fog_index = 0.4 * (avg_sentence_length + percent_complex)

    return {
        'avg_sentence_length': avg_sentence_length,
        'percent_complex_words': percent_complex,
        'fog_index': fog_index
    }


# Maps each article ID (file name without extension) to its readability scores.
readability_scores = {}

# glob.glob returns matches in arbitrary, file-system-dependent order; sorting
# makes the processing order (and the row order of the CSVs built from it)
# reproducible between runs. article_files is reused by a later cell.
article_files = sorted(glob.glob('cleaned_articles_no_stopwords/*.txt'))

for article_file in article_files:
    try:
        with open(article_file, 'r', encoding='utf-8') as f:
            text = f.read()
            if text.strip():  # Skip empty files
                scores = calculate_readability_scores(text)
                # Use filename without extension as identifier
                article_id = os.path.splitext(os.path.basename(article_file))[0]
                readability_scores[article_id] = scores

    except Exception as e:
        # Report the failing file and move on to the next one.
        print(f"Error processing {article_file}: {str(e)}")

print("Finished calculating readability scores for all articles")




Finished calculating readability scores for all articles


In [29]:

# One row per article: the dict keys become the 'URL_ID' column.
readability_df = (
    pd.DataFrame.from_dict(readability_scores, orient='index')
    .reset_index()
    .rename(columns={'index': 'URL_ID'})
)

readability_df.to_csv('readability_scores.csv', index=False)

print("Saved readability scores to readability_scores.csv")


Saved readability scores to readability_scores.csv


In [30]:

# Reload both score tables from disk and join them on the article ID
# (default inner join: only IDs present in both files are kept).
readability_df = pd.read_csv('readability_scores.csv')
sentiment_df = pd.read_csv('sentiment_scores.csv')

merged_df = readability_df.merge(sentiment_df, on='URL_ID')

merged_df.to_csv('merged_scores.csv', index=False)

print("Saved merged scores to merged_scores.csv")


Saved merged scores to merged_scores.csv


In [33]:

# Maps each article ID (file name without extension) to its words-per-sentence.
avg_words_per_sentence = {}

for article_file in glob.glob('cleaned_articles_no_stopwords/*'):
    try:
        with open(article_file, 'r', encoding='utf-8') as f:
            text = f.read()

            # Skip empty files
            if not text.strip():
                continue

            sentences = nltk.sent_tokenize(text)

            # Words are whitespace-delimited tokens here.
            words = text.split()
            total_words = len(words)
            num_sentences = len(sentences)

            # Guard against division by zero.
            avg_words = total_words / num_sentences if num_sentences > 0 else 0

            article_id = os.path.splitext(os.path.basename(article_file))[0]
            avg_words_per_sentence[article_id] = avg_words

    except Exception as e:
        # Report the failing file and move on to the next one.
        print(f"Error processing {article_file}: {str(e)}")

print("Finished calculating average words per sentence for all articles")

# Convert to DataFrame: dict keys become the 'URL_ID' column.
avg_words_df = (
    pd.DataFrame.from_dict(avg_words_per_sentence, orient='index',
                           columns=['avg_words_per_sentence'])
    .reset_index()
    .rename(columns={'index': 'URL_ID'})
)

# Save to CSV
avg_words_df.to_csv('avg_words_per_sentence.csv', index=False)
print("Saved average words per sentence to avg_words_per_sentence.csv")


Finished calculating average words per sentence for all articles
Saved average words per sentence to avg_words_per_sentence.csv


In [34]:

# Reload both tables from disk and keep only the articles present in each.
avg_words_df = pd.read_csv('avg_words_per_sentence.csv')
sentiment_scores_df = pd.read_csv('sentiment_scores.csv')

merged_df = sentiment_scores_df.merge(avg_words_df, on='URL_ID', how='inner')

# Save merged dataframe to CSV
merged_df.to_csv('merged_sentiment_and_words.csv', index=False)
print("Saved merged data to merged_sentiment_and_words.csv")


Saved merged data to merged_sentiment_and_words.csv


In [35]:

# Maps each article ID (file name without extension) to its mean word length.
avg_word_lengths = {}

for article_file in article_files:
    try:
        with open(article_file, 'r', encoding='utf-8') as f:
            text = f.read()

        # Keep only purely alphanumeric tokens.
        words = [token for token in word_tokenize(text) if token.isalnum()]

        # Only calculate if there are valid words
        if words:
            total_chars = sum(map(len, words))
            avg_word_length = total_chars / len(words)

            article_id = os.path.splitext(os.path.basename(article_file))[0]
            avg_word_lengths[article_id] = avg_word_length

    except Exception as e:
        # Report the failing file and move on to the next one.
        print(f"Error processing {article_file}: {str(e)}")

print("Finished calculating average word length for all articles")


# Dict keys become the 'URL_ID' column.
avg_word_length_df = (
    pd.DataFrame.from_dict(avg_word_lengths, orient='index',
                           columns=['avg_word_length'])
    .reset_index()
    .rename(columns={'index': 'URL_ID'})
)


avg_word_length_df.to_csv('avg_word_length.csv', index=False)
print("Saved average word length to avg_word_length.csv")


Finished calculating average word length for all articles
Saved average word length to avg_word_length.csv


In [36]:

# Reload the three per-article tables from disk.
readability_df = pd.read_csv('readability_scores.csv')
sentiment_df = pd.read_csv('sentiment_scores.csv')
avg_word_length_df = pd.read_csv('avg_word_length.csv')

# Outer joins: an article missing from one table is still kept.
merged_scores = (
    readability_df
    .merge(sentiment_df, on='URL_ID', how='outer')
    .merge(avg_word_length_df, on='URL_ID', how='outer')
)

merged_scores.to_csv('merged_scores.csv', index=False)
print("Saved final merged scores to merged_scores.csv")


Saved final merged scores to merged_scores.csv


In [37]:

# os.replace overwrites an existing 'Final Result.csv' on every platform, so
# this cell can be re-run; os.rename raises FileExistsError on Windows when
# the destination already exists.
os.replace('merged_scores.csv', 'Final Result.csv')
print("Renamed merged_scores.csv to Final Result.csv")


Renamed merged_scores.csv to Final Result.csv
