In [24]:
import pandas
import numpy
import requests
from bs4 import BeautifulSoup
import os
from nltk import word_tokenize


In [6]:
data = pandas.read_excel(r'Input.xlsx')


In [10]:
data

Unnamed: 0,URL_ID,URL
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...
...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...


In [7]:
def scrape_articles(urls):
    articles = []
    for url in urls:
        try:
            response = requests.get(url)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find the title
            title = soup.find('h1', class_='entry-title')
            title_text = title.get_text().strip() if title else "No title found"
            
            # Find the main content
            content = soup.find('div', class_='td-main-content')
            content_text = content.get_text().strip() if content else "No content found"
            
            articles.append({
                'title': title_text,
                'content': content_text,
                'url': url
            })
            
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            articles.append({
                'title': "Error",
                'content': str(e),
                'url': url
            })
            
    return articles

# Update the scraping
urls = data['URL'].tolist()
articles = scrape_articles(urls)

# Save to file with better formatting
with open('scraped_articles.txt', 'w', encoding='utf-8') as file:
    for article in articles:
        file.write(f"\nURL: {article['url']}\n")
        file.write(f"TITLE: {article['title']}\n")
        file.write(f"CONTENT:\n{article['content']}\n")
        file.write("\n" + "="*80 + "\n")  # Separator between articles


In [10]:
# Create a directory to store individual article files
if not os.path.exists('extracted_articles'):
    os.makedirs('extracted_articles')

# Extract and save each article to individual files
for url_id, url in zip(data['URL_ID'], data['URL']):
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract title
        title = soup.find('h1', class_='entry-title')
        title_text = title.get_text().strip() if title else "No title found"
        
        # Extract main content
        content = soup.find('div', class_='td-main-content')
        content_text = content.get_text().strip() if content else "No content found"
        
        # Save to individual file
        filename = os.path.join('extracted_articles', f"{url_id}.txt")
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f"Title: {title_text}\n\n")
            file.write(content_text)
            
    except Exception as e:
        print(f"Error processing {url_id}: {e}")

In [13]:
import os
import re

def clean_article_text(text):
    # Remove email addresses
    text = re.sub(r'\S+@\S+\.\S+', '', text)
    
    # Remove phone numbers (various formats)
    text = re.sub(r'\+?[\d\s-]{10,}', '', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove common contact details sections
    contact_patterns = [
        r'Contact Details.*',
        r'Here are my contact details:.*',
        r'Firm Name:.*',
        r'Firm Website:.*', 
        r'Firm Address:.*',
        r'Email:.*',
        r'Skype:.*',
        r'WhatsApp:.*',
        r'Telegram:.*'
    ]
    
    for pattern in contact_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE|re.DOTALL)
    
    # Remove multiple newlines and extra whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

# Process all files in extracted_articles directory
articles_dir = 'extracted_articles'
for filename in os.listdir(articles_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(articles_dir, filename)
        
        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Clean the content
        cleaned_content = clean_article_text(content)
        
        # Write back the cleaned content
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_content)

print("Finished cleaning all articles")


Finished cleaning all articles


In [16]:
# Create output directory if it doesn't exist
output_dir = 'cleaned_articles'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Process all files in extracted_articles directory and save to cleaned_articles
articles_dir = 'extracted_articles'
for filename in os.listdir(articles_dir):
    if filename.endswith('.txt'):
        input_path = os.path.join(articles_dir, filename)
        output_path = os.path.join(output_dir, filename)
        
        # Read the file
        with open(input_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Clean the content
        cleaned_content = clean_article_text(content)
        
        # Write cleaned content to new location
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_content)

print(f"Finished cleaning all articles. Cleaned files saved in {output_dir}/")


Finished cleaning all articles. Cleaned files saved in cleaned_articles/


### Sentiment Analysis

1.1 Using Stop Words

In [20]:
# Define paths
cleaned_articles_dir = 'cleaned_articles'
stop_words_dir = 'StopWords'

# Load all stop words from the StopWords directory
stop_words = set()

# List of stop word files to process
stop_word_files = [
    'StopWords_Auditor.txt',
    'StopWords_DatesandNumbers.txt', 
    'StopWords_Generic.txt',
    'StopWords_GenericLong.txt',
    'StopWords_Geographic.txt',
    'StopWords_Names.txt'
]

# Load stop words from each file
for filename in stop_word_files:
    try:
        file_path = os.path.join(stop_words_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            # Add each word from the file to the stop words set
            # Strip whitespace and remove any text after | character
            for line in file:
                word = line.split('|')[0].strip()
                if word:  # Only add non-empty strings
                    stop_words.add(word.lower())  # Convert to lowercase for case-insensitive matching
    except FileNotFoundError:
        print(f"Warning: Stop words file {filename} not found")
        continue
    except UnicodeDecodeError:
        # Try with a different encoding if utf-8 fails
        try:
            with open(file_path, 'r', encoding='latin-1') as file:
                for line in file:
                    word = line.split('|')[0].strip()
                    if word:
                        stop_words.add(word.lower())
        except:
            print(f"Warning: Could not read {filename} with either utf-8 or latin-1 encoding")
            continue

print(f"Loaded {len(stop_words)} stop words")

def remove_stop_words(text):
    """Remove stop words from text"""
    # Split text into words
    words = text.split()
    # Remove stop words (case insensitive)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Rejoin words
    return ' '.join(filtered_words)

# Process all files in cleaned_articles directory to remove stop words
for filename in os.listdir(cleaned_articles_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(cleaned_articles_dir, filename)
        
        try:
            # Read the file
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            
            # Remove stop words
            filtered_content = remove_stop_words(content)
            
            # Write back the filtered content
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(filtered_content)
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

print("Finished removing stop words from all articles")


Loaded 12676 stop words
Finished removing stop words from all articles


In [21]:
# Create a directory for storing cleaned articles without stop words
cleaned_no_stopwords_dir = 'cleaned_articles_no_stopwords'
if not os.path.exists(cleaned_no_stopwords_dir):
    os.makedirs(cleaned_no_stopwords_dir)

# Copy cleaned articles to new directory and remove stop words
for filename in os.listdir(cleaned_articles_dir):
    if filename.endswith('.txt'):
        src_path = os.path.join(cleaned_articles_dir, filename)
        dst_path = os.path.join(cleaned_no_stopwords_dir, filename)
        
        try:
            # Read the source file
            with open(src_path, 'r', encoding='utf-8') as file:
                content = file.read()
            
            # Remove stop words
            filtered_content = remove_stop_words(content)
            
            # Write to destination file
            with open(dst_path, 'w', encoding='utf-8') as file:
                file.write(filtered_content)
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

print(f"Finished saving cleaned articles without stop words to {cleaned_no_stopwords_dir}")


Finished saving cleaned articles without stop words to cleaned_articles_no_stopwords


### 1.2 Master Dictionary

In [25]:
# # Read positive and negative words from master dictionary
# positive_words = set()
# negative_words = set()

# # Read positive words
# with open('MasterDictionary/positive-words.txt', 'r', encoding='utf-8') as file:
#     for line in file:
#         word = line.strip()
#         if word and not word.startswith(';'):  # Skip comments and empty lines
#             positive_words.add(word)

# # Read negative words            
# with open('MasterDictionary/negative-words.txt', 'r', encoding='utf-8') as file:
#     for line in file:
#         word = line.strip()
#         if word and not word.startswith(';'):  # Skip comments and empty lines
#             negative_words.add(word)

# # Remove any stop words from the sentiment dictionaries
# positive_words = positive_words - stop_words
# negative_words = negative_words - stop_words

# def calculate_sentiment_scores(text):
#     # Tokenize the text
    
    
#     tokens = word_tokenize(text.lower())
    
#     # Calculate scores
#     positive_score = sum(1 for word in tokens if word in positive_words)
#     negative_score = -1 * sum(1 for word in tokens if word in negative_words)
    
#     # Calculate polarity score
#     polarity_score = (positive_score - negative_score) / ((positive_score + abs(negative_score)) + 0.000001)
    
#     # Calculate subjectivity score
#     total_words = len(tokens)
#     subjectivity_score = (positive_score + abs(negative_score)) / (total_words + 0.000001)
    
#     return {
#         'positive_score': positive_score,
#         'negative_score': abs(negative_score),  # Return absolute value
#         'polarity_score': polarity_score,
#         'subjectivity_score': subjectivity_score
#     }

# print("Loaded sentiment dictionaries and created scoring functions")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xef in position 3988: invalid continuation byte

In [27]:
# Read positive and negative words from master dictionary
positive_words = set()
negative_words = set()

def read_dictionary_file(file_path):
    words = set()
    # Try different encodings
    encodings = ['utf-8', 'latin-1', 'utf-16', 'cp1252']
    
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                for line in file:
                    word = line.strip()
                    if word and not word.startswith(';'):  # Skip comments and empty lines
                        words.add(word)
            print(f"Successfully read {file_path} with {encoding} encoding")
            break  # If successful, exit the loop
        except UnicodeDecodeError:
            continue  # Try next encoding
        except Exception as e:
            print(f"Error reading {file_path}: {str(e)}")
            break
    return words

# Read dictionaries
positive_words = read_dictionary_file('MasterDictionary/positive-words.txt')
negative_words = read_dictionary_file('MasterDictionary/negative-words.txt')

print(f"Loaded {len(positive_words)} positive words and {len(negative_words)} negative words")

# Remove any stop words from the sentiment dictionaries
positive_words = positive_words - stop_words
negative_words = negative_words - stop_words

def calculate_sentiment_scores(text):
    """Calculate various sentiment scores for the given text"""
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Calculate scores
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = -1 * sum(1 for word in tokens if word in negative_words)
    
    # Calculate polarity score
    polarity_score = (positive_score - negative_score) / ((positive_score + abs(negative_score)) + 0.000001)
    
    # Calculate subjectivity score
    total_words = len(tokens)
    subjectivity_score = (positive_score + abs(negative_score)) / (total_words + 0.000001)
    
    return {
        'positive_score': positive_score,
        'negative_score': abs(negative_score),  # Return absolute value
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score
    }

# Process all files in cleaned_articles_no_stopwords directory
results = []
cleaned_no_stopwords_dir = 'cleaned_articles_no_stopwords'

for filename in os.listdir(cleaned_no_stopwords_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(cleaned_no_stopwords_dir, filename)
        url_id = filename.replace('.txt', '')
        
        try:
            # Read the file
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            
            # Calculate sentiment scores
            scores = calculate_sentiment_scores(content)
            
            # Add URL_ID to results
            scores['URL_ID'] = url_id
            results.append(scores)
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

# Convert results to DataFrame
results_df = pandas.DataFrame(results)

# Ensure URL_ID is the first column
cols = ['URL_ID'] + [col for col in results_df.columns if col != 'URL_ID']
results_df = results_df[cols]

# Save results to Excel
results_df.to_excel('sentiment_scores.xlsx', index=False)
print("Sentiment analysis complete. Results saved to sentiment_scores.xlsx")

Successfully read MasterDictionary/positive-words.txt with utf-8 encoding
Successfully read MasterDictionary/negative-words.txt with latin-1 encoding
Loaded 2006 positive words and 4783 negative words
Error processing Netclan20241017.txt: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/psyk/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

Error processing Netclan20241018.txt: 
*************************************************

KeyError: "None of [Index(['URL_ID'], dtype='object')] are in the [columns]"