In [5]:
import pandas
import numpy
import requests
from bs4 import BeautifulSoup
import os
from nltk import word_tokenize


In [6]:
data = pandas.read_excel(r'Input.xlsx')


In [10]:
data

Unnamed: 0,URL_ID,URL
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...
...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...


In [7]:
def scrape_articles(urls):
    articles = []
    for url in urls:
        try:
            response = requests.get(url)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find the title
            title = soup.find('h1', class_='entry-title')
            title_text = title.get_text().strip() if title else "No title found"
            
            # Find the main content
            content = soup.find('div', class_='td-main-content')
            content_text = content.get_text().strip() if content else "No content found"
            
            articles.append({
                'title': title_text,
                'content': content_text,
                'url': url
            })
            
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            articles.append({
                'title': "Error",
                'content': str(e),
                'url': url
            })
            
    return articles

# Update the scraping
urls = data['URL'].tolist()
articles = scrape_articles(urls)

# Save to file with better formatting
with open('scraped_articles.txt', 'w', encoding='utf-8') as file:
    for article in articles:
        file.write(f"\nURL: {article['url']}\n")
        file.write(f"TITLE: {article['title']}\n")
        file.write(f"CONTENT:\n{article['content']}\n")
        file.write("\n" + "="*80 + "\n")  # Separator between articles


In [10]:
# Create a directory to store individual article files
if not os.path.exists('extracted_articles'):
    os.makedirs('extracted_articles')

# Extract and save each article to individual files
for url_id, url in zip(data['URL_ID'], data['URL']):
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract title
        title = soup.find('h1', class_='entry-title')
        title_text = title.get_text().strip() if title else "No title found"
        
        # Extract main content
        content = soup.find('div', class_='td-main-content')
        content_text = content.get_text().strip() if content else "No content found"
        
        # Save to individual file
        filename = os.path.join('extracted_articles', f"{url_id}.txt")
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f"Title: {title_text}\n\n")
            file.write(content_text)
            
    except Exception as e:
        print(f"Error processing {url_id}: {e}")

In [13]:
import os
import re

def clean_article_text(text):
    # Remove email addresses
    text = re.sub(r'\S+@\S+\.\S+', '', text)
    
    # Remove phone numbers (various formats)
    text = re.sub(r'\+?[\d\s-]{10,}', '', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove common contact details sections
    contact_patterns = [
        r'Contact Details.*',
        r'Here are my contact details:.*',
        r'Firm Name:.*',
        r'Firm Website:.*', 
        r'Firm Address:.*',
        r'Email:.*',
        r'Skype:.*',
        r'WhatsApp:.*',
        r'Telegram:.*'
    ]
    
    for pattern in contact_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE|re.DOTALL)
    
    # Remove multiple newlines and extra whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

# Process all files in extracted_articles directory
articles_dir = 'extracted_articles'
for filename in os.listdir(articles_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(articles_dir, filename)
        
        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Clean the content
        cleaned_content = clean_article_text(content)
        
        # Write back the cleaned content
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_content)

print("Finished cleaning all articles")


Finished cleaning all articles


In [16]:
# Create output directory if it doesn't exist
output_dir = 'cleaned_articles'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Process all files in extracted_articles directory and save to cleaned_articles
articles_dir = 'extracted_articles'
for filename in os.listdir(articles_dir):
    if filename.endswith('.txt'):
        input_path = os.path.join(articles_dir, filename)
        output_path = os.path.join(output_dir, filename)
        
        # Read the file
        with open(input_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Clean the content
        cleaned_content = clean_article_text(content)
        
        # Write cleaned content to new location
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_content)

print(f"Finished cleaning all articles. Cleaned files saved in {output_dir}/")


Finished cleaning all articles. Cleaned files saved in cleaned_articles/


### Sentiment Analysis

1.1 Using Stop Words

In [20]:
# Define paths
cleaned_articles_dir = 'cleaned_articles'
stop_words_dir = 'StopWords'

# Load all stop words from the StopWords directory
stop_words = set()

# List of stop word files to process
stop_word_files = [
    'StopWords_Auditor.txt',
    'StopWords_DatesandNumbers.txt', 
    'StopWords_Generic.txt',
    'StopWords_GenericLong.txt',
    'StopWords_Geographic.txt',
    'StopWords_Names.txt'
]

# Load stop words from each file
for filename in stop_word_files:
    try:
        file_path = os.path.join(stop_words_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            # Add each word from the file to the stop words set
            # Strip whitespace and remove any text after | character
            for line in file:
                word = line.split('|')[0].strip()
                if word:  # Only add non-empty strings
                    stop_words.add(word.lower())  # Convert to lowercase for case-insensitive matching
    except FileNotFoundError:
        print(f"Warning: Stop words file {filename} not found")
        continue
    except UnicodeDecodeError:
        # Try with a different encoding if utf-8 fails
        try:
            with open(file_path, 'r', encoding='latin-1') as file:
                for line in file:
                    word = line.split('|')[0].strip()
                    if word:
                        stop_words.add(word.lower())
        except:
            print(f"Warning: Could not read {filename} with either utf-8 or latin-1 encoding")
            continue

print(f"Loaded {len(stop_words)} stop words")

def remove_stop_words(text):
    """Remove stop words from text"""
    # Split text into words
    words = text.split()
    # Remove stop words (case insensitive)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Rejoin words
    return ' '.join(filtered_words)

# Process all files in cleaned_articles directory to remove stop words
for filename in os.listdir(cleaned_articles_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(cleaned_articles_dir, filename)
        
        try:
            # Read the file
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            
            # Remove stop words
            filtered_content = remove_stop_words(content)
            
            # Write back the filtered content
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(filtered_content)
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

print("Finished removing stop words from all articles")


Loaded 12676 stop words
Finished removing stop words from all articles


In [21]:
# Create a directory for storing cleaned articles without stop words
cleaned_no_stopwords_dir = 'cleaned_articles_no_stopwords'
if not os.path.exists(cleaned_no_stopwords_dir):
    os.makedirs(cleaned_no_stopwords_dir)

# Copy cleaned articles to new directory and remove stop words
for filename in os.listdir(cleaned_articles_dir):
    if filename.endswith('.txt'):
        src_path = os.path.join(cleaned_articles_dir, filename)
        dst_path = os.path.join(cleaned_no_stopwords_dir, filename)
        
        try:
            # Read the source file
            with open(src_path, 'r', encoding='utf-8') as file:
                content = file.read()
            
            # Remove stop words
            filtered_content = remove_stop_words(content)
            
            # Write to destination file
            with open(dst_path, 'w', encoding='utf-8') as file:
                file.write(filtered_content)
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

print(f"Finished saving cleaned articles without stop words to {cleaned_no_stopwords_dir}")


Finished saving cleaned articles without stop words to cleaned_articles_no_stopwords


### 1.2 Master Dictionary

In [16]:
import re

def update_counters(file1_path, file2_path, file3_path):
    # Initialize counters
    file_1_count = 0
    file_2_count = 0

    try:
        # Read words from file1
        with open(file1_path, 'r', encoding='utf-8', errors='ignore') as file1:
            file1_words = set(file1.read().strip().splitlines())

        # Read words from file2
        with open(file2_path, 'r', encoding='utf-8', errors='ignore') as file2:
            file2_words = set(file2.read().strip().splitlines())

        # Read the paragraph from file3
        with open(file3_path, 'r', encoding='utf-8', errors='ignore') as file3:
            paragraph = file3.read()

        # Normalize the paragraph to handle case insensitivity and punctuations
        words_in_paragraph = re.findall(r'\b\w+\b', paragraph.lower())  # Extract words and make lowercase

        # Loop through words in the paragraph
        for word in words_in_paragraph:
            if word in file1_words:
                file_1_count += 1
            elif word in file2_words:
                file_2_count += 1

    except FileNotFoundError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return file_1_count, file_2_count

# Use the actual file paths
file1_path = 'MasterDictionary/positive-words.txt'
file2_path = 'MasterDictionary/negative-words.txt'
file3_path = 'cleaned_articles_no_stopwords.txt'

file_1_count, file_2_count = update_counters(file1_path, file2_path, file3_path)
print(f"Positive words count: {file_1_count}")
print(f"Negative words count: {file_2_count}")


No text files found in the folder: your_folder_path_here


TypeError: cannot unpack non-iterable NoneType object

In [17]:
import nltk
from nltk.tokenize import word_tokenize
import os

# Read positive and negative words from the master dictionary
def read_word_list(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip any empty lines or comments
        words = [line.strip() for line in file if line.strip() and not line.startswith(';')]
    return set(words)

# Read positive and negative word lists
positive_words = read_word_list('MasterDictionary/positive-words.txt')
negative_words = read_word_list('MasterDictionary/negative-words.txt')

def calculate_sentiment_scores(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text.lower())
    
    # Calculate scores
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)
    negative_score = negative_score * -1  # Convert to positive number
    
    # Calculate polarity score
    denominator = (positive_score + abs(negative_score)) + 0.000001
    polarity_score = (positive_score - abs(negative_score)) / denominator
    
    # Calculate subjectivity score
    total_words = len(tokens)
    subjectivity_score = (positive_score + abs(negative_score)) / (total_words + 0.000001)
    
    return {
        'Positive_Score': positive_score,
        'Negative_Score': abs(negative_score),  # Return absolute value
        'Polarity_Score': polarity_score,
        'Subjectivity_Score': subjectivity_score
    }

# Process all articles in the cleaned directory
sentiment_scores = {}
cleaned_dir = 'cleaned_articles_no_stopwords'

for filename in os.listdir(cleaned_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(cleaned_dir, filename)
        
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            
            # Calculate scores for this article
            scores = calculate_sentiment_scores(text)
            sentiment_scores[filename] = scores
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

print("Finished calculating sentiment scores for all articles")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xef in position 3988: invalid continuation byte

In [None]:
# Example to print scores for a specific article
article_name = 'article1.txt'
if article_name in sentiment_scores:
    scores = sentiment_scores[article_name]
    print(f"Scores for {article_name}:")
    print(f"Positive Score: {scores['Positive_Score']}")
    print(f"Negative Score: {scores['Negative_Score']}")
    print(f"Polarity Score: {scores['Polarity_Score']}")
    print(f"Subjectivity Score: {scores['Subjectivity_Score']}")

In [20]:
import nltk
from nltk.tokenize import word_tokenize
import os
nltk.download('punkt_tab')

# Read positive and negative words from the master dictionary
def read_word_list(file_path):
    # Try UTF-8 first
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            words = [line.strip() for line in file if line.strip() and not line.startswith(';')]
        return set(words)
    except UnicodeDecodeError:
        # If UTF-8 fails, try latin-1 encoding
        try:
            with open(file_path, 'r', encoding='latin-1') as file:
                words = [line.strip() for line in file if line.strip() and not line.startswith(';')]
            return set(words)
        except Exception as e:
            print(f"Error reading file {file_path}: {str(e)}")
            return set()

# Read positive and negative word lists
positive_words = read_word_list('MasterDictionary/positive-words.txt')
negative_words = read_word_list('MasterDictionary/negative-words.txt')

def calculate_sentiment_scores(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text.lower())
    
    # Calculate scores
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)
    negative_score = negative_score * -1  # Convert to positive number
    
    # Calculate polarity score
    denominator = (positive_score + abs(negative_score)) + 0.000001
    polarity_score = (positive_score - abs(negative_score)) / denominator
    
    # Calculate subjectivity score
    total_words = len(tokens)
    subjectivity_score = (positive_score + abs(negative_score)) / (total_words + 0.000001)
    
    return {
        'Positive_Score': positive_score,
        'Negative_Score': abs(negative_score),  # Return absolute value
        'Polarity_Score': polarity_score,
        'Subjectivity_Score': subjectivity_score
    }

# Process all articles in the cleaned directory
sentiment_scores = {}
cleaned_dir = 'cleaned_articles_no_stopwords'

for filename in os.listdir(cleaned_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(cleaned_dir, filename)
        
        try:
            # Try reading with UTF-8 first
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
            except UnicodeDecodeError:
                # If UTF-8 fails, try latin-1
                with open(file_path, 'r', encoding='latin-1') as file:
                    text = file.read()
            
            # Calculate scores for this article
            scores = calculate_sentiment_scores(text)
            sentiment_scores[filename] = scores
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

print("Finished calculating sentiment scores for all articles")

# Print example scores


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jayantkrishnasingh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Finished calculating sentiment scores for all articles

Scores for Netclan20241024.txt:
Positive Score: 10
Negative Score: 3
Polarity Score: 0.538
Subjectivity Score: 0.045

Scores for Netclan20241030.txt:
Positive Score: 13
Negative Score: 6
Polarity Score: 0.368
Subjectivity Score: 0.059

Scores for Netclan20241018.txt:
Positive Score: 8
Negative Score: 7
Polarity Score: 0.067
Subjectivity Score: 0.032

Scores for Netclan20241150.txt:
Positive Score: 4
Negative Score: 4
Polarity Score: 0.000
Subjectivity Score: 0.041

Scores for Netclan20241144.txt:
Positive Score: 7
Negative Score: 3
Polarity Score: 0.400
Subjectivity Score: 0.050


In [24]:
# Create a DataFrame from sentiment scores
scores_data = []
for filename, scores in sentiment_scores.items():
    # Extract URL_ID from filename by removing .txt extension
    url_id = filename.replace('.txt', '')
    
    scores_data.append({
        'URL_ID': url_id,
        'Positive_Score': scores['Positive_Score'],
        'Negative_Score': scores['Negative_Score'], 
        'Polarity_Score': scores['Polarity_Score'],
        'Subjectivity_Score': scores['Subjectivity_Score']
    })

# Convert to DataFrame
scores_df = pd.DataFrame(scores_data)

# Save to CSV file
output_file = 'sentiment_scores.csv'
scores_df.to_csv(output_file, index=False)

print(f"Sentiment scores saved to {output_file}")


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc4 in position 10: invalid continuation byte

In [25]:
# Read the URLs mapping file into a DataFrame
urls_df = pd.read_excel('Input.xlsx')

# Merge the sentiment scores with URLs based on URL_ID
merged_df = pd.merge(scores_df, urls_df[['URL_ID', 'URL']], on='URL_ID', how='left')

# Save the merged DataFrame back to CSV
merged_df.to_csv('sentiment_scores.csv', index=False)

print("Added URLs to sentiment scores CSV file")


Added URLs to sentiment scores CSV file


In [21]:
for filename, scores in list(sentiment_scores.items())[:50]:  # Print first 5 articles' scores
    print(f"\nScores for {filename}:")
    print(f"Positive Score: {scores['Positive_Score']}")
    print(f"Negative Score: {scores['Negative_Score']}")
    print(f"Polarity Score: {scores['Polarity_Score']:.3f}")
    print(f"Subjectivity Score: {scores['Subjectivity_Score']:.3f}")


Scores for Netclan20241024.txt:
Positive Score: 10
Negative Score: 3
Polarity Score: 0.538
Subjectivity Score: 0.045

Scores for Netclan20241030.txt:
Positive Score: 13
Negative Score: 6
Polarity Score: 0.368
Subjectivity Score: 0.059

Scores for Netclan20241018.txt:
Positive Score: 8
Negative Score: 7
Polarity Score: 0.067
Subjectivity Score: 0.032

Scores for Netclan20241150.txt:
Positive Score: 4
Negative Score: 4
Polarity Score: 0.000
Subjectivity Score: 0.041

Scores for Netclan20241144.txt:
Positive Score: 7
Negative Score: 3
Polarity Score: 0.400
Subjectivity Score: 0.050

Scores for Netclan20241145.txt:
Positive Score: 14
Negative Score: 2
Polarity Score: 0.750
Subjectivity Score: 0.054

Scores for Netclan20241151.txt:
Positive Score: 4
Negative Score: 1
Polarity Score: 0.600
Subjectivity Score: 0.030

Scores for Netclan20241019.txt:
Positive Score: 10
Negative Score: 3
Polarity Score: 0.538
Subjectivity Score: 0.045

Scores for Netclan20241031.txt:
Positive Score: 56
Negative