<a href="https://colab.research.google.com/github/bunny-2425/Face-Recognization-based-attendence-system-Project/blob/main/Extraction_and_Text_Anlaysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

In [3]:
%%time

# Function to extract the article title and text from a URLs given in Input.xlxs file and extracting each articles into txt file

def extract_article_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and remove unwanted elements (e.g., header, footer, etc.)
        for element in soup(["header", "footer"]):
            element.decompose()

        # Extract article title and text
        article_title = soup.find('title').text.strip()
        article_text = ""

        # Extract text from <div class="td-post-content tagdiv-type">
        article_div = soup.find('div', class_='td-post-content tagdiv-type')
        if article_div:
            article_text = article_div.get_text()
        return article_title, article_text

    except Exception:
        print(f"Error while extracting article from {url}: {Exception}")
        return None, None

# Function to save the article title and text to a text file

def save_article_to_file(url_id, article_title, article_text):
    if not os.path.exists("articles"):
        os.mkdir("articles")

    with open(f"articles/{url_id}.txt", "w", encoding="utf-8") as file:
        file.write(f"Title: {article_title}\n\n")
        file.write(article_text)

def main():
    input_file = "/content/Input.xlsx"
    df = pd.read_excel(input_file)

    for index, row in df.iterrows():
        url_id = row["URL_ID"]
        url = row["URL"]

        # Extract article title and text
        article_title, article_text = extract_article_text(url)

        # Check if extraction was successful
        if article_title and article_text:
            save_article_to_file(url_id, article_title, article_text)
            print(f"Article {url_id} extracted and saved successfully.")
        else:
            print(f"Failed to extract article {url_id}.")

if __name__ == "__main__":
    main()


Article bctech2011 extracted and saved successfully.
Article bctech2012 extracted and saved successfully.
Article bctech2013 extracted and saved successfully.
Article bctech2014 extracted and saved successfully.
Article bctech2015 extracted and saved successfully.
Article bctech2016 extracted and saved successfully.
Article bctech2017 extracted and saved successfully.
Article bctech2018 extracted and saved successfully.
Article bctech2019 extracted and saved successfully.
Article bctech2020 extracted and saved successfully.
Article bctech2021 extracted and saved successfully.
Article bctech2022 extracted and saved successfully.
Article bctech2023 extracted and saved successfully.
Article bctech2024 extracted and saved successfully.
Article bctech2025 extracted and saved successfully.
Article bctech2026 extracted and saved successfully.
Article bctech2027 extracted and saved successfully.
Article bctech2028 extracted and saved successfully.
Article bctech2029 extracted and saved success

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Load NLTK resources (you might need to download NLTK resources)
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
import pandas as pd
import os
import chardet
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Function to load positive and negative dictionaries from files
def load_dictionaries(positive_dict_file, negative_dict_file):
    # Detect encoding for positive dictionary file
    with open(positive_dict_file, 'rb') as file:
        result = chardet.detect(file.read())
        positive_encoding = result['encoding']

    # Detect encoding for negative dictionary file
    with open(negative_dict_file, 'rb') as file:
        result = chardet.detect(file.read())
        negative_encoding = result['encoding']

    # Read files with detected encoding
    with open(positive_dict_file, 'r', encoding=positive_encoding) as file:
        positive_words = set(file.read().splitlines())
    with open(negative_dict_file, 'r', encoding=negative_encoding) as file:
        negative_words = set(file.read().splitlines())

    return positive_words, negative_words

# Function to perform sentiment analysis and calculate scores
def calculate_sentiment_scores(text, positive_words, negative_words):
    sia = SentimentIntensityAnalyzer()
    tokens = word_tokenize(text)

    positive_score = 0
    negative_score = 0

    for word in tokens:
        # Remove punctuation and convert to lowercase
        word = word.lower()
        if word.isalpha():
            # Check if the word is in the positive dictionary
            if word in positive_words:
                positive_score += 1
            # Check if the word is in the negative dictionary
            if word in negative_words:
                negative_score += 1

    # Calculate sentiment analysis metrics
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 0.000001)

    return positive_score, negative_score, polarity_score, subjectivity_score

def main():
    input_data_file = "/content/Output Data Structure.xlsx"
    positive_dict_file = "/content/positive-words.txt"
    negative_dict_file = "/content/negative-words.txt"
    articles_dir = "articles"

    # Load dictionaries
    positive_words, negative_words = load_dictionaries(positive_dict_file, negative_dict_file)

    # Read output data structure Excel file
    output_data = pd.read_excel(input_data_file)

    results = []
    for index, row in output_data.iterrows():
        url_id = row["URL_ID"]
        url = row["URL"]
        article_file = os.path.join(articles_dir, f"{url_id}.txt")

        if os.path.exists(article_file):
            # Read article text from file
            with open(article_file, 'r', encoding='utf-8') as article:
                article_text = article.read()

            # Perform sentiment analysis
            positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment_scores(article_text, positive_words, negative_words)

            results.append({
                "URL_ID": url_id,
                "URL": url,
                "Positive_Score": positive_score,
                "Negative_Score": negative_score,
                "Polarity_Score": polarity_score,
                "Subjectivity_Score": subjectivity_score
            })

    # Create DataFrame from results
    result_df = pd.DataFrame(results)

    # Save results to Excel
    result_df.to_excel("sentiment_analysis_results.xlsx", index=False)

if __name__ == "__main__":
    main()


In [8]:
sentiment_analysis = pd.read_excel("sentiment_analysis_results.xlsx")

In [9]:
sentiment_analysis

Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,138,45,0.508197,0.059667
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,19,6,0.520000,0.043630
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,19,10,0.310345,0.041076
3,bctech2014,https://insights.blackcoffer.com/effective-man...,13,6,0.368421,0.034545
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,17,3,0.700000,0.028986
...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,14,10,0.166667,0.023506
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,22,20,0.047619,0.026958
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,10,12,-0.090909,0.048458
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,0,0,0.000000,0.000000


In [10]:
import re
from nltk.tokenize import sent_tokenize

In [11]:
# Function to calculate average sentence length
def calculate_avg_sentence_length(sentences):
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    total_sentences = len(sentences)
    return total_words / total_sentences

# Function to calculate percentage of complex words
def calculate_percentage_complex_words(text):
    words = word_tokenize(text)
    complex_words = [word for word in words if len(word) > 2]
    return len(complex_words) / len(words)

# Function to calculate fog index
def calculate_fog_index(avg_sentence_length, percentage_complex_words):
    return 0.4 * (avg_sentence_length + percentage_complex_words)

# Function to calculate average number of words per sentence
def calculate_avg_words_per_sentence(words, sentences):
    return len(words) / len(sentences)

# Function to calculate complex word count
def calculate_complex_word_count(text):
    words = word_tokenize(text)
    complex_words = [word for word in words if len(word) > 2]
    return len(complex_words)

# Function to calculate word count
def calculate_word_count(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    cleaned_words = [word for word in words if word not in stop_words and word.isalpha()]
    return len(cleaned_words)

# Function to count syllables in a word
def count_syllables(word):
    vowels = "aeiouAEIOU"
    count = 0
    if word[-1] in ['e', 'E'] and word[-2:] != 'le' and word[-2:] != 'LE':
        word = word[:-1]
    for index, letter in enumerate(word):
        if index == 0 and letter in vowels:
            count += 1
        elif letter in vowels and word[index-1] not in vowels:
            count += 1
    return count

# Function to calculate syllable count per word
def calculate_syllable_count_per_word(text):
    words = word_tokenize(text)
    syllable_count = sum(count_syllables(word) for word in words)
    return syllable_count / len(words)

# Function to calculate personal pronoun count
def calculate_personal_pronouns(text):
    pronouns = ["I", "we", "my", "ours", "us"]
    pattern = r'\b(?:' + '|'.join(pronouns) + r')\b'
    matches = re.findall(pattern, text)
    return len(matches)

# Function to calculate average word length
def calculate_avg_word_length(text):
    words = word_tokenize(text)
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words)

def main():
    output_data_file = "Output Data Structure.xlsx"
    articles_dir = "articles"

    # Read output data structure Excel file
    output_data = pd.read_excel(output_data_file)

    results_ = []
    for index, row in output_data.iterrows():
        url_id = row["URL_ID"]
        article_file = os.path.join(articles_dir, f"{url_id}.txt")

        if os.path.exists(article_file):
            # Read article text from file
            with open(article_file, 'r', encoding='utf-8') as article:
                article_text = article.read()

            # Tokenize sentences for text analysis
            sentences = sent_tokenize(article_text)
            words = word_tokenize(article_text)

            # Calculate text analysis metrics
            avg_sentence_length = calculate_avg_sentence_length(sentences)
            percentage_complex_words = calculate_percentage_complex_words(article_text)
            fog_index = calculate_fog_index(avg_sentence_length, percentage_complex_words)
            avg_words_per_sentence = calculate_avg_words_per_sentence(words, sentences)
            complex_word_count = calculate_complex_word_count(article_text)
            word_count = calculate_word_count(article_text)
            syllable_count_per_word = calculate_syllable_count_per_word(article_text)
            personal_pronoun_count = calculate_personal_pronouns(article_text)
            avg_word_length = calculate_avg_word_length(article_text)

            results_.append({
                "URL_ID": url_id,
                "Avg_Sentence_Length": avg_sentence_length,
                "Percentage_Complex_Words": percentage_complex_words,
                "Fog_Index": fog_index,
                "Avg_Words_Per_Sentence": avg_words_per_sentence,
                "Complex_Word_Count": complex_word_count,
                "Word_Count": word_count,
                "Syllable_Count_Per_Word": syllable_count_per_word,
                "Personal_Pronoun_Count": personal_pronoun_count,
                "Avg_Word_Length": avg_word_length
            })

    # Create DataFrame from results
    result_df2 = pd.DataFrame(results_)

    # Save results to Excel
    result_df2.to_excel("text_analysis_results.xlsx", index=False)

if __name__ == "__main__":
    main()


In [12]:
text_analysis = pd.read_excel("text_analysis_results.xlsx")
text_analysis

Unnamed: 0,URL_ID,Avg_Sentence_Length,Percentage_Complex_Words,Fog_Index,Avg_Words_Per_Sentence,Complex_Word_Count,Word_Count,Syllable_Count_Per_Word,Personal_Pronoun_Count,Avg_Word_Length
0,bctech2011,17.327684,0.740789,7.227389,17.327684,2272,1818,1.742093,2,5.570590
1,bctech2012,11.019231,0.806283,4.730205,11.019231,462,365,1.841187,1,6.057592
2,bctech2013,20.171429,0.784703,8.382452,20.171429,554,451,1.742210,1,5.628895
3,bctech2014,10.377358,0.812727,4.476034,10.377358,447,366,1.818182,1,5.869091
4,bctech2015,23.793103,0.715942,9.803618,23.793103,494,401,1.621739,1,5.269565
...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,29.171429,0.767875,11.975721,29.171429,784,600,1.576885,2,4.954946
143,bctech2154,23.969231,0.748395,9.887050,23.969231,1166,911,1.530809,4,4.836970
144,bctech2155,32.428571,0.724670,13.261296,32.428571,329,247,1.394273,13,4.707048
145,bctech2156,158.000000,0.721519,63.488608,158.000000,114,111,1.512658,0,4.987342


In [13]:
merged_df = pd.merge(sentiment_analysis, text_analysis, on='URL_ID')
merged_df

Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score,Avg_Sentence_Length,Percentage_Complex_Words,Fog_Index,Avg_Words_Per_Sentence,Complex_Word_Count,Word_Count,Syllable_Count_Per_Word,Personal_Pronoun_Count,Avg_Word_Length
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,138,45,0.508197,0.059667,17.327684,0.740789,7.227389,17.327684,2272,1818,1.742093,2,5.570590
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,19,6,0.520000,0.043630,11.019231,0.806283,4.730205,11.019231,462,365,1.841187,1,6.057592
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,19,10,0.310345,0.041076,20.171429,0.784703,8.382452,20.171429,554,451,1.742210,1,5.628895
3,bctech2014,https://insights.blackcoffer.com/effective-man...,13,6,0.368421,0.034545,10.377358,0.812727,4.476034,10.377358,447,366,1.818182,1,5.869091
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,17,3,0.700000,0.028986,23.793103,0.715942,9.803618,23.793103,494,401,1.621739,1,5.269565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,14,10,0.166667,0.023506,29.171429,0.767875,11.975721,29.171429,784,600,1.576885,2,4.954946
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,22,20,0.047619,0.026958,23.969231,0.748395,9.887050,23.969231,1166,911,1.530809,4,4.836970
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,10,12,-0.090909,0.048458,32.428571,0.724670,13.261296,32.428571,329,247,1.394273,13,4.707048
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,0,0,0.000000,0.000000,158.000000,0.721519,63.488608,158.000000,114,111,1.512658,0,4.987342


In [14]:
merged_df.to_excel("OutputDataStructure.xlsx")

In [None]:
from google.colab import drive
drive.mount('/content/drive')