# **DATA EXTRACTION & TEXT ANALYSIS**

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import os
os.chdir("/kaggle/input/data-extraction-and-text-analysis")

In [3]:
# Get the current working directory
current_directory = os.getcwd()

# Construct the file path
file_path = os.path.join(current_directory, 'IndiaTVNews.xlsx')

# Load the input data from Excel
df = pd.read_excel(file_path)
df

Unnamed: 0,article_no,article_link
0,0,https://www.indiatvnews.com/technology/news/mo...
1,1,https://www.indiatvnews.com/technology/news/op...
2,2,https://www.indiatvnews.com/technology/news/on...
3,3,https://www.indiatvnews.com/technology/news/io...
4,4,https://www.indiatvnews.com/technology/news/mi...
...,...,...
1291,1291,https://www.indiatvnews.com/sports/cricket/aus...
1292,1292,https://www.indiatvnews.com/news/india/north-i...
1293,1293,https://www.indiatvnews.com/video/news/indian-...
1294,1294,https://www.indiatvnews.com/news/india/isro-xp...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_no    1296 non-null   int64 
 1   article_link  1296 non-null   object
dtypes: int64(1), object(1)
memory usage: 20.4+ KB


In [5]:
# Create a folder to save text files in the writable directory
output_folder = '/kaggle/working/extracted_texts'
os.makedirs(output_folder, exist_ok=True) 

**DATA EXTRACTION**

In [6]:
# Function to extract text from a given URL
def extract_text_from_article(article_link):
    try:
        # Send a request to the URL
        response = requests.get(article_link)
        response.raise_for_status()

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the article title and text
        title = soup.title.text if soup.title else 'Untitled'
        article_text = ' '.join([p.text for p in soup.find_all('p')])

        return title, article_text
    except Exception as e:
        print(f"Error extracting data from {article_link}: {e}")
        return None, None

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    article_no = row['article_no']
    article_link = row['article_link']

    # Extract text from the URL
    title, article_text = extract_text_from_article(article_link)

    if title and article_text:
        # Save the extracted text to a text file
        output_file = os.path.join(output_folder, f'{article_no}.txt')
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(f'Title: {title}\n\n')
            file.write(f'{article_text}\n')
print("Extraction completed.")

Extraction completed.


In [8]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
# Create an empty DataFrame to store results
output_df = pd.DataFrame(columns=['Unnamed: 0', 'article_link', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                                   'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
                                   'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
                                   'PERSONAL PRONOUNS', 'AVG WORD LENGTH'])

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Text Analysis**

In [9]:
# Function to calculate text analysis variables
def calculate_text_analysis(text):
    # Cleaning using Stop Words Lists
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    cleaned_words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]

    # Creating a dictionary of Positive and Negative words
    positive_words = set(["positive", "words", "list"])  # Replace with actual positive words
    negative_words = set(["negative", "words", "list"])  # Replace with actual negative words

    # Extracting Derived variables
    positive_score = sum(1 for word in cleaned_words if word in positive_words)
    negative_score = sum(1 for word in cleaned_words if word in negative_words)
    
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_words) + 0.000001)

    # Analysis of Readability
    sentences = sent_tokenize(text)
    words_per_sentence = sum(len(word_tokenize(sentence)) for sentence in sentences)
    average_sentence_length = words_per_sentence / len(sentences)

    complex_word_count = sum(1 for word in cleaned_words if len(re.findall(r'\b\w{3,}\b', word)) > 0)
    percentage_of_complex_words = complex_word_count / len(cleaned_words)

    fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)

    average_number_of_words_per_sentence = len(cleaned_words) / len(sentences)

    # Personal Pronouns
    personal_pronouns_count = len(re.findall(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE))

    # Average Word Length
    average_word_length = sum(len(word) for word in cleaned_words) / len(cleaned_words)

    return positive_score, negative_score, polarity_score, subjectivity_score, \
           average_sentence_length, percentage_of_complex_words, fog_index, \
           average_number_of_words_per_sentence, complex_word_count, len(cleaned_words), \
           personal_pronouns_count, average_word_length

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    article_no = row['article_no']
    article_link = row['article_link']

    # Extract text from the URL
    title, article_text = extract_text_from_article(article_link)

    if article_text:
        # Calculate text analysis variables
        text_analysis_results = calculate_text_analysis(article_text)

        # Append to the output DataFrame
        df_output = pd.DataFrame({'article_no': [article_no], 'article_link': [article_link], **dict(zip(output_df.columns[2:], text_analysis_results))})

        output_df = pd.concat([output_df, df_output], ignore_index=True)

print("Text analysis completed.")

output_file = '/kaggle/working/Text_Analysis.xlsx'
output_df.to_excel(output_file, index=False)

print(f"Text analysis results saved to {output_file}")

  output_df = pd.concat([output_df, df_output], ignore_index=True)


Text analysis completed.
Text analysis results saved to /kaggle/working/Text_Analysis.xlsx


In [14]:
saved_df = pd.read_excel(output_file)
saved_df

Unnamed: 0.1,Unnamed: 0,article_link,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,PERSONAL PRONOUNS,AVG WORD LENGTH,article_no
0,,https://www.indiatvnews.com/technology/news/mo...,0,0,0.0,0.0,39.027027,0.975381,16.000963,23.054054,832,853,1,6.090270,0
1,,https://www.indiatvnews.com/technology/news/op...,0,0,0.0,0.0,46.333333,0.980606,18.925576,27.500000,809,825,1,6.052121,1
2,,https://www.indiatvnews.com/technology/news/on...,0,0,0.0,0.0,47.066667,0.978923,19.218236,28.466667,836,854,1,6.203747,2
3,,https://www.indiatvnews.com/technology/news/io...,0,0,0.0,0.0,42.285714,0.982857,17.307429,25.000000,860,875,1,6.181714,3
4,,https://www.indiatvnews.com/technology/news/mi...,0,0,0.0,0.0,41.375000,0.983750,16.943500,25.000000,787,800,1,6.236250,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,,https://www.indiatvnews.com/sports/cricket/aus...,0,0,0.0,0.0,43.441176,0.983393,17.769828,24.794118,829,843,9,5.900356,1291
1292,,https://www.indiatvnews.com/news/india/north-i...,0,0,0.0,0.0,50.565217,0.981818,20.618814,31.086957,702,715,1,6.006993,1292
1293,,https://www.indiatvnews.com/video/news/indian-...,0,0,0.0,0.0,58.888889,0.977273,23.946465,36.666667,645,660,1,5.924242,1293
1294,,https://www.indiatvnews.com/news/india/isro-xp...,0,0,0.0,0.0,41.977273,0.980374,17.183059,24.318182,1049,1070,4,6.330841,1294
