In [None]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import os

In [None]:
# Download the NLTK resources this notebook relies on:
#   punkt / punkt_tab -> tokenizer models used by word_tokenize
#                        (newer NLTK releases load 'punkt_tab' instead of 'punkt')
#   stopwords         -> English stop-word list
#   wordnet / omw-1.4 -> data used by WordNetLemmatizer
#                        (some NLTK releases need 'omw-1.4' alongside 'wordnet')
# Requesting all of them keeps a fresh-kernel run working across NLTK versions;
# resources that are already present are reported as up-to-date and skipped.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load a single scraped-article CSV into `df` (Colab path; adjust for your environment).
# NOTE(review): later cells read the article text from a 'Paragraph' column — confirm this file has it.
df = pd.read_csv('/content/scraped_article 1.csv')

In [None]:
# Text-cleaning helpers.
# The patterns are compiled once here so they are not rebuilt for every row
# that clean_text is applied to.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
# Any run of characters that is not a letter: punctuation, digits,
# underscores and whitespace.
_NON_LETTER_PATTERN = re.compile(r'[\W\d_]+')

# Lazily-filled cache for the NLTK objects (see _get_nlp_resources).
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on first use.

    Loading the stop-word list reads from disk, so it is done once and reused
    instead of on every call to ``clean_text``.
    """
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def _is_single_letter(token):
    """Return True for a lone lowercase ASCII letter such as 'a' or 'u'."""
    return len(token) == 1 and 'a' <= token <= 'z'


def clean_text(text):
    """Normalise one piece of raw article text for downstream NLP.

    Steps: lowercase, strip URLs, replace every non-letter character
    (punctuation, digits, underscores) with a space, tokenize, drop English
    stop words and single-letter tokens, lemmatize, and re-join with spaces.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for missing cells) is treated as empty.

    Returns:
        str: The cleaned, space-separated tokens; ``''`` when the input is
        not a string or nothing remains after cleaning.
    """
    if not isinstance(text, str):
        return ''

    text = _URL_PATTERN.sub('', text.lower())

    # Replace with a space rather than deleting, so the words on either side
    # of a hyphen, dash or slash are not fused together ("well-known" ->
    # "well known", not "wellknown"). It also splits contractions
    # ("don't" -> "don t") into pieces the stop-word filter can remove.
    text = _NON_LETTER_PATTERN.sub(' ', text).strip()
    if not text:
        # Nothing but URLs / punctuation / digits: no need to tokenize.
        return ''

    stop_words, lemmatizer = _get_nlp_resources()

    # Filtering single letters per token (instead of with a whitespace-anchored
    # regex) also catches them at the start/end of the text and when several
    # appear in a row.
    tokens = [
        token for token in word_tokenize(text)
        if token not in stop_words and not _is_single_letter(token)
    ]

    # Filter once more after lemmatizing: WordNetLemmatizer can shorten a word
    # to a single letter (e.g. "us" -> "u").
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    return ' '.join(lemma for lemma in lemmas if not _is_single_letter(lemma))

In [None]:
# Source folder holding the scraped CSVs, and destination folder for their cleaned copies
input_directory, output_directory = '/content/articles', '/content/cleaned articles'


In [None]:
# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory already exists, so the cell can be re-run.
os.makedirs(output_directory, exist_ok=True)

In [None]:
# Clean every CSV in input_directory and write the result to output_directory
# as 'cleaned_<original name>'.
# sorted() gives a stable processing order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.csv'):
        continue

    filepath = os.path.join(input_directory, filename)

    # Loop-local name on purpose: reusing `df` here would overwrite the frame
    # loaded earlier in the notebook with whichever file happened to come last.
    article_df = pd.read_csv(filepath)

    # The article text is expected in the 'Paragraph' column; files without it
    # are reported and skipped instead of raising KeyError.
    if 'Paragraph' not in article_df.columns:
        print(f"Column 'Paragraph' not found in {filename}")
        continue

    article_df['cleaned_text'] = article_df['Paragraph'].apply(clean_text)

    # Save the cleaned data next to the other outputs, without the index column.
    output_filepath = os.path.join(output_directory, f'cleaned_{filename}')
    article_df.to_csv(output_filepath, index=False)
    print(f"Processed and saved: {output_filepath}")

Processed and saved: /content/cleaned articles/cleaned_scraped_article 2.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 13.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 20.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 17.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 15.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 11.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 1.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 19.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 8.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 3.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 4.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article 7.csv
Processed and saved: /content/cleaned articles/cleaned_scraped_article

In [None]:
# Add a 'cleaned_text' column to df by running clean_text over each 'Paragraph' entry
paragraphs = df['Paragraph']
df['cleaned_text'] = paragraphs.apply(clean_text)

# Print the first rows of the raw and cleaned columns side by side
preview = df[['Paragraph', 'cleaned_text']].head()
print(preview)

# Write the full frame to disk, without the index column
df.to_csv('cleaned_data.csv', index=False)

                                           Paragraph  \
0                       To enjoy additional benefits   
1                                    CONNECT WITH US   
2  \n\t\t\t\t\tCopyright© 2024, THG PUBLISHING PV...   
3                                        BACK TO TOP   
4    Terms & conditions  |  Institutional Subscriber   

                                        cleaned_text  
0                           enjoy additional benefit  
1                                          connect u  
2  copyright thg publishing pvt ltd affiliated co...  
3                                           back top  
4            term condition institutional subscriber  


In [None]:
# Folder to read the cleaned CSVs from, and the file name for the combined output
input_directory, output_file = '/content/cleaned articles', 'combined_cleaned_data.csv'

In [None]:
# Start with no frames collected; one DataFrame per cleaned file is added later
data_frames = list()


In [None]:
# Read every 'cleaned_*.csv' file in input_directory into a list of DataFrames.
# The list is rebuilt from scratch here, so re-running this cell does not
# append the same files a second time, and sorted() fixes the row order of the
# combined output (os.listdir order is arbitrary).
data_frames = [
    pd.read_csv(os.path.join(input_directory, filename))
    for filename in sorted(os.listdir(input_directory))
    if filename.startswith('cleaned_') and filename.endswith('.csv')
]


In [None]:
# Stack all per-file frames into one. ignore_index=True discards each file's
# own row index and renumbers the combined rows from 0.
# Note: pd.concat raises ValueError when data_frames is empty.
combined_df = pd.concat(data_frames, ignore_index=True)

In [None]:
# Write the combined frame to output_file (index column omitted), then report the file name
combined_df.to_csv(output_file, index=False)
print(f"All cleaned files have been combined and saved as {output_file}")

All cleaned files have been combined and saved as combined_cleaned_data.csv
