## Preprocessing the Scraped Data

### Import all the Necessary Libraries

In [1]:
# pandas handles the tabular data, nltk the text normalisation, re the character filtering.
import pandas as pd
import nltk
import re
# Fetch the NLTK resources used below. As the log output shows, packages that are
# already up to date are not downloaded again.
nltk.download('punkt')      # tokenizer models; presumably what word_tokenize needs (newer NLTK releases may ask for 'punkt_tab' instead; verify)
nltk.download('stopwords')  # stop-word lists, read later via stopwords.words('english')
nltk.download('wordnet')    # WordNet data; presumably what WordNetLemmatizer needs
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Loading the Topic DataFrames

In [2]:
# Load each topic's scraped CSV into its own DataFrame (one file per topic).
# NOTE(review): none of these ten frames is referenced again in the visible cells;
# the preprocessing loop further down re-reads the same files from disk.
Health = pd.read_csv("./Data/Health.csv")
Environment = pd.read_csv("./Data/Environment.csv")
Technology = pd.read_csv("./Data/Technology.csv")
Economy = pd.read_csv("./Data/Economy.csv")
Entertainment = pd.read_csv("./Data/Entertainment.csv")
Sports = pd.read_csv("./Data/Sports.csv")
Politics = pd.read_csv("./Data/Politics.csv")
Education = pd.read_csv("./Data/Education.csv")
Travel = pd.read_csv("./Data/Travel.csv")
Food = pd.read_csv("./Data/Food.csv")


### Initializing the Preprocessing parameters

In [3]:

# Initialize stemmer, lemmatizer and stop words.
# These module-level objects are shared by every call to preprocess(), so they are
# built once here rather than per document.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Stored as a set because preprocess() tests membership for every token.
stop_words = set(stopwords.words('english'))


### Preprocess the Data

In [4]:
# Define a preprocessing function
def preprocess(text):
    """Normalise one summary into a space-joined string of processed tokens.

    Steps: lowercase the text, replace every character outside a-z/A-Z with a
    space, tokenize with ``word_tokenize``, drop English stop words, then
    Porter-stem and WordNet-lemmatize each remaining token.

    Parameters
    ----------
    text : str or missing value
        Raw summary text. Anything that is not a ``str`` (for example ``None``
        or the float ``NaN`` pandas uses for an empty CSV field) is treated as
        a missing summary.

    Returns
    -------
    str
        The processed tokens joined by single spaces, or ``''`` when the input
        is missing or no token survives the filtering.
    """
    # Guard: Series.apply passes NaN (a float) for empty cells and float has
    # no .lower(), so return early instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Tokenization (digits and punctuation are blanked out first)
    letters_only = re.sub(r"[^a-zA-Z]", " ", text.lower())
    tokens = word_tokenize(letters_only)

    # Stemming, Lemmatization and Stopwords removal.
    # Stop words are matched against the raw token, i.e. before stemming.
    text_processed = [
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in stop_words
    ]

    # Combine everything into the processed text
    return ' '.join(text_processed)


In [5]:
# Assign the location of the Data that needs to be preprocessed.
Data_location = ['./Data/Health.csv', './Data/Environment.csv', './Data/Technology.csv',
                 './Data/Economy.csv', './Data/Entertainment.csv', './Data/Sports.csv',
                 './Data/Politics.csv', './Data/Education.csv', './Data/Travel.csv', './Data/Food.csv']

# Every CSV file holds one topic. Load each file, preprocess its Summary column
# (all other columns are carried over unchanged) and collect the resulting frames.
topic_frames = []
for csv_path in Data_location:
    topic_frame = pd.read_csv(csv_path)
    topic_frame['Summary'] = topic_frame['Summary'].apply(preprocess)
    topic_frames.append(topic_frame)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all rows
# gathered so far on every iteration, and seeding it with an empty DataFrame is
# deprecated behaviour in recent pandas. An empty path list still yields an empty frame.
preprocessed_text = pd.concat(topic_frames, ignore_index=True) if topic_frames else pd.DataFrame()


In [6]:
# Required DataFrame info.
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
preprocessed_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56000 entries, 0 to 55999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        56000 non-null  object
 1   Title        56000 non-null  object
 2   Summary      56000 non-null  object
 3   URL          56000 non-null  object
 4   Revision ID  56000 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB
None


In [7]:
# Write the combined, preprocessed DataFrame to a CSV file for indexing.
# index=False leaves the RangeIndex out of the file, so only the data columns are written.
# The verification cells below read this same file back from the working directory.
csv_file_name = "preprocessed_text.csv"
preprocessed_text.to_csv(csv_file_name, index=False)

In [8]:
# Second look at the combined frame after the export (same call as the earlier info cell).
preprocessed_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56000 entries, 0 to 55999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        56000 non-null  object
 1   Title        56000 non-null  object
 2   Summary      56000 non-null  object
 3   URL          56000 non-null  object
 4   Revision ID  56000 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


### Verification of Every Grading Criterion

In [9]:
# Reload the exported CSV so the checks below run against exactly what was written to disk
# (the first check is the Summary length criterion).
# Targets noted when this cell was written: Total Docs = 5500, Docs in Each Topic = 550.
# NOTE(review): the info() output below reports 56000 rows, which does not match 5500. Confirm which figure is intended.
# Required fields: topic, title, summary, url and revision_id.
# NOTE(review): the columns actually present are Topic, Title, Summary, URL and "Revision ID". Confirm this naming satisfies the requirement.
data = pd.read_csv('preprocessed_text.csv') 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56000 entries, 0 to 55999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        56000 non-null  object
 1   Title        56000 non-null  object
 2   Summary      56000 non-null  object
 3   URL          56000 non-null  object
 4   Revision ID  56000 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [10]:
# Flag every document whose preprocessed Summary has at least 200 characters.
# A Summary that was written as an empty string comes back from read_csv as NaN
# (a float), on which len() would raise TypeError; fillna('') counts it as length 0.
summary = data['Summary'].fillna('').str.len() >= 200
# Number of documents meeting (True) / missing (False) the length requirement.
char_count = summary.value_counts()
char_count

Summary
True     52703
False     3297
Name: count, dtype: int64

In [11]:
total_docs = len(data)
# value_counts() omits labels that never occur, so char_count[False] raises KeyError
# when every summary is long enough; .get() falls back to 0 in that case.
short_docs = char_count.get(False, 0)
# An empty data set has no short documents; this also avoids dividing by zero.
below_required_chars = (short_docs / total_docs) * 100 if total_docs else 0.0

# Checking whether not more than 5% of documents have a summary shorter than 200 characters.
print(below_required_chars,"%")

5.887499999999999 %


In [12]:
# Checking whether not more than 5% of documents contain characters other than
# alphanumerics (whitespace between tokens is allowed).
# fillna('') keeps an empty Summary, which read_csv returns as NaN, from raising
# TypeError when the lambda iterates over it; an empty string has no offending characters.
alphabet_check = data['Summary'].fillna('').apply(
    lambda x: any(not (c.isalnum() or c.isspace()) for c in x)
)
total_count = alphabet_check.sum()
percentage_non_alpha = (total_count / len(data)) * 100

# True when the share of such documents is at most 5%, False when it is above 5%.
result = percentage_non_alpha <= 5
result

True