In [1]:
import pandas as pd
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Load the dataset
data_path = 'https://github.com/PICT-NLP/BE-NLP-Elective/raw/main/3-Preprocessing/News_dataset.pickle'
data = pd.read_pickle(data_path)


In [5]:
data

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569
1,002.txt,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,002.txt-business,1,2257
2,003.txt,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,003.txt-business,1,1557
3,004.txt,High fuel prices hit BA's profits\r\n\r\nBriti...,business,004.txt-business,1,2421
4,005.txt,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,005.txt-business,1,1575
...,...,...,...,...,...,...
2220,397.txt,BT program to beat dialler scams\r\n\r\nBT is ...,tech,397.txt-tech,1,2526
2221,398.txt,Spam e-mails tempt net shoppers\r\n\r\nCompute...,tech,398.txt-tech,1,2294
2222,399.txt,Be careful how you code\r\n\r\nA new European ...,tech,399.txt-tech,1,6297
2223,400.txt,US cyber security chief resigns\r\n\r\nThe man...,tech,400.txt-tech,1,2323


In [7]:

# Text cleaning
def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = ' '.join([word for word in text.split() if word.isalpha()])  # Remove non-alphabetic characters
    return text

data['cleaned_text'] = data['Content'].apply(clean_text)

# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

data['lemmatized_text'] = data['cleaned_text'].apply(lemmatize_text)

# Remove stop words
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

data['stopwords_removed'] = data['lemmatized_text'].apply(remove_stopwords)

# Label encoding
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['Category'])

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['stopwords_removed'])

# Save outputs
data.to_pickle('cleaned_data.pickle')
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pickle', 'wb'))


In [11]:
data_path = '/content/cleaned_data.pickle'
data = pd.read_pickle(data_path)
data

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length,cleaned_text,lemmatized_text,stopwords_removed,label_encoded
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569,ad sales boost time warner profit quarterly pr...,ad sale boost time warner profit quarterly pro...,ad sale boost time warner profit quarterly pro...,0
1,002.txt,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,002.txt-business,1,2257,dollar gains on greenspan speech the dollar ha...,dollar gain on greenspan speech the dollar ha ...,dollar gain greenspan speech dollar ha hit hig...,0
2,003.txt,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,003.txt-business,1,1557,yukos unit buyer faces loan claim the owners o...,yukos unit buyer face loan claim the owner of ...,yukos unit buyer face loan claim owner embattl...,0
3,004.txt,High fuel prices hit BA's profits\r\n\r\nBriti...,business,004.txt-business,1,2421,high fuel prices hit profits british airways h...,high fuel price hit profit british airway ha b...,high fuel price hit profit british airway ha b...,0
4,005.txt,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,005.txt-business,1,1575,pernod takeover talk lifts domecq shares in uk...,pernod takeover talk lift domecq share in uk d...,pernod takeover talk lift domecq share uk drin...,0
...,...,...,...,...,...,...,...,...,...,...
2220,397.txt,BT program to beat dialler scams\r\n\r\nBT is ...,tech,397.txt-tech,1,2526,bt program to beat dialler scams bt is introdu...,bt program to beat dialler scam bt is introduc...,bt program beat dialler scam bt introducing tw...,4
2221,398.txt,Spam e-mails tempt net shoppers\r\n\r\nCompute...,tech,398.txt-tech,1,2294,spam tempt net shoppers computer users across ...,spam tempt net shopper computer user across th...,spam tempt net shopper computer user across wo...,4
2222,399.txt,Be careful how you code\r\n\r\nA new European ...,tech,399.txt-tech,1,6297,be careful how you code a new european directi...,be careful how you code a new european directi...,careful code new european directive could put ...,4
2223,400.txt,US cyber security chief resigns\r\n\r\nThe man...,tech,400.txt-tech,1,2323,us cyber security chief resigns the man making...,u cyber security chief resigns the man making ...,u cyber security chief resigns man making sure...,4



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [12]:
data_path = '/content/tfidf_vectorizer.pickle'
data = pd.read_pickle(data_path)
data