In [8]:
# Task 1: Import necessary libraries

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy
from spacy import displacy
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK resources the tokenizer and lemmatizer cells rely on.
# Newer NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package, so both are requested to keep word_tokenize and
# sent_tokenize working on a fresh kernel regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Task 2: Load the dataset
# Read the CSV at file_path into a DataFrame and preview its first five rows.
file_path = "/content/drive/MyDrive/Colab Notebooks/BBC_DATA.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
display(df.head(n=5))

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [4]:
# Task 3: Tokenization with NLTK

# Pick the first article by column name rather than by column position, so the
# lookup matches the df['Text'] access used by the Word2Vec and TF-IDF cells and
# does not silently grab a different column if the CSV layout changes.
sample_article = df['Text'].iloc[0]

# Split the same article into word-level and sentence-level tokens.
tokens_words = word_tokenize(sample_article)
tokens_sentences = sent_tokenize(sample_article)

In [5]:
# Show the word-level and sentence-level tokens produced for the sample article.
print("\nTokenization with NLTK:")
print(f"Tokenized Words: {tokens_words}")
print(f"Tokenized Sentences: {tokens_sentences}")


Tokenization with NLTK:


In [6]:
# Task 4: Stemming and Lemmatization with NLTK
# Build one stemmer and one lemmatizer instance; both are reused by the cells below.
porter_stemmer, wordnet_lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [9]:
# Apply the stemmer and the lemmatizer to every word token of the sample article,
# each called with its default arguments.
stemmed_words = list(map(porter_stemmer.stem, tokens_words))
lemmatized_words = list(map(wordnet_lemmatizer.lemmatize, tokens_words))

In [10]:
# Show the stemmed and lemmatized token lists for comparison.
print(f"\nStemming with NLTK: {stemmed_words}")
print(f"Lemmatization with NLTK: {lemmatized_words}")


Stemming with NLTK: ['worldcom', 'ex-boss', 'launch', 'defenc', 'lawyer', 'defend', 'former', 'worldcom', 'chief', 'berni', 'ebber', 'against', 'a', 'batteri', 'of', 'fraud', 'charg', 'have', 'call', 'a', 'compani', 'whistleblow', 'as', 'their', 'first', 'wit', '.', 'cynthia', 'cooper', 'worldcom', 's', 'ex-head', 'of', 'intern', 'account', 'alert', 'director', 'to', 'irregular', 'account', 'practic', 'at', 'the', 'us', 'telecom', 'giant', 'in', '2002.', 'her', 'warn', 'led', 'to', 'the', 'collaps', 'of', 'the', 'firm', 'follow', 'the', 'discoveri', 'of', 'an', '$', '11bn', '(', '£5.7bn', ')', 'account', 'fraud', '.', 'mr', 'ebber', 'ha', 'plead', 'not', 'guilti', 'to', 'charg', 'of', 'fraud', 'and', 'conspiraci', '.', 'prosecut', 'lawyer', 'have', 'argu', 'that', 'mr', 'ebber', 'orchestr', 'a', 'seri', 'of', 'account', 'trick', 'at', 'worldcom', 'order', 'employe', 'to', 'hide', 'expens', 'and', 'inflat', 'revenu', 'to', 'meet', 'wall', 'street', 'earn', 'estim', '.', 'but', 'ms', 'c

In [11]:
# Task 5: Named Entity Recognition with SpaCy
# Load the en_core_web_sm pipeline, process the sample article, and render the
# detected entities inline in the notebook.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_article)
displacy.render(doc, jupyter=True, style="ent")

In [12]:
# Task 6: Word2Vec with gensim
# Each article is tokenized as a whole and passed to Word2Vec as one token list.
sentences = list(map(word_tokenize, df['Text']))
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [13]:
# Look up the learned embedding for the word 'sample' and print it.
sample_word_vector = word2vec_model.wv.get_vector('sample')
print(
    "\nWord2Vec with gensim - Vector representation of 'sample':",
    sample_word_vector,
)


Word2Vec with gensim - Vector representation of 'sample': [-0.01151732  0.0864388  -0.00820681  0.01198654  0.04814653 -0.10581327
 -0.02857972  0.13516779 -0.06370393 -0.0513782  -0.02205362 -0.06476305
  0.00464555  0.00840096  0.06711    -0.06885602  0.07714604 -0.06072763
  0.03082088 -0.10884771  0.05934846  0.00984306  0.06359379 -0.0338015
  0.01568069 -0.046829   -0.11158412  0.01679148 -0.05996404 -0.00053319
  0.09854335 -0.0388101   0.02445992 -0.12494642  0.00191479  0.05541306
  0.01439658 -0.01624904 -0.02484631 -0.09023548 -0.01364749 -0.05923644
 -0.04967679  0.04008827  0.0537195   0.02267633 -0.07054133  0.00340478
  0.00569881  0.02455191  0.05622242 -0.08629406 -0.0229742  -0.02066166
  0.01637956  0.01121017  0.06532785  0.03676561 -0.07041118 -0.01215285
  0.01180393  0.01714104 -0.01940402  0.03000361 -0.01001607  0.06273773
  0.0072099   0.03676554 -0.05372756  0.04453144 -0.04235607 -0.01348015
  0.09369236  0.02946641  0.0319985  -0.05690531  0.01878851 -0.03

In [14]:
# Task 7: TF-IDF with scikit-learn
# Fit a default-configured vectorizer on the article texts and keep the
# resulting document-term matrix (one row per article).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=df['Text'])

In [15]:
# Compare the TF-IDF rows of the first two articles; the result is a 1x1 matrix.
article1_index, article2_index = 0, 1
cosine_sim = cosine_similarity(
    X=tfidf_matrix[article1_index],
    Y=tfidf_matrix[article2_index],
)

In [16]:
# Report the single similarity score held in the 1x1 result matrix.
print(
    "\nTF-IDF with scikit-learn - Cosine Similarity between Article 1 and Article 2:",
    cosine_sim[0, 0],
)


TF-IDF with scikit-learn - Cosine Similarity between Article 1 and Article 2: 0.07875931547482325
