<a href="https://colab.research.google.com/github/aimlresearcher/NLP/blob/main/ex02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Demo 01

In [1]:
import numpy as np
import pandas as pd
import csv

from sklearn.datasets import fetch_20newsgroups

In [2]:
# Pull every post (subset='all') from the two science newsgroups used in this demo.
selected_groups = ['sci.electronics', 'sci.space']
posts = fetch_20newsgroups(subset='all', categories=selected_groups)
posts.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
# Inspect one raw post; at this stage the text still carries its mail headers.
posts.data[1]

"From: gerald.belton@ozonehole.com (Gerald Belton) \nSubject: Need to find out numb\nDistribution: world\nOrganization: Ozone Online Operations, Inc. - New Orleans, LA\nReply-To: gerald.belton@ozonehole.com (Gerald Belton) \nLines: 24\n\nAL>>        Question:   Is there a certain device out there that I can\nAL>>                    use to find out the number to the line?\nAL>>        Thanks for any response.\nAL>>                                                    Al\n\nAL>There is a number you can call which will return a synthesized\nAL>voice telling you the number of the line.  Unfortunately, for the\nAL>life of me I can't remember what it is. The telephone technicians\nAL>use it all the time.  We used to play around with this in our\nAL>dorm rooms since there were multiple phone lines running between\nAL>rooms.\n\nIt probably wouldn't help for you to post the number, since it appears\nto be different in each area.  For what it's worth, in the New Orleans\narea the number is 998-877

In [4]:
# One row per post: the raw text plus the newsgroup name looked up from its
# integer target id.
df = pd.DataFrame({'text': posts.data}).assign(
    label=[posts.target_names[class_id] for class_id in posts.target]
)
df.head()

Unnamed: 0,text,label
0,From: steinly@topaz.ucsc.edu (Steinn Sigurdsso...,sci.space
1,From: gerald.belton@ozonehole.com (Gerald Belt...,sci.electronics
2,From: hillig@U.Chem.LSA.UMich.EDU (Kurt Hillig...,sci.space
3,From: jbh55289@uxa.cso.uiuc.edu (Josh Hopkins)...,sci.space
4,From: jhwhit01@ulkyvx.louisville.edu\nSubject:...,sci.electronics


In [5]:
# (number of posts, number of columns)
df.shape

(1971, 2)

## Demo 02

In [6]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [7]:
# Reload the same two newsgroups, this time asking scikit-learn to strip
# headers, footers and quoted replies. The stripping is not perfect: the sample
# post displayed below still contains 'AL>' quote lines.
posts = fetch_20newsgroups(
    subset='all',
    categories=['sci.electronics', 'sci.space'],
    remove=('headers', 'footers', 'quotes'),
)
posts.data[1]

"AL>>        Question:   Is there a certain device out there that I can\nAL>>                    use to find out the number to the line?\nAL>>        Thanks for any response.\nAL>>                                                    Al\n\nAL>There is a number you can call which will return a synthesized\nAL>voice telling you the number of the line.  Unfortunately, for the\nAL>life of me I can't remember what it is. The telephone technicians\nAL>use it all the time.  We used to play around with this in our\nAL>dorm rooms since there were multiple phone lines running between\nAL>rooms.\n\nIt probably wouldn't help for you to post the number, since it appears\nto be different in each area.  For what it's worth, in the New Orleans\narea the number is 998-877-6655 (easy to remember, what?)\n\n\n * SLMR 2.1 * Ask me anything: if I don't know, I'll make up something.\n                                          "

In [8]:
# Rebuild the frame from the stripped posts: text plus newsgroup name per row.
df = pd.DataFrame({'text': posts.data}).assign(
    label=[posts.target_names[class_id] for class_id in posts.target]
)

In [9]:
# Preview the first rows of the frame built from the stripped posts.
df.head()

Unnamed: 0,text,label
0,\n >\tIf the new Kuiper belt object *is* ...,sci.space
1,AL>> Question: Is there a certain dev...,sci.electronics
2,"\nIt's not quite what you were asking, but a f...",sci.space
3,"\n\n\nNo, the sky does not, at this time, belo...",sci.space
4,"\nDigi-Key also sells Quad Line Receivers, pa...",sci.electronics


In [10]:
# NLTK resources needed by clean_text. Recent NLTK releases make word_tokenize
# load 'punkt_tab' instead of 'punkt'; requesting both keeps this cell working on
# a fresh kernel regardless of the installed version (an unknown package id is
# only reported by nltk.download, it does not raise).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Built once instead of on every clean_text call: the function is applied to
# every post, and neither object depends on the text being cleaned.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def clean_text(text):
  """Normalise one post into a space-separated string of lemmas.

  Steps, in order: tokenize with NLTK's word_tokenize, keep purely alphabetic
  tokens, lower-case them, drop English stop words, lemmatize what remains.

  Args:
    text: Raw post text (str).

  Returns:
    str: The surviving lemmas joined by single spaces; '' if nothing survives.
  """
  # The alphabetic filter runs on the original token, before lower-casing,
  # matching the order of the individual steps listed above.
  lowered = (word.lower() for word in word_tokenize(text) if word.isalpha())
  lemmas = [_LEMMATIZER.lemmatize(word) for word in lowered if word not in _STOP_WORDS]
  return ' '.join(lemmas)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [11]:
# Run clean_text over every post and store the result in a new 'clean_text' column.
df['clean_text'] = df['text'].apply(clean_text)

In [12]:
# Compare the raw text with its cleaned counterpart for the first rows.
df.head()

Unnamed: 0,text,label,clean_text
0,\n >\tIf the new Kuiper belt object *is* ...,sci.space,new kuiper belt object called next one called ...
1,AL>> Question: Is there a certain dev...,sci.electronics,al question certain device al use find number ...
2,"\nIt's not quite what you were asking, but a f...",sci.space,quite asking year ago helped ee remote sensing...
3,"\n\n\nNo, the sky does not, at this time, belo...",sci.space,sky time belong anyone ownership necessary def...
4,"\nDigi-Key also sells Quad Line Receivers, pa...",sci.electronics,also sell quad line receiver part quad line dr...


In [13]:
# Keep only what the model needs: the cleaned text and its newsgroup label.
model_columns = ['clean_text', 'label']
clean_data = df[model_columns]
clean_data.head()

Unnamed: 0,clean_text,label
0,new kuiper belt object called next one called ...,sci.space
1,al question certain device al use find number ...,sci.electronics
2,quite asking year ago helped ee remote sensing...,sci.space
3,sky time belong anyone ownership necessary def...,sci.space
4,also sell quad line receiver part quad line dr...,sci.electronics


## Demo 03
### Feature Extraction

#### Convert the preprocessed text into a numerical format
- Bag-of-words
- TF-IDF

In [15]:
from sklearn.model_selection import train_test_split

# Features are the cleaned posts; targets are the newsgroup names.
X, y = df['clean_text'], df['label']

# Fixed random_state so the split (and every score below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Report the size of each split: training pair first, then the held-out pair.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)

(1478,) (1478,)
(493,) (493,)


In [31]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words. The vocabulary is learned from the training split only and then
# reused unchanged for the test split; min_df=10 discards rare terms.
count_vect = CountVectorizer(min_df=10)
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# Dense, labelled view of the training counts (one column per vocabulary term).
count_terms = count_vect.get_feature_names_out()
count_df = pd.DataFrame(data=X_train_counts.toarray(), columns=count_terms)
count_df.head()

Unnamed: 0,ability,able,absolutely,ac,acceleration,accept,accepted,access,accomplish,according,...,wrong,wrote,yeah,year,yellow,yes,yesterday,yet,york,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Disabled experiment: a bigram-only bag-of-words (ngram_range = (2, 2)).
# The code is wrapped in a string literal, so none of it executes; the cell
# merely echoes the text and the unigram count_vect / X_*_counts stay in effect.
'''from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(min_df = 10,ngram_range = (2, 2))
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)
count_df = pd.DataFrame(X_train_counts.toarray(), columns = count_vect.get_feature_names_out())
count_df.head()'''

'from sklearn.feature_extraction.text import CountVectorizer\ncount_vect = CountVectorizer(min_df = 10,ngram_range = (2, 2))\nX_train_counts = count_vect.fit_transform(X_train)\nX_test_counts = count_vect.transform(X_test)\ncount_df = pd.DataFrame(X_train_counts.toarray(), columns = count_vect.get_feature_names_out())\ncount_df.head()'

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting. The float thresholds are document-frequency proportions:
# terms in more than 70% or fewer than 1% of the training posts are dropped.
tfidf_vect = TfidfVectorizer(max_df=0.7, min_df=0.01)

# Fit on the training split only, then project the test split onto the same terms.
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)

# Dense, labelled view of the training weights for inspection.
tfidf_terms = tfidf_vect.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_train.toarray(), columns=tfidf_terms)
tfidf_df.head()

Unnamed: 0,able,ac,acceleration,access,according,across,act,action,active,activity,...,would,write,writing,written,wrong,wrote,year,yes,yet,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.070156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Demo 04

In [34]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the bag-of-words counts (not the TF-IDF
# matrix), predict the held-out posts, and report plain accuracy.
nb = MultinomialNB().fit(X_train_counts, y_train)
y_pred = nb.predict(X_test_counts)
metrics.accuracy_score(y_test, y_pred)

0.9006085192697769

In [36]:
# Fix the class order explicitly so rows and columns of the matrix line up
# with the names used to label them.
labels = ['sci.electronics', 'sci.space']

# Rows are the true newsgroups, columns the predicted ones.
cm = metrics.confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(data=cm, index=labels, columns=labels)

print("Confusion Matrix", cm_df, sep="\n")

Confusion Matrix
                 sci.electronics  sci.space
sci.electronics              227          7
sci.space                     42        217
