<a href="https://colab.research.google.com/github/anurag-chiplunkar/Citi-Bank-AIML-Basics/blob/master/NLP_Basics_and_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Bag-of-Words**

Bag-of-Words represents each document as a vector of word counts over a shared vocabulary; word order is ignored.

In [4]:
import sklearn

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [15]:
# Toy corpus of three short sentences; the CountVectorizer and TfidfVectorizer
# examples are both fitted on it.
documents = [
    "I love Machine Learning",
    "Machine Learning is great",
    "I love coding"
]

In [16]:
# Bag-of-words vectorizer with default settings.
vec = CountVectorizer()

In [17]:
# Learn the vocabulary from `documents` and encode each one as a row of term counts.
# NOTE: the name `X` is rebound to the SMS message column in the classification section.
X = vec.fit_transform(documents)

In [18]:
# Show the counts as a plain array: one row per document, one column per vocabulary term.
X.toarray()

array([[0, 0, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 1],
       [1, 0, 0, 0, 1, 0]])

In [19]:
# Vocabulary terms in the column order of the count matrix.
# Note that "I" from the corpus is not among the features; presumably the default
# tokenizer drops single-character tokens (see CountVectorizer `token_pattern`) - TODO confirm.
vec.get_feature_names_out()

array(['coding', 'great', 'is', 'learning', 'love', 'machine'],
      dtype=object)

**Term Frequency-Inverse Document Frequency (TF-IDF)**

In [21]:
# TF-IDF vectorizer with default settings.
vecTFID = TfidfVectorizer()

In [22]:
# Fit on the same toy corpus and encode each document as a row of TF-IDF weights.
X_tfid = vecTFID.fit_transform(documents)

In [23]:
# Show the TF-IDF weights as a plain array (rows = documents, columns = terms).
X_tfid.toarray()

array([[0.        , 0.        , 0.        , 0.57735027, 0.57735027,
        0.57735027],
       [0.        , 0.5628291 , 0.5628291 , 0.42804604, 0.        ,
        0.42804604],
       [0.79596054, 0.        , 0.        , 0.        , 0.60534851,
        0.        ]])

In [24]:
# Vocabulary terms in the column order of the TF-IDF matrix
# (the same six terms as in the count-based example).
vecTFID.get_feature_names_out()

array(['coding', 'great', 'is', 'learning', 'love', 'machine'],
      dtype=object)

**Word Embedding (using spaCy)**

In [25]:
import spacy

In [26]:
# Load the `en_core_web_sm` spaCy pipeline by name.
nlp = spacy.load("en_core_web_sm")

In [28]:
# Words whose vectors are printed in the next cell.
words = [ "machine", "learning", "coding"]

In [30]:
# Run each word through the pipeline on its own and print the first five
# components of the resulting document vector.
# NOTE(review): `en_core_web_sm` is the small model; spaCy's model docs suggest that
# small pipelines do not ship static word vectors, so these values may not be true
# word embeddings - confirm, and consider an md/lg model if real vectors are needed.
for word in words:
  print(f"{word}: {nlp(word).vector[:5]}")

machine: [-1.1848618  -0.5884644  -0.431729    0.04726774  0.15745789]
learning: [-1.4594722  0.5993646 -0.3100496  0.4292125 -1.0241096]
coding: [-0.41865578  0.9925707  -0.8835844   0.29665527 -0.3790677 ]


**Transformers**

In [33]:
from transformers import BertTokenizer, BertModel

In [34]:
import torch

In [35]:
# Load the tokenizer for the `bert-base-uncased` checkpoint
# (its files are downloaded on first use, as the progress output shows).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [36]:
# Load the pretrained BERT model for the same `bert-base-uncased` checkpoint.
model = BertModel.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [37]:
# Sentence to run through BERT.
text = "Machine learning is fascinating"

In [38]:
# Tokenize the sentence; `return_tensors='pt'` asks for PyTorch tensors
# that can be passed straight to the model.
inputs = tokenizer(text, return_tensors='pt')

In [40]:
# Forward pass with gradient tracking disabled: this is inference only.
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
# Locate the token 'learning' in the encoded sentence and show its contextual
# vector from BERT's final hidden layer.
token_ids = inputs['input_ids'][0].tolist()                # ids of the single input sentence
learning_id = tokenizer.convert_tokens_to_ids('learning')  # vocabulary id of the token
learning_idx = token_ids.index(learning_id)                # position of its first occurrence
outputs.last_hidden_state[0][learning_idx]

# **Bag of Words from scratch**

In [44]:
%%writefile 1.txt
This is a story about cats our feline pets
Cats are furry animals

Writing 1.txt


In [46]:
%%writefile 2.txt
This story is about surfing
Catchine waves is fun
Surfing is a popular water sport

Writing 2.txt


In [64]:
# Build the vocabulary: give every distinct lower-cased word of 1.txt an
# integer id, numbered from 1 in order of first appearance.

with open('/content/1.txt') as f:
  x = f.read().lower().split()

vocab = {}
i = 1  # next unused id; the following cell keeps counting from here

for word in x:
  if word not in vocab:
    vocab[word] = i
    i += 1

print(vocab)

{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12}


In [65]:
# Extend the vocabulary with the words of 2.txt. Ids continue from `i`, so
# this cell must run after the cell that creates `vocab` and `i`.

with open('/content/2.txt') as f:
  x = f.read().lower().split()

for word in x:
  if word not in vocab:
    vocab[word] = i
    i += 1

print(vocab)

{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12, 'surfing': 13, 'catchine': 14, 'waves': 15, 'fun': 16, 'popular': 17, 'water': 18, 'sport': 19}


In [98]:
# Feature extraction: one count vector per file.
# Slot 0 holds the file path; slots 1..len(vocab) are zero-initialised counters,
# lining up with the vocabulary ids, which start at 1.
one = ['/content/1.txt']+[0]*len(vocab)
two = ['/content/2.txt']+[0]*len(vocab)

print(f"{one}\n{two}")


['/content/1.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['/content/2.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [70]:
# Count how often each vocabulary word occurs in 1.txt.
with open('/content/1.txt') as f:
  x = f.read().lower().split()

# Start from a fresh zero vector on every run so that re-executing this cell
# does not add the counts a second time. Slot 0 holds the file path; word ids
# start at 1, so vocab[word] indexes the matching counter directly.
one = ['/content/1.txt'] + [0]*len(vocab)
for word in x:
  one[vocab[word]] += 1

one

['/content/1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]

In [71]:
# Count how often each vocabulary word occurs in 2.txt.
with open('/content/2.txt') as f:
  x = f.read().lower().split()

# Start from a fresh zero vector on every run so that re-executing this cell
# does not add the counts a second time. Slot 0 holds the file path; word ids
# start at 1, so vocab[word] indexes the matching counter directly.
two = ['/content/2.txt'] + [0]*len(vocab)
for word in x:
  two[vocab[word]] += 1

# Show both bag-of-words vectors side by side (`one` comes from the previous cell).
print(f"{one}\n{two}")

['/content/1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
['/content/2.txt', 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]


# **Text Classification using tools**

In [73]:
import numpy as np
import pandas as pd

# Load the tab-separated SMS spam dataset and preview the first rows.
# NOTE(review): the absolute `/content/` path presumably assumes the file was
# uploaded to a Colab session - adjust when running elsewhere.
df = pd.read_csv('/content/smsspamcollection.tsv', sep='\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [74]:
# Column dtypes and non-null counts - a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
 2   length   5572 non-null   int64 
 3   punct    5572 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 174.2+ KB


In [75]:
# Class balance: how many messages carry each label (ham vs. spam).

df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [78]:
from sklearn.model_selection import train_test_split
# Features are the raw message texts; the target is the ham/spam label.
# NOTE: this rebinds `X`, which held the bag-of-words matrix earlier in the notebook.
X = df['message']
y = df['label']

# Hold out 30% of the messages for testing; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify=y` is passed, so the ham/spam ratio in each part is left to chance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [100]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC

In [101]:
from sklearn.pipeline import Pipeline
# Text classifier: TF-IDF features feeding a linear SVM, bundled in one pipeline
# so that fitting on the training messages fits the vectorizer and the classifier together.
textClf = Pipeline([
    ('tfid', TfidfVectorizer()),
    # Seed the classifier (same seed as the train/test split) so that
    # re-running the notebook reproduces the same fitted model.
    ('clf', LinearSVC(random_state=42)),
])

textClf.fit(X_train, y_train)



In [95]:
# Predict labels for the held-out test messages with the fitted pipeline
# (training happens in the `textClf.fit(...)` cell, not here).
predictions = textClf.predict(X_test)

In [96]:
from sklearn import metrics
# Confusion matrix of true labels (`y_test`) against the predicted labels.
print(metrics.confusion_matrix(y_test, predictions))

[[1445    3]
 [  10  214]]


In [97]:
# Per-class precision, recall, F1 and support on the test set.
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

    accuracy                           0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672

