<a href="https://colab.research.google.com/github/artanebibi/datascience/blob/main/Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import RandomUnderSampler

In [2]:
corpus = ["This is a brown house. This house is big. The street number is 1.",
          "This is a small house. This house has 1 bedroom. The street number is 12.",
          "This dog is brown. This dog likes to play.",
          "The dog is in the bedroom."]

# Text Vectorization

Text vectorization is the process of converting words into numerical vectors.
These vectors will help the machine learning models to learn the words and their "meaning". What machine learning models do is not learn their meaning, but rather they kind of "understand" a words meaning based on the words that are around a certain word (in the environment, which can be accessed by a skip-gram of a certain size)

In [3]:
# Binary term frequency will capture the presence or the absence of a word in a thesaurus
binary = TfidfVectorizer(binary=True, norm=None, use_idf=False, smooth_idf=False, lowercase=True, stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b', min_df=1, max_df=1.0, max_features=None, ngram_range=(1, 1))
data = pd.DataFrame(binary.fit_transform(corpus).toarray(), columns=binary.get_feature_names_out())

In [4]:
data

Unnamed: 0,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Bag of words (BoW) captures the total frequency of a given word in a corpus
bow = TfidfVectorizer(binary=False, norm=None, use_idf=False, smooth_idf=False, lowercase=True, stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b', min_df=1, max_df=1.0, max_features=None, ngram_range=(1, 1))
data = pd.DataFrame(bow.fit_transform(corpus).toarray(), columns=bow.get_feature_names_out())

In [6]:
data

Unnamed: 0,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Normalized frequency term - captures the count_word / total_words_in_document
# where document is only an element it could be a sentece or a text not in the
# whole corpus!
norm_freq = TfidfVectorizer(binary=False, norm='l1', use_idf=False, smooth_idf=False, lowercase=True, stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b', min_df=1, max_df=1.0, max_features=None, ngram_range=(1, 1))
data = pd.DataFrame(norm_freq.fit_transform(corpus).toarray(), columns=norm_freq.get_feature_names_out())

In [8]:
data

Unnamed: 0,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.166667,0.166667,0.0,0.333333,0.0,0.166667,0.0,0.0,0.166667
1,0.166667,0.0,0.0,0.0,0.333333,0.0,0.166667,0.0,0.166667,0.166667
2,0.0,0.0,0.2,0.4,0.0,0.2,0.0,0.2,0.0,0.0
3,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# TF-IDF (Term Frequency-Inverse Document Frequency) scores words based on their importance in a document relative to a corpus.
# TF: Measures how often a term appears in a document.
# IDF: Reduces the weight of common terms across documents to highlight unique ones.
tfidf = TfidfVectorizer(binary=False, norm='l2', use_idf=False, smooth_idf=False, lowercase=True, stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b', min_df=1, max_df=1.0, max_features=None, ngram_range=(1, 1))
data = pd.DataFrame(tfidf.fit_transform(corpus).toarray(), columns=tfidf.get_feature_names_out())

In [10]:
data

Unnamed: 0,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.353553,0.353553,0.0,0.707107,0.0,0.353553,0.0,0.0,0.353553
1,0.353553,0.0,0.0,0.0,0.707107,0.0,0.353553,0.0,0.353553,0.353553
2,0.0,0.0,0.377964,0.755929,0.0,0.377964,0.0,0.377964,0.0,0.0
3,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


# Text Classification

In [11]:
!gdown 1rmX4GzVy9kKzwPjtaC0WYR34iYmb7Beu

Downloading...
From: https://drive.google.com/uc?id=1rmX4GzVy9kKzwPjtaC0WYR34iYmb7Beu
To: /content/SPAM text message 20170820 - Data.csv
  0% 0.00/486k [00:00<?, ?B/s]100% 486k/486k [00:00<00:00, 117MB/s]


In [12]:
data = pd.read_csv('SPAM text message 20170820 - Data.csv')

In [13]:
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [14]:
trainX, testX, trainY, testY = train_test_split(data['Message'],  data['Category'], test_size=0.2)

In [15]:
print(f"Training class distributions summary: {Counter(trainY)}")
print(f"Test class distributions summary: {Counter(testY)}")

Training class distributions summary: Counter({'ham': 3872, 'spam': 585})
Test class distributions summary: Counter({'ham': 953, 'spam': 162})


In [16]:
# Creating the model
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [17]:
model.fit(trainX, trainY)

In [18]:
ypred = model.predict(testX)

In [19]:
print(classification_report_imbalanced(testY, ypred))

                   pre       rec       spe        f1       geo       iba       sup

        ham       0.95      1.00      0.72      0.98      0.85      0.74       953
       spam       1.00      0.72      1.00      0.83      0.85      0.70       162

avg / total       0.96      0.96      0.76      0.96      0.85      0.73      1115



In [20]:
model_balanced_dataset = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())

In [23]:
# i am going to downsample the dataset by reducing the number of rows that have the majority class, in a random way the rows will be chosen for removal.
model_balanced_dataset.fit(trainX, trainY)
ypred = model_balanced_dataset.predict(testX)
print(classification_report_imbalanced(testY, ypred))

                   pre       rec       spe        f1       geo       iba       sup

        ham       0.99      0.96      0.97      0.97      0.96      0.93       953
       spam       0.79      0.97      0.96      0.87      0.96      0.93       162

avg / total       0.96      0.96      0.97      0.96      0.96      0.93      1115

