<a href="https://colab.research.google.com/github/anuradha-datascience/NLP/blob/main/HandsOnNLPWithConcept_Part3__tfidfsklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [119]:
import pandas as pd
import chardet
with open('SMSSpamCollection.csv', 'rb') as f:
    encoding = chardet.detect(f.read())['encoding']


In [120]:
df=pd.read_csv('SMSSpamCollection.csv',sep="\t",names=["label","message"],encoding=encoding)

In [121]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [122]:
df.shape

(5572, 2)

In [123]:
df['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."


**Data Cleaning and Preprocessing**


1.   Text Preprocessing
  *   Tokenization
  *   Stop Words
  *   Stemming
  *   POS Tagging
  *   Lemmetization
  *   NLTK Library

2.   Word Embeddings - Feature Extraction

Frequency-based Embedding
  *   BOW
  *   TFIDF
  *   Glove

Prediction-based Embedding
  *   Word2Vec
  *   AvgWord2Vec
  *   BERT






In [124]:

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#download resources

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [125]:
# Get the English stopwords list
stopwords_list = stopwords.words('english')


In [126]:
# remove punctuations and stop words
stop_words = stopwords.words('english') + list(string.punctuation)


In [127]:
# let's make a single function for preprocessing using nltk
def get_wordnet_pos(word):
  """Map POS tag to first character lemmatize() accepts"""
  tag = nltk.pos_tag([word])[0][1][0].upper()
  tag_dict = {"J": wordnet.ADJ,
              "N": wordnet.NOUN,
              "V": wordnet.VERB,
              "R": wordnet.ADV}
  return tag_dict.get(tag, wordnet.NOUN)

def preprocess_data(text):
  #tokenizing
  preprocess_tokens=word_tokenize(text)

  #stop word| lowering | punctuation| only alpha
  stopwords_list = stopwords.words('english')
  preprocess_tokens = [word.lower() for word in preprocess_tokens if word.lower() not in stop_words]
  preprocess_tokens=[token for token in preprocess_tokens if token.isalpha()]

  # Lemmatization with POS tagging
  lemmatizer = WordNetLemmatizer()
  preprocess_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in preprocess_tokens]

  # Join the tokens back into a single string
  preprocessed_text = " ".join(preprocess_tokens)
  return preprocessed_text

df=pd.read_csv('SMSSpamCollection.csv',sep="\t",names=["label","message"],encoding=encoding)

df["message_preprocess"]=df['message'].apply(lambda x: preprocess_data(x))

In [128]:
df=df.drop(['message'],axis=1)

In [129]:
df.head()


Unnamed: 0,label,message_preprocess
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts may...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


Identify Target and Independent Variables

In [130]:

# Splitting the data into features (X) and target variable (y)
y = df[['label']]
X = df[['message_preprocess']]


Perform data split into train and test data

In [131]:
X.head()

Unnamed: 0,message_preprocess
0,go jurong point crazy available bugis n great ...
1,ok lar joking wif u oni
2,free entry wkly comp win fa cup final tkts may...
3,u dun say early hor u c already say
4,nah think go usf life around though


In [132]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=101)

Feature Extraction using TFIDF

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus (collection of documents)
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create TF-IDF vectorizer object
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the corpus and transform the corpus
tfidf_matrix = vectorizer.fit_transform(corpus)

# Print the shape of TF-IDF matrix
print("Shape of TF-IDF matrix:", tfidf_matrix.shape)
print(tfidf_matrix.toarray())
# Convert the sparse matrix X_train_tfidf to a dense matrix and then to a DataFrame
tfidf_matrix_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Print the DataFrame
print(tfidf_matrix_df)

Shape of TF-IDF matrix: (4, 9)
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


In [134]:
X_train.head()

Unnamed: 0,message_preprocess
1398,wait u lor need feel bad lar
1084,wat make people dearer de happiness dat u feel...
1086,like needle clock though v r clock v r nt able...
2655,sarcasm scarcasim
1059,tell female v throw decide get


In [140]:
#Importing the tfidf library
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer object
# vectorizer = TfidfVectorizer(max_features=20)
vectorizer = TfidfVectorizer(max_features=2000)

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train['message_preprocess'])

In [141]:
# Print the shape of TF-IDF matrix
print("Shape of TF-IDF matrix:", X_train_tfidf.shape)
print("Shape of y:", y_train.shape)
# print(X_train_tfidf.toarray())

# Convert the sparse matrix X_train_tfidf to a dense matrix and then to a DataFrame
tfidf_matrix_df = pd.DataFrame(X_train_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Print the DataFrame
# print(tfidf_matrix_df)

Shape of TF-IDF matrix: (4457, 2000)
Shape of y: (4457, 1)


In [145]:

# Transform the test data (using the same vectorizer fitted on the training data)
X_test_tfidf = vectorizer.transform(X_test['message_preprocess'])

Prediction - Binary Classification Problem

- ML Algo Used

 -- Logistic Regression

 -- SVC

 -- Random Forest



In [149]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Create a logistic regression model
logistic_model = LogisticRegression()

# Fit the model on the training data
logistic_model.fit(X_train_tfidf, y_train)

# # Make predictions on the test data
y_pred = logistic_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate a classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

  y = column_or_1d(y, warn=True)


Accuracy: 0.95695067264574
Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       980
        spam       0.97      0.67      0.79       135

    accuracy                           0.96      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.96      0.96      0.95      1115



In [152]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


# Create Support Vector Classifier (SVC) model
svc_model = SVC()

# Fit the SVC model on the training data
svc_model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
svc_y_pred = svc_model.predict(X_test_tfidf)
# Evaluate the model
accuracy = accuracy_score(y_test, svc_y_pred)
print("Accuracy:", accuracy)

# Generate a classification report
print("Classification Report:")
print(classification_report(y_test, svc_y_pred))



  y = column_or_1d(y, warn=True)


Accuracy: 0.9739910313901345
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       980
        spam       0.98      0.80      0.88       135

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [153]:
from sklearn.ensemble import RandomForestClassifier

# Create Random Forest Classifier model
rf_model = RandomForestClassifier()

# Fit the Random Forest model on the training data
rf_model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
rf_y_pred = rf_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy:", accuracy)

# Generate a classification report
print("Classification Report:")
print(classification_report(y_test, rf_y_pred))

  rf_model.fit(X_train_tfidf, y_train)


Accuracy: 0.9748878923766816
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       980
        spam       0.98      0.81      0.89       135

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.98      0.97      0.97      1115

