##1 Lab Experiment: Text Preprocessing
Objective:
Understand and implement text preprocessing techniques, including tokenization, stemming, lemmatization, stop words removal, and punctuation handling.

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import spacy

In [2]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [14]:
def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text)
     # Remove punctuation
    words = [word for word in words if word.isalnum()]
    # Convert to lower case
    words = [word.lower() for word in words]
     # Remove stop words
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
      # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]
     # Lemmatization
    doc = nlp(" ".join(stemmed_words))
    lemmatized_words = [token.lemma_ for token in doc]

    return lemmatized_words

In [15]:
# Sample text
text = "Hello, world! Welcome to the world of NLP."

In [16]:
# Preprocess the text
preprocessed_text = preprocess_text(text)
print("Preprocessed Text:", preprocessed_text)

Preprocessed Text: ['hello', 'world', 'welcom', 'world', 'nlp']


##2 Lab Experiment: Text Classification
Objective:
Classify text data into predefined categories.

Experiments:
Sentiment Analysis:

Perform sentiment analysis on text data using Naive Bayes and Support Vector Machines (SVM).
Spam Detection:

Detect spam messages using Logistic Regression and Decision Trees.

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [18]:
from sklearn.linear_model import LogisticRegression

# Sample dataset (replace with actual dataset)
data = {
    'text': ["Free entry in 2 a weekly competition to win", "Your loan application has been approved", "Win cash prizes now", "Hello, how are you?"],
    'label': ["spam", "not spam", "spam", "not spam"]
}

In [19]:
df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,Free entry in 2 a weekly competition to win,spam
1,Your loan application has been approved,not spam
2,Win cash prizes now,spam
3,"Hello, how are you?",not spam


In [21]:
# Preprocess the data
X = df['text']
y = df['label']
print(X)
print(y)

0    Free entry in 2 a weekly competition to win
1        Your loan application has been approved
2                            Win cash prizes now
3                            Hello, how are you?
Name: text, dtype: object
0        spam
1    not spam
2        spam
3    not spam
Name: label, dtype: object


In [22]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Vectorize the text data
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [24]:
# Train a logistic regression classifier
logreg_classifier = LogisticRegression()
logreg_classifier.fit(X_train_vec, y_train)

In [25]:
# Predict on the test set
y_pred = logreg_classifier.predict(X_test_vec)

In [26]:
# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

    not spam       0.00      0.00      0.00       1.0
        spam       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree classifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = dt_classifier.predict(X_test_vec)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

    not spam       0.00      0.00      0.00       1.0
        spam       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
