<a href="https://colab.research.google.com/github/Zeaxanthin80/CAI2300C/blob/main/CAI2300C_20250221_Week_07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and Load the data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# %matplotlib inline

In [3]:
# df: This is your Pandas DataFrame, which is like a table holding your data.
df = pd.read_csv("https://github.com/fenago/datasets/raw/main/SMSSpamCollection.txt", sep ='\t', names=['label', 'message'])
# The head() function is used to display the first few rows of the DataFrame.
# By default, it shows the top 5 rows.
df.head()


# for classification in an AI Model for NLP - it requires a label and a text column
# if your dataset has multiple columns - then drop the columns that are not text based

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# len(): This is a built-in Python function that returns the length of an object.
# When used with a DataFrame, it returns the number of rows.
len(df)

5572

In [5]:
# This specific line of code is used to check for missing values in the dataset.
# isnull(): This function checks each cell in the DataFrame.
# If a cell is empty or contains a special value like NaN (Not a Number), it's marked as True, otherwise False.
# sum(): This function then adds up all the True values for each column.
# The result shows how many missing values are present in each column of your DataFrame.
df.isnull().sum()

Unnamed: 0,0
label,0
message,0


In [8]:
# df['message']: This part selects the entire 'message' column from the DataFrame.
# It isolates all the text messages in the dataset.
# .unique(): This is a Pandas function.
# When applied to a column (like df['message']), it finds and returns all the unique (distinct) values present in that column.
unique_messages = df['message'].unique()
# This for loop is to print only the first 10 unique values.
for i, message in enumerate(unique_messages):
    if i < 10:  # Print only if the index is less than 10
        print(f"{i + 1}. {message}")
    else:
        break  # Stop the loop after printing 10 messages

1. Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
2. Ok lar... Joking wif u oni...
3. Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
4. U dun say so early hor... U c already then say...
5. Nah I don't think he goes to usf, he lives around here though
6. FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
7. Even my brother is not like to speak with me. They treat me like aids patent.
8. As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
9. WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
10. Had your mobil

In [None]:
# .value_counts(): This is a function that is applied to the selected 'message' column.
# It looks at all the unique values (distinct messages) in the column.
# It then counts how many times each unique value appears in the column.
# Finally, it returns a new data structure (a Pandas Series).
# In simpler terms: This line of code tells you how many times each distinct message appears in your dataset.
df['message'].value_counts()

Unnamed: 0_level_0,count
message,Unnamed: 1_level_1
"Sorry, I'll call later",30
I cant pick the phone right now. Pls send a message,12
Ok...,10
Okie,4
Your opinion about me? 1. Over 2. Jada 3. Kusruthi 4. Lovable 5. Silent 6. Spl character 7. Not matured 8. Stylish 9. Simple Pls reply..,4
...,...
No. On the way home. So if not for the long dry spell the season would have been over,1
Urgent! Please call 09061743811 from landline. Your ABTA complimentary 4* Tenerife Holiday or £5000 cash await collection SAE T&Cs Box 326 CW25WX 150ppm,1
Dear 0776xxxxxxx U've been invited to XCHAT. This is our final attempt to contact u! Txt CHAT to 86688 150p/MsgrcvdHG/Suite342/2Lands/Row/W1J6HL LDN 18yrs,1
I think asking for a gym is the excuse for lazy people. I jog.,1


# Clean the data

In [None]:
# This is the minimum. You can obviously clean the data more than this if you choose.
# I would consider using Lemma's... but it is your choice

import re # re: This library is used for regular expressions, which are helpful for pattern matching in text.
import nltk # nltk: The Natural Language Toolkit (NLTK) is a powerful library for working with human language data.

# nltk.download(): This function is used to download NLTK data, including corpora, models, and other resources.
#'stopwords': This argument specifies that we want to download the "stopwords" dataset, which contains lists of stop words for different languages.
# Once downloaded, you can access and use the stop words lists through the nltk.corpus.stopwords module.
nltk.download('stopwords')

from nltk.corpus import stopwords # This module from NLTK provides a list of common words (like "the," "a," "is") that are often removed from text data as they don't usually carry much meaning.
from nltk.stem.porter import PorterStemmer # This is a tool from NLTK used for stemming, which reduces words to their root form (e.g., "running" becomes "run").

ps = PorterStemmer() # Creates an instance of the PorterStemmer for use later.
corpus = [] # An empty list called corpus is created to store the cleaned text data.

# The code then enters a loop that iterates through each message in the 'message' column of the DataFrame df.
for i in range(0, len(df)):
    # This line uses a regular expression to remove any characters that are not letters (a-z, A-Z) and replaces them with spaces.
    # This helps clean up punctuation and special characters.
    review = re.sub('[^a-zA-Z]', ' ', df['message'][i])

    review = review.lower() # Converts the entire message to lowercase
    review = review.split() # Splits the message into a list of individual words.

    # This is a list comprehension that does two things:
    # It uses the PorterStemmer (ps) to reduce each word to its root form (stemming).
    # It also checks if the word is in the list of English stop words and removes it if it is.
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review) # Joins the stemmed and filtered words back into a single string.
    corpus.append(review) # Adds the cleaned and processed message (now in the review variable) to the corpus list.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# This line of code is used to display the first 5 elements of the corpus list.
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

# Train Test Split

In [None]:
# This line creates a new variable called X and assigns it the values from the 'message' column of the DataFrame (df).
X = df['message']
# This line creates a variable y and assigns it the values from the 'label' column of the DataFrame (df).
y = df['label']

# Features (X): The text data that the model will learn from.
# Target (y): The labels or categories that I want the model to predic

# train_test_split: This is a function from the sklearn.model_selection module.
# It's a very common practice in machine learning to split your data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)
# X, y: These are the feature and target data defined earlier.
# test_size = 0.30: This means 30% of the data will be used for testing, and the remaining 70% for training.
# random_state = 42: This ensures that the split is reproducible. If you run this code again with the same random_state, you'll get the same split.

'''
Output Variables:

  ● X_train: The training data (features).
  ● X_test: The testing data (features).
  ● y_train: The training data (labels).
  ● y_test: The testing data (labels).
'''

X_train.shape # This line simply prints the dimensions of the X_train dataset.

(3900,)

# Transform the text into Vectors (numbers)

In [None]:
# CountVectorizer: Converts the text in a matrix of token counts... creates the Bag of Words (BoW) - the count of each word
# TfidfTransformer: Coverting those counts (from the BoW) - into a score for each word.  So each word is represented by a number.

'''
This part is all about preparing the text data for the machine learning model.
Machine learning models work with numbers, not raw text.
So, we need to convert the text messages into a numerical format that the model can understand.
This process is often called feature extraction or vectorization.
'''
from sklearn.feature_extraction.text import CountVectorizer
count_vect=CountVectorizer()
X_train_counts =count_vect.fit_transform(X_train)

print("Shape of count vectorizer", X_train_counts.shape)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf =tfidf_transformer.fit_transform(X_train_counts)

print("Shape of tfidf feature extraction",X_train_tfidf.shape)

Shape of count vectorizer (3900, 7263)
Shape of tfidf feature extraction (3900, 7263)


# The Data is now primed... we simply need to put the data into an Algorithm and create a model

In [None]:
from sklearn.linear_model import LogisticRegression
# clf variable is used instead of the whole word classifier
clf = LogisticRegression(solver='lbfgs')
clf.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression()),])
text_clf.fit(X_train, y_train)

In [None]:
predictions = text_clf.predict(X_test)

In [None]:
from sklearn import metrics
print("Confusion Metrics\n",metrics.confusion_matrix(y_test,predictions), end="\n\n\n")

print("Classification Report\n",metrics.classification_report(y_test,predictions), end="\n\n\n")

print("Accuracy Score:", metrics.accuracy_score(y_test,predictions))

Confusion Metrics
 [[1446    2]
 [  45  179]]


Classification Report
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1448
        spam       0.99      0.80      0.88       224

    accuracy                           0.97      1672
   macro avg       0.98      0.90      0.93      1672
weighted avg       0.97      0.97      0.97      1672



Accuracy Score: 0.97188995215311


In [None]:
def predict_text(model, text):
    """
    Predict the class of the given text using the trained model.

    :param model: The trained text classification model (pipeline).
    :param text: A string containing the text to be classified.
    :return: The predicted class of the text.
    """
    prediction = model.predict([text])
    return prediction[0]

# Example usage
your_text = "Your sample text here"
prediction = predict_text(text_clf, your_text)
print("Predicted class:", prediction)

Predicted class: spam


In [None]:
def predict_text_with_score(model, text):
    """
    Predict the class of the given text using the trained model and provide the probability scores.

    :param model: The trained text classification model (pipeline).
    :param text: A string containing the text to be classified.
    :return: The predicted class of the text and the probability scores.
    """
    prediction = model.predict([text])
    prediction_proba = model.predict_proba([text])

    # Getting the class labels
    class_labels = model.classes_

    # Formatting the probability scores along with the class labels
    proba_scores = {class_labels[i]: prediction_proba[0][i] for i in range(len(class_labels))}

    return prediction[0], proba_scores

# Example usage
your_text = "Your sample text here"
predicted_class, scores = predict_text_with_score(text_clf, your_text)
print("Predicted class:", predicted_class)
print("Probability Scores:", scores)

Predicted class: spam
Probability Scores: {'ham': 0.4575389633637499, 'spam': 0.5424610366362501}


In [None]:
import joblib

# Save your model
joblib.dump(text_clf, 'text_clf_model.joblib')

['text_clf_model.joblib']

In [None]:
from sklearn.svm import SVC
lr_model = SVC(gamma='auto')


SVC_text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', SVC(gamma='auto')),])
SVC_text_clf.fit(X_train, y_train)

In [None]:
SVC_predictions = SVC_text_clf.predict(X_test)

In [None]:
from sklearn import metrics
print("Confusion Metrics\n",metrics.confusion_matrix(y_test,SVC_predictions), end="\n\n\n")

print("Classification Report\n",metrics.classification_report(y_test,SVC_predictions), end="\n\n\n")

print("Accuracy Score:", metrics.accuracy_score(y_test,SVC_predictions))

Confusion Metrics
 [[1448    0]
 [ 224    0]]


Classification Report
               precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1448
        spam       0.00      0.00      0.00       224

    accuracy                           0.87      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.87      0.80      1672



Accuracy Score: 0.8660287081339713


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
y_train.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,3377
spam,523


In [None]:
y_test.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,1448
spam,224


# Let's try this again with a new dataset

In [None]:
# https://github.com/fenago/datasets/raw/refs/heads/main/twitter_training.csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# %matplotlib inline
# df = pd.read_csv("https://github.com/fenago/datasets/raw/refs/heads/main/twitter_training.csv")
# df = pd.read_csv("https://github.com/fenago/datasets/raw/refs/heads/main/twitter_training.csv", names=['label', 'message'])
# df.head()

df = pd.read_csv(
    "https://github.com/fenago/datasets/raw/refs/heads/main/twitter_training.csv",
    header=None,          # since the file has no header row
    usecols=[2, 3],       # read only columns #2 and #3
    names=["label", "message"]  # rename them for convenience
)

df.head()

Unnamed: 0,label,message
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
df.sample(10)

Unnamed: 0,label,message
12015,Negative,Pretty good Morning opened 3 singles I got the...
24509,Neutral,This means if I did a review defending you rel...
40697,Negative,Friends: go buy Battlefield 1 to his real team...
25083,Neutral,Check out another review of Lopez in Sanchez o...
516,Neutral,. :: Ah yes. A very very old image of demon Lo...
46689,Positive,i’ve never related so closely
32712,Positive,I'm going to suck you dry~ FORTNITE I'm twitch...
36662,Negative,"Last year, hackers from national states broke ..."
35889,Neutral,The latest... rMVP Microsoft Daily! paper.li/r...
73445,Negative,Latest Nvidia driver rendered photoshop 2019 u...


In [None]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Negative,22542
Positive,20832
Neutral,18318
Irrelevant,12990


In [None]:
df = df[df['label'].isin(['Positive', 'Negative'])]
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Negative,22542
Positive,20832


In [None]:
len(df)

43374

In [None]:
df.isnull().sum()

Unnamed: 0,0
label,0
message,361


In [None]:
df = df.dropna(subset=["message"])
df.isnull().sum()

Unnamed: 0,0
label,0
message,0


In [None]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Negative,22358
Positive,20655


In [None]:
df['message'].unique()

array(['im getting on borderlands and i will murder you all ,',
       'I am coming to the borders and I will kill you all,',
       'im getting on borderlands and i will kill you all,', ...,
       'Just realized the windows partition of my Mac is now 6 years behind on Nvidia drivers and I have no idea how he didn’t notice',
       'Just realized between the windows partition of my Mac is like being 6 years behind on Nvidia drivers and cars I have no fucking idea how I ever didn ’ t notice',
       'Just like the windows partition of my Mac is like 6 years behind on its drivers So you have no idea how I didn’t notice'],
      dtype=object)

In [None]:
df['message'].value_counts()

Unnamed: 0_level_0,count
message,Unnamed: 1_level_1
"At the same time, despite the fact that there are currently some 100 million people living below the poverty line, most of them do not have access to health services and do not have access to health care, while most of them do not have access to health care.",82
,82
It is not the first time that the EU Commission has taken such a step.,82
<unk>,77
Wow,48
...,...
"How Ubisoft is announcing the new Assassin's Creed right now is right strange, but it makes me excited",1
The way Ubisoft is announcing this new Assassin’s Creed right now is proper weird but it’s getting people excited,1
The way down Ubisoft is announcing that the new Assassin ’ s Creed outfit right now is proper... weird but it ’ s getting me all excited,1
The way Ubisoft keeps announcing the new Assassin’s Unity system now is proper weird but it’s keeping me excited,1


In [None]:
df.head(15)

Unnamed: 0,label,message
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
5,Positive,im getting into borderlands and i can murder y...
6,Positive,So I spent a few hours making something for fu...
7,Positive,So I spent a couple of hours doing something f...
8,Positive,So I spent a few hours doing something for fun...
9,Positive,So I spent a few hours making something for fu...


In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
# Instead of iterating using an index, iterate through each row of the dataframe
for index, row in df.iterrows():
    review = re.sub('[^a-zA-Z]', ' ', row['message'])  # Access message from the row
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
corpus[:5]

['im get borderland murder',
 'come border kill',
 'im get borderland kill',
 'im come borderland murder',
 'im get borderland murder']

In [None]:
X = df['message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train.shape

(30109,)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect=CountVectorizer()
X_train_counts =count_vect.fit_transform(X_train)

print("Shape of count vectorizer", X_train_counts.shape)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf =tfidf_transformer.fit_transform(X_train_counts)

print("Shape of tfidf feature extraction",X_train_tfidf.shape)

Shape of count vectorizer (30109, 17952)
Shape of tfidf feature extraction (30109, 17952)


In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs')
clf.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression()),])
text_clf.fit(X_train, y_train)

In [None]:
predictions = text_clf.predict(X_test)

In [None]:
from sklearn import metrics
print("Confusion Metrics\n",metrics.confusion_matrix(y_test,predictions), end="\n\n\n")

print("Classification Report\n",metrics.classification_report(y_test,predictions), end="\n\n\n")

print("Accuracy Score:", metrics.accuracy_score(y_test,predictions))

Confusion Metrics
 [[5991  670]
 [ 803 5440]]


Classification Report
               precision    recall  f1-score   support

    Negative       0.88      0.90      0.89      6661
    Positive       0.89      0.87      0.88      6243

    accuracy                           0.89     12904
   macro avg       0.89      0.89      0.89     12904
weighted avg       0.89      0.89      0.89     12904



Accuracy Score: 0.8858493490390577
