# **Installing Required Tools**

In [4]:
pip install gradio



# **Import the Libraries**

In [5]:
## import some basic libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import gradio as gr

# **Data Collection and Preprocessing**

In [6]:
# Load the dataset from the CSV file into a pandas DataFrame.
# The path is relative, so 'mail_data.csv' must be in the notebook's working directory.
raw_mail_dataset = pd.read_csv('mail_data.csv')  # original data; later cells read it for inspection

In [7]:
# Display the dataset (the rendered output abbreviates the middle rows of the frame)
print("Rows of the dataset:")
raw_mail_dataset

Rows of the dataset:


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
# Check the dimensions of the dataset
raw_mail_dataset.shape    # (number of rows, number of columns)

(5572, 2)

In [9]:
# Swap every missing entry for an empty string. The result is a new DataFrame;
# raw_mail_dataset itself is not modified.
mail_dataset = raw_mail_dataset.mask(raw_mail_dataset.isna(), '')

In [10]:
# Preview the first five rows (this shows raw_mail_dataset, i.e. the data as loaded)
print("First 5 rows of the dataset:")
raw_mail_dataset.head()

First 5 rows of the dataset:


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Count the null entries in each column of the raw dataset
missing_values = raw_mail_dataset.isna().sum()
print("\nMissing values in each column:", missing_values, sep="\n")


Missing values in each column:
Category    0
Message     0
dtype: int64


**Encoding the Categorical column**

* **Spam mail --->  0**
* **Ham mail  --->  1**


In [12]:
# Encode the text labels as numbers in place: spam -> 0, ham -> 1
mail_dataset.loc[mail_dataset['Category'].eq('spam'), 'Category'] = 0
mail_dataset.loc[mail_dataset['Category'].eq('ham'), 'Category'] = 1

**Splitting the dataset into Text & Target**

In [13]:
# Separate the message text (model input) from the encoded label (model target)
X, y = mail_dataset['Message'], mail_dataset['Category']

In [14]:
# Preview the first rows of the text series and of the target series
print("Text (X):", X.head(), sep="\n")
print("\nTarget (y):", y.head(), sep="\n")

Text (X):
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

Target (y):
0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: object


# **Splitting the dataset into Training and Test sets**

In [15]:
# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=3,
    test_size=0.25,
)
print(
    "\nDataset split completed:",
    f"Total samples: {X.shape[0]}, Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}",
    sep="\n",
)


Dataset split completed:
Total samples: 5572, Training samples: 4179, Test samples: 1393


In [16]:
# Compare the sizes of the full, training and test sets
print(X.shape, X_train.shape, X_test.shape)

(5572,) (4179,) (1393,)


**Feature Extraction**

In [17]:
# Cast the label series to integers so they can be used as classifier targets
y_train, y_test = y_train.astype('int'), y_test.astype('int')

# TF-IDF vectorizer that lowercases the text and drops English stop words;
# its output is the feature matrix fed to the Logistic Regression model
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit on the training messages only, and transform them in the same step
X_train_features = feature_extraction.fit_transform(X_train)

# Transform the test messages with what was fitted on the training messages
X_test_features = feature_extraction.transform(X_test)

In [18]:
# Print the training feature matrix; the output lists (row, column) positions with their TF-IDF weights
print(f"Training features:\n {X_train_features}")

Training features:
   (0, 3159)	0.40224633321456266
  (0, 3217)	0.4076845224125937
  (0, 2947)	0.34881124809412434
  (0, 2013)	0.35409440010605225
  (0, 1220)	0.5031131815237931
  (0, 4430)	0.41450719317548096
  (1, 3046)	0.44455340400970733
  (1, 4548)	0.4970267673802078
  (1, 6429)	0.3426231345185736
  (1, 6893)	0.4663318175217499
  (1, 3775)	0.4695537106130582
  (2, 1705)	0.400124925970091
  (2, 2544)	0.42459495566937844
  (2, 6719)	0.47989357686704015
  (2, 4851)	0.47989357686704015
  (2, 6598)	0.4461204736403209
  (3, 1584)	0.22650362164633256
  (3, 2476)	0.31787362094378147
  (3, 5859)	0.20421601259224467
  (3, 6753)	0.3029646718908487
  (3, 5011)	0.3029646718908487
  (3, 5995)	0.31787362094378147
  (3, 4792)	0.17630372185186377
  (3, 4497)	0.18714349878479064
  (3, 2401)	0.31787362094378147
  :	:
  (4176, 2744)	0.4713828729564554
  (4176, 2154)	0.4713828729564554
  (4177, 3775)	0.31339019758210673
  (4177, 5868)	0.22826489568398323
  (4177, 6663)	0.1968309422132608
  (4177, 4288

In [19]:
# Report the shapes of both feature matrices; they should have the same number of columns
print("Feature extraction completed:")
print(f"Training features shape: {X_train_features.shape}, Test features shape: {X_test_features.shape}")

Feature extraction completed:
Training features shape: (4179, 7166), Test features shape: (1393, 7166)


# **Model Training**

In [20]:
# Create the Logistic Regression classifier (no arguments passed, so library defaults apply)
logReg_model = LogisticRegression()

In [21]:
# Train the Logistic Regression model on the TF-IDF training features and their integer labels
logReg_model.fit(X_train_features, y_train)

# **Model Evaluation**

In [22]:
# Calculate accuracy on the training data.
# X_train_pred holds the labels the model predicts for the training messages.
X_train_pred = logReg_model.predict(X_train_features)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second, matching the argument order of the other metric calls in this notebook
train_data_accuracy = accuracy_score(y_train, X_train_pred)
print('Accuracy on training data : ', train_data_accuracy)

Accuracy on training data :  0.9679349126585307


In [23]:
# Confusion matrix for the training data: counts of correct and incorrect
# predictions per class (true labels passed first, predicted labels second)
conf_matrix = confusion_matrix(y_train, X_train_pred)
print("\nConfusion Matrix:", conf_matrix, sep="\n")


Confusion Matrix:
[[ 420  131]
 [   3 3625]]


In [24]:
# Classification report for the training data: precision, recall, F1-score
# and support for each class
class_report = classification_report(
    y_train,
    X_train_pred,
    target_names=["Spam Mail (0)", "Ham Mail (1)"],
)
print("\nClassification Report:", class_report, sep="\n")


Classification Report:
               precision    recall  f1-score   support

Spam Mail (0)       0.99      0.76      0.86       551
 Ham Mail (1)       0.97      1.00      0.98      3628

     accuracy                           0.97      4179
    macro avg       0.98      0.88      0.92      4179
 weighted avg       0.97      0.97      0.97      4179



In [25]:
# Calculate accuracy on the held-out test data.
# X_test_pred holds the labels the model predicts for the test messages.
X_test_pred = logReg_model.predict(X_test_features)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second, matching the argument order of the other metric calls in this notebook
test_data_accuracy = accuracy_score(y_test, X_test_pred)
print('Accuracy on test data : ', test_data_accuracy)

Accuracy on test data :  0.964824120603015


In [26]:
# Confusion matrix for the test data: counts of correct and incorrect
# predictions per class (true labels passed first, predicted labels second)
conf_matrix = confusion_matrix(y_test, X_test_pred)
print("\nConfusion Matrix:", conf_matrix, sep="\n")


Confusion Matrix:
[[ 147   49]
 [   0 1197]]


In [27]:
# Classification report for the test data: precision, recall, F1-score
# and support for each class
class_report = classification_report(
    y_test,
    X_test_pred,
    target_names=["Spam Mail (0)", "Ham Mail (1)"],
)
print("\nClassification Report:", class_report, sep="\n")


Classification Report:
               precision    recall  f1-score   support

Spam Mail (0)       1.00      0.75      0.86       196
 Ham Mail (1)       0.96      1.00      0.98      1197

     accuracy                           0.96      1393
    macro avg       0.98      0.88      0.92      1393
 weighted avg       0.97      0.96      0.96      1393



# **Making a Predictive System**

In [28]:
# New message to classify, wrapped in a list as the vectorizer expects
input_mail = ["Even my brother is not like to speak with me. They treat me like aids patent."]

# Vectorize with the fitted TF-IDF extractor, then classify
input_data_features = feature_extraction.transform(input_mail)
prediction = logReg_model.predict(input_data_features)
print(prediction)

# Label 0 means spam; any other label is reported as ham
print(
    "The Mail is Fake  (i.e. Spam Mail)"
    if prediction[0] == 0
    else "The Mail is Real  (i.e. Ham Mail)"
)


[1]
The Mail is Real  (i.e. Ham Mail)


In [29]:
# New message to classify, wrapped in a list as the vectorizer expects
input_mail = ["Congratulations! You've won a free ticket."]

# Vectorize with the fitted TF-IDF extractor, then classify
input_data_features = feature_extraction.transform(input_mail)
prediction = logReg_model.predict(input_data_features)
print(prediction)

# Label 0 means spam; any other label is reported as ham
print(
    "The Mail is Fake  (i.e. Spam Mail)"
    if prediction[0] == 0
    else "The Mail is Real  (i.e. Ham Mail)"
)

[1]
The Mail is Real  (i.e. Ham Mail)


In [30]:
# New message to classify, wrapped in a list as the vectorizer expects
input_mail = ["18 days to Euro2004 kickoff! U will be kept informed of all the latest news and results daily. Unsubscribe send GET EURO STOP to 83222."]

# Vectorize with the fitted TF-IDF extractor, then classify
input_data_features = feature_extraction.transform(input_mail)
prediction = logReg_model.predict(input_data_features)
print(prediction)

# Label 0 means spam; any other label is reported as ham
print(
    "The Mail is Fake  (i.e. Spam Mail)"
    if prediction[0] == 0
    else "The Mail is Real  (i.e. Ham Mail)"
)

[0]
The Mail is Fake  (i.e. Spam Mail)


In [41]:
# Relies on the fitted `feature_extraction` vectorizer and the trained
# `logReg_model` classifier created in the cells above
def predict_spam_mail(input_mail):
    """
    Predicts whether an email is spam or ham using the trained Logistic Regression model.

    Parameters:
        input_mail (list or str): A list whose first element is the email
            message, or the message itself as a plain string.

    Returns:
        str: "The Mail is Fake  (i.e. Spam Mail)" when the model predicts 0,
            "The Mail is Real  (i.e. Ham Mail)" for any other prediction, or
            "Error: Please enter a valid message." when no usable message
            was supplied.
    """

    # A bare string must be wrapped in a list: indexing it would inspect only
    # its first character, and the vectorizer expects an iterable of documents
    if isinstance(input_mail, str):
        input_mail = [input_mail]

    # Reject missing, empty, non-text or whitespace-only input
    if (
        not input_mail
        or not isinstance(input_mail[0], str)
        or input_mail[0].strip() == ""
    ):
        return "Error: Please enter a valid message."

    # Convert the input into a feature vector using the trained TfidfVectorizer
    input_data_features = feature_extraction.transform(input_mail)

    # Make the prediction using the trained Logistic Regression model
    prediction = logReg_model.predict(input_data_features)

    # Interpret and return the result (label 0 = spam, anything else = ham)
    if prediction[0] == 0:
        return "The Mail is Fake  (i.e. Spam Mail)"
    else:
        return "The Mail is Real  (i.e. Ham Mail)"

In [42]:
def gradio_predict(Message):
    """
    Takes user input from Gradio and predicts if the message is Spam or Ham.

    Parameters:
        Message: The value received from the Gradio input component. It is
            converted to text with str(); None is treated as an empty message.

    Returns:
        The result of predict_spam_mail for the single-message list.
    """
    # str(None) would produce the literal text "None", which would then be
    # classified like a real message; map a missing value to "" instead
    message_text = "" if Message is None else str(Message)

    # Wrap the text in a list (required format for TfidfVectorizer) and
    # delegate to the prediction function
    return predict_spam_mail([message_text])

# Build the Gradio UI: a two-line message box as input, a text box for the
# prediction as output, both wired to gradio_predict
interface = gr.Interface(
    title="Spam Mail Prediction",
    description="Enter the required Mail to predict if a message is Spam or Ham.",
    fn=gradio_predict,
    inputs=gr.Textbox(label="Message", lines=2, placeholder="Enter your message here..."),
    outputs=gr.Textbox(label="Prediction Result"),
)

# Start serving the interface
interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0c549bd4c83e065b2b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


