### 1. Function to extract content from file

In [47]:
import email
import os

def extract_content(msg):
    content = None

    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            content_disposition = str(part.get("Content-Disposition"))

            # Ignore any attachments
            if content_type == "text/plain" and "attachment" not in content_disposition:
                content = part.get_payload(decode=True).decode(part.get_content_charset())
                break
    else:
        content = msg.get_payload(decode=True).decode(msg.get_content_charset())

    return content

def get_email_content(eml_file_path):
    with open(eml_file_path, 'r') as eml_file:
        msg = email.message_from_file(eml_file)

        # Extract email content
        content = extract_content(msg)
        return content

# Example usage
eml_file_path = '../Example_emails/email_1.eml'
if os.path.exists(eml_file_path):
    email_content = get_email_content(eml_file_path)
    if email_content:
        print("Email Content:")
        print(email_content)
    else:
        print("No email content found.")
else:
    print("File not found.")


Email Content:
This is the link: Displayedlinl.com <https://www.wikihow.com/Main-Page>

This is the txt attachment:

This is an image !
[image: how_emails_work.png]



### 2. Languaged based Analysis - identifying spelling mistakes, generic greetings...

In [48]:
# pip install textblob
# pip install word_tokenize
# pip install nltk
# nltk.download('punkt')

In [49]:
import nltk
from textblob import TextBlob
import re
from nltk.tokenize import word_tokenize

def analyze_text(text):
    # Initialize dictionary to store results
    analysis_results = {
        "Generic Greetings Detected": [],
        "Spelling Mistakes Detected": [],
        "Urgent Language Occurrence Detected": [],
        "Personal Information Inquiry Detected": []
    }

    # Detecting generic greetings
    generic_greetings = ['hello', 'hi', 'hey', 'good morning', 'good afternoon', 'good evening']
    greetings_detected = [word for word in word_tokenize(text.lower()) if word in generic_greetings]
    analysis_results["Generic Greetings Detected"] = greetings_detected

    # Finding spelling and grammar mistakes
    blob = TextBlob(text)
    spelling_mistakes = [word for word in blob.words if word.lower() != TextBlob(word).correct()]
    analysis_results["Spelling Mistakes Detected"] = spelling_mistakes

    # Finding the occurrence of urgent language
    urgent_keywords = ['urgent', 'emergency', 'important', 'asap', 'as soon as possible']
    urgent_occurrence = [keyword for keyword in urgent_keywords if keyword in text.lower()]
    analysis_results["Urgent Language Occurrence Detected"] = urgent_occurrence

    # Finding if the text asks personal information
    personal_info_regex = r'\b(?:name|address|phone|email|social security|credit card|password)\b'
    personal_info_detected = re.findall(personal_info_regex, text.lower())
    analysis_results["Personal Information Inquiry Detected"] = personal_info_detected

    return analysis_results

# Example usage
text = """
Hello there! We hope you're doing well.
This is an urgent message regarding your account.
Please provide your name, address, and phone number as soon as possible.
Thank you and have a great day!
"""
results = analyze_text(text)
print(results)


{'Generic Greetings Detected': ['hello'], 'Spelling Mistakes Detected': ['Hello', 'We', 'This', 'Please', 'Thank'], 'Urgent Language Occurrence Detected': ['urgent', 'as soon as possible'], 'Personal Information Inquiry Detected': ['name', 'address', 'phone']}


### 3. Context-based Analysis

In [None]:
# This works - better to test it directly on Google Colab
from transformers import pipeline

def classify(text):
    classifier = pipeline("zero-shot-classification")
    candidate_labels = ["prizes and giveaways", "job opportunities", "banking", "update password request"]
    result = classifier(text, candidate_labels)
    labels = result["labels"]
    scores = result["scores"]
    for label, score in zip(labels, scores):
        print(f"{label}: {score}")

# Example text
text = """
This is a new job for you!
"""

classify(text)


In [5]:
import requests

def classify(text):
    headers = {"Authorization": "Bearer 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"}
    payload = {
        "inputs": text,
        "parameters": {
            "candidate_labels": ["prizes and giveaways", "job opportunities", "banking", "update password request"]
        }
    }
    response = requests.post("https://api-inference.huggingface.co/models/typeform/distilbert-base-uncased-mnli", headers=headers, json=payload)
    result = response.json()
    labels = result["labels"]
    scores = result["scores"]
    for label, score in zip(labels, scores):
        print(f"{label}: {score}")

# Example text
text = """
High-severity alert! Please look at your account and update !

Hello there ! Please find the attached document for your reference. This is an urgent request, we need to get this done as soon as possible as your credentials have been tampered with! Please provide your name, address, and phone number. Please hurry!

You can access your credentials directly on this website: www.outlook.com/update-user-link. 

If you have any questions, you can refer to www.outlook.com !

PS: open the document to update your credentials faster !! 
"""

classify(text)


update password request: 0.9863448143005371
banking: 0.005188442301005125
job opportunities: 0.0044320616871118546
prizes and giveaways: 0.0040346370078623295


### 4. Spam / Not Spam classification

In [51]:
import pandas as pd
data = pd.read_csv('https://raw.githubusercontent.com/OmkarPathak/Playing-with-datasets/master/Email%20Spam%20Filtering/emails.csv')
data.shape

(5728, 2)

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import pickle

# 1. Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/OmkarPathak/Playing-with-datasets/master/Email%20Spam%20Filtering/emails.csv')

# 2. Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['spam'], test_size=0.2, random_state=42)

# 3. Create a pipeline with TF-IDF vectorizer and a classifier (e.g., Multinomial Naive Bayes)
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# 4. Train the model
pipeline.fit(X_train, y_train)

# 5. Evaluate the model
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)

# 6. Export model and TF-IDF vectorizer using pickle
with open('spam_classifier.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

# Export TF-IDF vectorizer separately
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(pipeline.named_steps['tfidf'], f)

Accuracy: 0.8516579406631762


In [None]:
def classify_subject(subject, tfidf_vectorizer):
    # Importing the saved model
    with open('spam_classifier_model.pkl', 'rb') as f:
        loaded_model = pickle.load(f)

    # Transform the subject into a vector
    print(subject)
    example_vector = tfidf_vectorizer.transform([subject]).toarray()

    # Prediction
    prediction = loaded_model.predict(example_vector)
    return "Spam" if prediction[0] == 1 else "Not spam"


In [2]:
import pickle

# Load TF-IDF vectorizer
with open('tfidf.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

# Load the model
with open('spam_classifier_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Example sentence to test
# example_sentence = "Get a free cruise trip! Claim now!"
example_sentence = "Hello there ! Please find the attached document for your reference. This is an urgent request, we need to get this done as soon as possible as your credentials have been tampered with! Please provide your name, address, and phone number. Please hurry!"

# Transform the example sentence using the TF-IDF vectorizer
example_vector = tfidf_vectorizer.transform([example_sentence]).toarray()

# Predict whether the example sentence is spam or not
prediction = model.predict(example_vector)

# Print the prediction
if prediction[0] == 1:
    print(f'"{example_sentence}" is spam.')
else:
    print(f'"{example_sentence}" is not spam.')

"Hello there ! Please find the attached document for your reference. This is an urgent request, we need to get this done as soon as possible as your credentials have been tampered with! Please provide your name, address, and phone number. Please hurry!" is spam.


