In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score
from imblearn.over_sampling import SMOTE


In [2]:
# Sample data
# Two parallel lists of 57 entries each: 'body' holds the email text and
# 'urgency' holds the label for the email at the same position. The pairing is
# purely positional, so inserting or removing an entry in one list requires the
# matching change in the other (pd.DataFrame rejects columns of unequal length).
data = {
    'body': [
        'Hi John, Please send me the report by tomorrow.',
        'Reminder: Staff meeting at 3 pm today',
        'Urgent: Your account needs verification',
        'Hi Alice, I need your help with the presentation',
        'Meeting with the client at 10 am tomorrow',
        'Reminder: Your project deadline is due next week',
        'Can we meet at the coffee shop in the evening',
        "Hi, just wanted to check in and see how you're doing. Let me know if you need anything!",
        "Reminder: Our team meeting is scheduled for next Wednesday at 2pm.",
        "Thanks for your email. I'll get back to you as soon as possible.",
        "Just wanted to follow up on the project we discussed. Let me know if you have any questions.",
        "I'm out of the office today, but I'll be back tomorrow. If you need immediate assistance, please contact my colleague, John.",
        "Urgent: Important security update required for your account",
        "Invitation: Company-wide picnic this Saturday",
        "Your order has shipped!",
        "Hi there, just wanted to introduce myself",
        "Follow up: Job application status",
        "Urgent: Payment required on overdue account",
        "Reminder: Parent-teacher conference tomorrow",
        "Looking forward to our meeting tomorrow",
        "Urgent: Action required on your recent purchase",
        "Feedback requested: How was your experience?",
        "Invitation: Charity auction this Friday",
        "Congratulations on your promotion!",
        "Urgent: Your help is needed for a time-sensitive matter",
        "Reminder: Rent due in 2 days",
        "Thank you for your recent purchase!",
        "Quick question about the project",
        "Urgent: Change your password immediately",
        "Don't forget to complete your survey!",
        "Reminder: Doctor's appointment next Wednesday",
        "Urgent: Update your account information",
        "Confirmation: Your flight has been booked",
        "Welcome to our service!",
        "Reminder: Volunteer orientation tomorrow",
        "Urgent: Your assistance is needed for a critical issue",
        "Important notice regarding your account",
        "Don't miss our sale this weekend!",
        "Reminder: Submit your timesheet by Friday",
        "Urgent: Your action required for upcoming deadline",
        "Invitation: Networking event next Thursday",
        "Thank you for your donation!",
        "Update on the project",
        "Urgent: Unauthorized access detected on your account",
        "Reminder: Dentist appointment next Tuesday",
        "Happy holidays from our team!",
        "Urgent: Your account has been compromised",
        "Confirmation: Your hotel reservation is confirmed",
        "Reminder: Submit your expense report by Friday",
        "New feature announcement: Try it out now!",
        "Urgent: Issue detected with your recent purchase",
        "Invitation: Fundraiser gala this Saturday",
        "Hello",
        "Congratulations you have completed your course",
        "Grab this offer and get 25%discount",
        "We will be having a group meeting tomeit is Mandatory for everyone",
        "I heard that you are not feeling well, Take care"
        
    ],
    # One label per entry of 'body', in the same order; the only two values
    # used are 'urgent' and 'not urgent'.
    'urgency': [
        'urgent', 'urgent', 'urgent',
        'not urgent', 'urgent', 'urgent', 'not urgent', 'not urgent', 'urgent', 'not urgent', 'not urgent', 'not urgent',
        'urgent', 'not urgent', 'not urgent', 'not urgent', 'not urgent', 'urgent', 'not urgent', 'not urgent', 'not urgent', 
        'urgent', 'not urgent', 'urgent', 'not urgent', 'not urgent', 'urgent', 'urgent', 'not urgent', 'not urgent',
        'urgent', 'not urgent', 'not urgent', 'urgent', 'urgent', 'not urgent', 'urgent', 'not urgent', 'not urgent', 'not urgent',
        'urgent', 'not urgent', 'not urgent', 'not urgent', 'not urgent', 'urgent', 'not urgent', 'not urgent', 'not urgent', 
        'urgent', 'urgent', 'urgent','not urgent', 'not urgent', 'not urgent', 'urgent', 'not urgent']
}

# Build a two-column frame (body, urgency); the bare `df` on the last line makes
# the notebook render it as the cell output.
df = pd.DataFrame(data)
df


Unnamed: 0,body,urgency
0,"Hi John, Please send me the report by tomorrow.",urgent
1,Reminder: Staff meeting at 3 pm today,urgent
2,Urgent: Your account needs verification,urgent
3,"Hi Alice, I need your help with the presentation",not urgent
4,Meeting with the client at 10 am tomorrow,urgent
5,Reminder: Your project deadline is due next week,urgent
6,Can we meet at the coffee shop in the evening,not urgent
7,"Hi, just wanted to check in and see how you're...",not urgent
8,Reminder: Our team meeting is scheduled for ne...,urgent
9,Thanks for your email. I'll get back to you as...,not urgent


In [3]:
# Preprocess the email body text

# Words dropped during preprocessing. Built once (not on every call) and stored
# as a frozenset so the membership test does not scan a list.
_STOPWORDS = frozenset(
    ['the', 'a', 'an', 'is', 'are', 'was', 'were', 'will', 'would', 'should']
)

# Matches every character that is neither an ASCII letter nor whitespace.
# Raw string: '\s' in a plain string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Normalize an email body for bag-of-words feature extraction.

    The text is lowercased, every character other than ASCII letters and
    whitespace is deleted (not replaced by a space, so "Company-wide" becomes
    "companywide"), and the words listed in ``_STOPWORDS`` are removed.

    Args:
        text: The raw email body as a ``str``.

    Returns:
        The cleaned text with the remaining words joined by single spaces.
        An input with no remaining words yields an empty string.
    """
    letters_only = _NON_LETTER_RE.sub('', text.lower())
    # split() with no argument also collapses runs of whitespace left behind
    # by the deleted characters.
    return ' '.join(word for word in letters_only.split() if word not in _STOPWORDS)

In [4]:
# Sanity check: how many rows carry a (non-null) urgency label.
# Series.count() counts exactly the non-null entries.
labelled_row_count = df['urgency'].count()
print(labelled_row_count)


57


In [5]:
# Keep only the rows that carry a label. The same boolean mask is used for the
# features and the target so the two always stay aligned row-for-row.
# (Previously X was a one-element list wrapping the mask itself, which is not
# usable as a feature set.)
has_label = df['urgency'].notnull()
X = df.loc[has_label, 'body']      # raw email text of the labelled rows
y = df.loc[has_label, 'urgency']   # 'urgent' / 'not urgent' labels

In [6]:
# Preprocess the email body text into a NEW Series. df['body'] is deliberately
# left untouched so the raw text shown earlier in the notebook stays available.
# Selecting by y.index guarantees one feature row per label, even if some rows
# of df had no label.
bodies_clean = df.loc[y.index, 'body'].apply(preprocess_text)

# Split the text into a training set and a testing set BEFORE extracting
# features, so nothing about the test emails is visible during fitting
bodies_train, bodies_test, y_train, y_test = train_test_split(
    bodies_clean, y, test_size=0.1, random_state=52
)

# Create a count vectorizer and learn its vocabulary from the training text only
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(bodies_train)

# Encode the test text with the already-fitted vocabulary (words unseen during
# training are ignored by transform)
X_test = vectorizer.transform(bodies_test)

# Feature matrix for all labelled emails, kept under its original name
X = vectorizer.transform(bodies_clean)

# Create a logistic regression model to predict the urgency of the email
model = LogisticRegression()

# Train the model on the training set
model.fit(X_train, y_train)

# Predict the urgency of the emails in the testing set
y_pred = model.predict(X_test)

In [7]:
# Evaluate the performance of the model using accuracy, precision, and ROC AUC score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='urgent')

# predict_proba returns one column per entry of model.classes_, in that order.
# Look the 'urgent' column up explicitly instead of assuming it is column 1,
# and score against an explicit "is urgent" target so the positive class is
# unambiguous.
urgent_col = list(model.classes_).index('urgent')
urgent_scores = model.predict_proba(X_test)[:, urgent_col]
roc_auc = roc_auc_score(y_test == 'urgent', urgent_scores)

# Print the confusion matrix and the performance metrics.
# Rows are true labels and columns are predicted labels, both ordered as in
# model.classes_; passing labels= keeps the matrix the same shape even when a
# class happens to be absent from the test split.
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred, labels=model.classes_))
print('Accuracy:', accuracy)
print('Precision:', precision)
print('ROC AUC score:', roc_auc)

Confusion matrix:
[[4 0]
 [1 1]]
Accuracy: 0.8333333333333334
Precision: 1.0
ROC AUC score: 0.625


In [8]:
def predict_urgency(body):
    """Classify a single email body with the trained model.

    The text goes through the same steps as the training data: cleaning with
    ``preprocess_text``, encoding with the module-level ``vectorizer``, then
    prediction with the module-level ``model``.

    Args:
        body: Raw email text.

    Returns:
        The first (and only) element of the model's prediction for this text.
    """
    cleaned = preprocess_text(body)
    # transform() expects an iterable of documents, hence the one-element list.
    features = vectorizer.transform([cleaned])
    predictions = model.predict(features)
    return predictions[0]


In [9]:
# Interactive demo: read one email body from the user, classify it with
# predict_urgency, and print the predicted label.
# NOTE(review): input() blocks until something is typed, so this cell will
# stall an unattended "Run All".
body = input('Enter email body:')
urgency = predict_urgency(body)
print('Predicted urgency:', urgency)


Enter email body: Hi


Predicted urgency: not urgent
