# **Import librares**

In [29]:
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf

### **Load Dataset**

In [30]:
df=pd.read_csv("/content/big_email_data.csv", engine='python', encoding='UTF-8')
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Corporate inquiries,495
Student inquiries,486
Academic collaboration inquiries,445


# **Preprocessing**

In [31]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
df['lower_case'] = df['email_content'].apply(lambda x: x.lower().strip().replace('\n', ' ').replace('\r', ' '))

df['alphabatic'] = df['lower_case'].apply(lambda x: re.sub(r'[^a-zA-Z\']', ' ', x)).apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
df['without-link'] = df['alphabatic'].apply(lambda x: re.sub(r'http\S+', '', x))

tokenizer = RegexpTokenizer(r'\w+')
df['Special_word'] = df.apply(lambda row: tokenizer.tokenize(row['lower_case']), axis=1)

stop = [word for word in stopwords.words('english') if word not in ["my","haven't","aren't","can","no", "why", "through", "herself", "she", "he", "himself", "you", "you're", "myself", "not", "here", "some", "do", "does", "did", "will", "don't", "doesn't", "didn't", "won't", "should", "should've", "couldn't", "mightn't", "mustn't", "shouldn't", "hadn't", "wasn't", "wouldn't"]]

df['stop_words'] = df['Special_word'].apply(lambda x: [item for item in x if item not in stop])
df['stop_words'] = df['stop_words'].astype('str')

df['short_word'] = df['stop_words'].str.findall('\w{2,}')
df['string']=df['short_word'].str.join(' ')

df['Text'] = df['string'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [33]:
df

Unnamed: 0,email_content,label,lower_case,alphabatic,without-link,Special_word,stop_words,short_word,string,Text
0,"Dear Professor Smith,I hope this message finds...",Student inquiries,"dear professor smith,i hope this message finds...",dear professor smith i hope this message finds...,dear professor smith i hope this message finds...,"[dear, professor, smith, i, hope, this, messag...","['dear', 'professor', 'smith', 'hope', 'messag...","[dear, professor, smith, hope, message, finds,...",dear professor smith hope message finds you we...,dear professor smith hope message find you wel...
1,"Hello Dr. Johnson,I am writing to inquire abou...",Student inquiries,"hello dr. johnson,i am writing to inquire abou...",hello dr johnson i am writing to inquire abou...,hello dr johnson i am writing to inquire abou...,"[hello, dr, johnson, i, am, writing, to, inqui...","['hello', 'dr', 'johnson', 'writing', 'inquire...","[hello, dr, johnson, writing, inquire, course,...",hello dr johnson writing inquire course materi...,hello dr johnson writing inquire course materi...
2,"Dear Ms. Brown,I am a graduate student in the ...",Student inquiries,"dear ms. brown,i am a graduate student in the ...",dear ms brown i am a graduate student in the ...,dear ms brown i am a graduate student in the ...,"[dear, ms, brown, i, am, a, graduate, student,...","['dear', 'ms', 'brown', 'graduate', 'student',...","[dear, ms, brown, graduate, student, environme...",dear ms brown graduate student environmental s...,dear m brown graduate student environmental sc...
3,"Hi Professor Garcia,I hope you are doing well....",Student inquiries,"hi professor garcia,i hope you are doing well....",hi professor garcia i hope you are doing well ...,hi professor garcia i hope you are doing well ...,"[hi, professor, garcia, i, hope, you, are, doi...","['hi', 'professor', 'garcia', 'hope', 'you', '...","[hi, professor, garcia, hope, you, well, curre...",hi professor garcia hope you well currently wo...,hi professor garcia hope you well currently wo...
4,"Dear Dr. Wilson,I hope you're having a great d...",Student inquiries,"dear dr. wilson,i hope you're having a great d...",dear dr wilson i hope you're having a great d...,dear dr wilson i hope you're having a great d...,"[dear, dr, wilson, i, hope, you, re, having, a...","['dear', 'dr', 'wilson', 'hope', 'you', 'great...","[dear, dr, wilson, hope, you, great, day, want...",dear dr wilson hope you great day wanted ask y...,dear dr wilson hope you great day wanted ask y...
...,...,...,...,...,...,...,...,...,...,...
1421,"Dear Dr. Harris, I am with a consulting firm i...",Corporate inquiries,"dear dr. harris, i am with a consulting firm i...",dear dr harris i am with a consulting firm i...,dear dr harris i am with a consulting firm i...,"[dear, dr, harris, i, am, with, a, consulting,...","['dear', 'dr', 'harris', 'consulting', 'firm',...","[dear, dr, harris, consulting, firm, intereste...",dear dr harris consulting firm interested disc...,dear dr harris consulting firm interested disc...
1422,"Dear Dr. Brown, I hope you're having a good da...",Student inquiries,"dear dr. brown, i hope you're having a good da...",dear dr brown i hope you're having a good da...,dear dr brown i hope you're having a good da...,"[dear, dr, brown, i, hope, you, re, having, a,...","['dear', 'dr', 'brown', 'hope', 'you', 'good',...","[dear, dr, brown, hope, you, good, day, wanted...",dear dr brown hope you good day wanted inquire...,dear dr brown hope you good day wanted inquire...
1423,"Dear Dr. Patel, I am interested in collaborati...",Academic collaboration inquiries,"dear dr. patel, i am interested in collaborati...",dear dr patel i am interested in collaborati...,dear dr patel i am interested in collaborati...,"[dear, dr, patel, i, am, interested, in, colla...","['dear', 'dr', 'patel', 'interested', 'collabo...","[dear, dr, patel, interested, collaborating, r...",dear dr patel interested collaborating researc...,dear dr patel interested collaborating researc...
1424,"Dear HOD, I represent a nonprofit organization...",Corporate inquiries,"dear hod, i represent a nonprofit organization...",dear hod i represent a nonprofit organization...,dear hod i represent a nonprofit organization...,"[dear, hod, i, represent, a, nonprofit, organi...","['dear', 'hod', 'represent', 'nonprofit', 'org...","[dear, hod, represent, nonprofit, organization...",dear hod represent nonprofit organization look...,dear hod represent nonprofit organization look...


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

x_train, x_test, y_train, y_test = train_test_split(df["Text"],df["label"], test_size = 0.25, random_state = 42)
count_vect = CountVectorizer(ngram_range=(1, 2))
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)
x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

print (x_train_tfidf.shape,x_test_tfidf.shape, y_train.shape, y_test.shape)

(1069, 6156) (357, 6156) (1069,) (357,)


In [36]:
joblib.dump(count_vect, 'count_vect.pkl')
#model = joblib.load('count_vect.pkl')

['count_vect.pkl']



## **Naive Bayes(Multinomial)**

In [40]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

class MultinomialNB1:
    def __init__(self):
        self.class_priors = {}
        self.word_probs = {}
        self.vocabulary = set()
        self.total_words_per_class = {}

    def fit(self, X, y):

        self.classes = np.unique(y)
        n_docs = len(y)


        for c in self.classes:
            n_class_docs = np.sum(y == c)
            self.class_priors[c] = n_class_docs / n_docs


        for c in self.classes:

            class_docs = X[y == c]
            total_words = sum([len(doc.split()) for doc in class_docs])
            self.total_words_per_class[c] = total_words


            word_count = {}
            for doc in class_docs:
                for word in doc.split():
                    if word in word_count:
                        word_count[word] += 1
                    else:
                        word_count[word] = 1


            self.word_probs[c] = {word: (count + 1) / (total_words + len(word_count))
                                   for word, count in word_count.items()}
            self.vocabulary.update(word_count.keys())

    def predict(self, X):
        predictions = []
        for doc in X:
            class_probs = {}
            for c in self.classes:

                class_prob = np.log(self.class_priors[c])
                for word in doc.split():

                    word_prob = self.word_probs[c].get(word, 1 / (self.total_words_per_class[c] + len(self.vocabulary)))
                    class_prob += np.log(word_prob)
                class_probs[c] = class_prob


            predictions.append(max(class_probs, key=class_probs.get))
        return np.array(predictions)


if __name__ == "__main__":

    data = pd.read_csv('big_email_data.csv')
    X = data['email_content'].values
    y = data['label'].values


    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)


    X_train_dense = X_train_tfidf.toarray()
    X_test_dense = X_test_tfidf.toarray()

    mnb = MultinomialNB1()
    mnb.fit(X_train, y_train)


    y_pred = mnb.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

Accuracy: 0.965034965034965
                                  precision    recall  f1-score   support

Academic collaboration inquiries       0.95      0.94      0.95        86
             Corporate inquiries       0.95      0.95      0.95       101
               Student inquiries       0.99      1.00      0.99        99

                        accuracy                           0.97       286
                       macro avg       0.96      0.96      0.96       286
                    weighted avg       0.96      0.97      0.96       286



In [17]:
scores = cross_val_score(mnb, x_train_tfidf,y_train, cv=10)
print(accuracy_score(y_test,y_pred3))
print ("Cross-validated scores:", scores)

0.9495798319327731
Cross-validated scores: [0.93457944 0.94392523 0.91588785 0.93457944 0.91588785 0.98130841
 0.93457944 0.92523364 0.91588785 0.94339623]


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report


file_path = '/content/big_email_data.csv'
data = pd.read_csv(file_path)


data = data[['email_content', 'label']]
data.dropna(inplace=True)
X = data['email_content'].values
y = data['label'].values


label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)


max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=max_len)


X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)


model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(y_categorical.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

accuracy = accuracy_score(y_test_classes, y_pred_classes)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_test_classes, y_pred_classes, target_names=label_encoder.classes_))

Epoch 1/10




[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 267ms/step - accuracy: 0.5559 - loss: 0.9814 - val_accuracy: 0.8947 - val_loss: 0.4257
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 328ms/step - accuracy: 0.9019 - loss: 0.3288 - val_accuracy: 0.9254 - val_loss: 0.2267
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 325ms/step - accuracy: 0.9607 - loss: 0.1329 - val_accuracy: 0.9430 - val_loss: 0.1958
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 253ms/step - accuracy: 0.9740 - loss: 0.0999 - val_accuracy: 0.9342 - val_loss: 0.1743
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 282ms/step - accuracy: 0.9849 - loss: 0.0633 - val_accuracy: 0.9430 - val_loss: 0.1745
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 316ms/step - accuracy: 0.9966 - loss: 0.0268 - val_accuracy: 0.9518 - val_loss: 0.1830
Epoch 7/10
[1m29/29[0m [32m━━━━

In [22]:
new_email = "Dear HOD, I am interested in collaborating on research regarding the impact of technology on learning. Would you be available for a discussion? Thank you! Best, Ava Young"

new_email_sequence = tokenizer.texts_to_sequences([new_email])
new_email_padded = pad_sequences(new_email_sequence, maxlen=max_len)

prediction = model.predict(new_email_padded)
predicted_class = np.argmax(prediction, axis=1)

predicted_label = label_encoder.inverse_transform(predicted_class)
print(f"The email is classified as: {predicted_label[0]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
The email is classified as: Academic collaboration inquiries
