<a href="https://colab.research.google.com/github/ZohebAbai/Deep-Learning-Projects/blob/master/Emails_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Mbox data

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored there can be copied by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Copy the mbox export from the mounted Drive into the current directory.
# The source path is relative, so this assumes the working directory is /content.
!cp 'drive/My Drive/Dataset/NLP Assignment Dataset/mbox_dataset/All mail Including Spam and Trash.mbox' .

## Extract the emails from the mbox file into CSV format

In [0]:
import mailbox
import csv
from email.header import decode_header
#please join tinyurl.com/nlp-tsai for joining the group
def get_message(message):
    """Return the body of an email message.

    A non-multipart message yields exactly what ``message.get_payload()``
    returns. A multipart message yields one string in which the payload of
    every part is followed by a newline. Parts that are themselves multipart
    are flattened recursively, so their text is included instead of the repr
    of a list of ``Message`` objects.

    Args:
        message: an ``email.message.Message`` (for example an item yielded
            by ``mailbox.mbox``).

    Returns:
        The raw payload for a single-part message, otherwise the
        newline-terminated concatenation of all leaf payloads.
    """
    if not message.is_multipart():
        return message.get_payload()
    content = ""
    for part in message.get_payload():
        # Recurse: for a nested multipart part, get_payload() is a list of
        # Message objects and str() of it would not contain any body text.
        content = content + str(get_message(part)) + '\n'
    return content

def _decode_subject(raw_subject):
    """Decode a (possibly RFC 2047 encoded) Subject header into plain text.

    Every fragment returned by ``decode_header`` is decoded with its own
    charset and the fragments are joined, so subjects in any encoding are
    written as text rather than as a ``b'...'`` repr.
    """
    pieces = []
    for fragment, charset in decode_header(raw_subject):
        if isinstance(fragment, bytes):
            try:
                fragment = fragment.decode(charset or 'ascii', errors='replace')
            except LookupError:
                # Unknown or invalid charset label in the header: fall back to UTF-8.
                fragment = fragment.decode('utf-8', errors='replace')
        pieces.append(fragment)
    return ''.join(pieces)


# newline='' lets the csv module control line endings (bodies contain embedded
# newlines). The with-block guarantees the file is flushed and closed before
# later cells copy or re-read emails.csv.
with open("emails.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['date', 'from', 'subject', 'content'])

    for message in mailbox.mbox("All mail Including Spam and Trash.mbox"):
        # Messages without a Subject header are skipped.
        if message['subject'] is None:
            continue
        content = get_message(message)
        subject = _decode_subject(message['subject'])
        # Keep only the address inside "<...>"; tolerate a missing From header.
        sender = str(message["from"] or "").strip('>').split('<')[-1]
        writer.writerow([message["date"], sender, subject, content])

### Copy the csv file to drive for later use

In [0]:
# Copy the extracted emails.csv into the dataset folder on the mounted Drive.
!cp 'emails.csv' 'drive/My Drive/Dataset/NLP Assignment Dataset/' 

## Explore the csv file

In [0]:
import pandas as pd
# Load the extracted emails from emails.csv and preview the first ten rows.
df = pd.read_csv('emails.csv')
df.head(10)

In [0]:
# Combine sender and subject into one text field, "<from> - <subject>".
sender_text = df['from'].map(str)
subject_text = df['subject'].map(str)
df['subs'] = sender_text + ' - ' + subject_text

In [0]:
# Display the combined sender/subject column.
df['subs']

In [0]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
df.info()

In [0]:
# Discard every row that has a missing value and renumber the index from 0.
df = df.dropna().reset_index(drop=True)

**Manual step: download the dataset locally and tag each email by hand before continuing.**

## Data processing

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The combined "sender - subject" column is created as 'subs' earlier in this
# notebook, while this cell originally read 'Subs'; column names are
# case-sensitive, so accept either spelling.
# NOTE(review): presumably the manually tagged file uses 'Subs' — confirm and
# settle on a single column name.
text_column = next((name for name in ('Subs', 'subs') if name in df.columns), None)
if text_column is None:
    raise KeyError("expected a 'Subs' or 'subs' text column in df")
texts = df[text_column].values

# create the tokenizer
t = Tokenizer()
# fit the tokenizer
t.fit_on_texts(texts)

# To see what was learned, inspect t.word_counts, t.document_count,
# t.word_index and t.word_docs.

# integer encode documents, then pad them to a common length
# (pad_sequences defaults to the length of the longest sequence)
encoded_text = t.texts_to_sequences(texts)
encoded_text = pad_sequences(encoded_text)
print(encoded_text.shape)

(100, 33)


In [0]:
# One-hot encode the 'tags' column: one indicator column per distinct tag value.
# NOTE(review): no visible cell creates a 'tags' column (a later cell reads
# `df.tag`) — presumably it comes from the manually tagged file; confirm.
encoded_labels = pd.get_dummies(df['tags']).values
print(encoded_labels.shape)

## Train/Validation Split

In [0]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the encoded samples; the fixed random_state keeps the split reproducible.
splits = train_test_split(
    encoded_text,
    encoded_labels,
    test_size = 0.20,
    random_state = 101,
)
X_train, X_test, Y_train, Y_test = splits

# Report the shapes of the training pair, then of the held-out pair.
for features, targets in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, targets.shape)

## Model Training

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SpatialDropout1D, Dropout, LSTM, Embedding

# define the LSTM model: embedding -> spatial dropout -> two stacked LSTMs -> softmax
model = Sequential()
# Vocabulary size is len(word_index)+1 because Tokenizer indices start at 1
# and index 0 is the padding value.
model.add(Embedding(len(t.word_index)+1, 1024, input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(512, dropout=0.2, return_sequences=True))
model.add(LSTM(256, dropout=0.1, recurrent_dropout=0.2))
# One softmax output per label column.
model.add(Dense(Y_train.shape[1], activation='softmax'))
# Register accuracy under the name 'acc' so the validation metric is logged as
# 'val_acc', the key the ModelCheckpoint in this notebook monitors. With
# 'accuracy', TF 2.x logs 'val_accuracy' and the checkpoint is never written.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [0]:
from tensorflow.keras.callbacks import ModelCheckpoint
# Write the model to model.h5 each time the monitored validation accuracy improves.
checkpoint = ModelCheckpoint(
    "model.h5",
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Train for 30 epochs, evaluating on the held-out split after every epoch.
fit_options = dict(
    epochs=30,
    batch_size=128,
    validation_data=(X_test, Y_test),
    verbose=1,
    callbacks=[checkpoint],
)
model.fit(X_train, Y_train, **fit_options)

## Model Evaluation

In [0]:
from tensorflow.keras.models import load_model

# Reload the checkpoint written during training and score it on the held-out split.
bestmodel = load_model("model.h5")
score, acc = bestmodel.evaluate(X_test, Y_test, batch_size=128, verbose=0)

for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(caption, value)

In [0]:
from sklearn.metrics import confusion_matrix,classification_report
import numpy as np  # imported here because this cell uses np.argmax and no earlier cell imports numpy

# Predicted class = index of the largest softmax probability. This replaces
# Sequential.predict_classes, which newer Keras releases no longer provide.
Y_pred = np.argmax(bestmodel.predict(X_test, batch_size = 128), axis=-1)
# Y_test is one-hot encoded; recover the integer class index of each row.
df_test = pd.DataFrame({'true': np.argmax(Y_test, axis=1), 'pred': Y_pred})
print("confusion matrix \n", confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

In [0]:
# Copy the saved model.h5 into the 'First Run with dataset' folder on the mounted Drive.
!cp -r 'model.h5' 'drive/My Drive/Dataset/First Run with dataset/'

## Load pretrained model

In [0]:
# Copy the previously saved model.h5 from the mounted Drive into the current directory.
!cp 'drive/My Drive/Dataset/First Run with dataset/model.h5' .

In [0]:
# Load the model stored in model.h5 and bind it to `bestmodel`.
from tensorflow.keras.models import load_model
bestmodel = load_model("model.h5")

## Predict on a text

In [0]:
import numpy as np  # replaces the `pd.np` alias, which pandas 2.0 removed

# Class index -> label name, as used by this cell originally
# (2: Finance, 1: MaybeUseful, any other index: NotFinance).
# NOTE(review): this must match the column order produced when the labels were
# one-hot encoded — confirm against the training data.
CLASS_NAMES = {2: "Finance", 1: "MaybeUseful"}

raw_text = ['amazon.in - Invoice of your bill']
# vectorize the text with the pre-fitted tokenizer instance `t`
# (it must still be in memory; it is not reloaded with the model)
sequences = t.texts_to_sequences(raw_text)
# pad to the sequence length the model was built with, instead of a hard-coded
# value, so the input has exactly the shape the embedding layer expects
text = pad_sequences(sequences, maxlen=bestmodel.input_shape[1], dtype='int32', value=0)

# predict() returns one probability vector per input row; take the only row
label = bestmodel.predict(text, batch_size=1, verbose = 1)[0]
print(CLASS_NAMES.get(int(np.argmax(label)), "NotFinance"))

## Save the dataframes

In [0]:
# Count how many rows carry each tag value.
# NOTE(review): no visible cell creates the 'tag' column (the label-encoding
# cell reads 'tags') — confirm where it is assigned.
df.tag.value_counts()

NotFinance     8113
Finance         105
MaybeUseful      38
Name: tag, dtype: int64

In [0]:
# Write the rows tagged 'Finance' to financial_emails.csv without the index column.
df[df.tag=='Finance'].to_csv('financial_emails.csv', index=False)

In [0]:
# Write the rows tagged 'MaybeUseful' to maybeuseful_emails.csv without the index column.
df[df.tag=='MaybeUseful'].to_csv('maybeuseful_emails.csv', index=False)

**Anonymize the dataset before saving it to Drive.**

## Save the csv files for later use

In [0]:
# Copy financial_emails.csv into the dataset folder on the mounted Drive.
!cp 'financial_emails.csv' 'drive/My Drive/Dataset/NLP Assignment Dataset/' 

In [0]:
# Copy maybeuseful_emails.csv into the dataset folder on the mounted Drive.
!cp 'maybeuseful_emails.csv' 'drive/My Drive/Dataset/NLP Assignment Dataset/' 