# Preliminaries

In [1]:
# Install sent2vec directly from the epfml GitHub repository.
# NOTE: no tag or commit is pinned, so this builds whatever the repository's
# default branch contains at install time.
!pip install git+https://github.com/epfml/sent2vec

Collecting git+https://github.com/epfml/sent2vec
  Cloning https://github.com/epfml/sent2vec to /tmp/pip-req-build-k78wm5x6
  Running command git clone -q https://github.com/epfml/sent2vec /tmp/pip-req-build-k78wm5x6
Building wheels for collected packages: sent2vec
  Building wheel for sent2vec (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / done
[?25h  Created wheel for sent2vec: filename=sent2vec-0.0.0-cp36-cp36m-linux_x86_64.whl size=1139399 sha256=5564d8fd6978667350f5a1ac2beea444aa5bab53295e5995f3919d49deb93ebd
  Stored in directory: /tmp/pip-ephem-wheel-cache-ly9a2fon/wheels/f5/1a/52/b5f36e8120688b3f026ac0cefe9c6544905753c51d8190ff17
Successfully built sent2vec
Installing collected packages: sent2vec
Successfully installed sent2vec-0.0.0


Write the installed package requirements to a file every time the notebook is run, in case you need to go back and recover the dependencies.

Latest known such requirements are hosted for each notebook in the companion github repo, and can be pulled down and installed here if needed. Companion github repo is located at https://github.com/azunre/transfer-learning-for-nlp

In [2]:
# Snapshot the currently installed package versions into a requirements file.
!pip freeze > kaggle_image_requirements.txt

# Download IMDB Movie Review Dataset
Download IMDB dataset

In [3]:
import random
import pandas as pd

## Download the IMDB movie review archive and unpack it (the reviews are read in a later cell)

!wget -q "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
!tar xzf aclImdb_v1.tar.gz

# Define Tokenization, Stop-word and Punctuation Removal Functions
Before proceeding, we must decide how many samples to draw from each class. We must also decide the maximum number of tokens per document (movie review or email), and the maximum length of each token. This is done by setting the following overarching hyperparameters:

In [4]:
# Overarching hyperparameters read by the preprocessing and sampling cells below.
Nsamp = 1000 # number of samples to draw from each class (used for both the review and the email data)
maxtokens = 200 # the maximum number of tokens per document
maxtokenlen = 100 # the maximum length of each token

**Tokenization**

In [5]:
def tokenize(row):
    """Split a raw document into space-delimited tokens.

    Parameters
    ----------
    row : object
        The raw text. Anything other than ``None`` or the empty string is
        converted with ``str()`` before being split.

    Returns
    -------
    list of str or str
        At most ``maxtokens`` tokens obtained by splitting on single spaces.
        For ``None`` or ``""`` the empty string is returned (kept for
        backward compatibility; it iterates as "no tokens").
    """
    # Test emptiness by value. The previous `row is ''` compared object
    # identity with a literal, which only works by accident of string
    # interning and triggers a SyntaxWarning on Python 3.8+.
    if row is None or (isinstance(row, str) and not row):
        return ""
    # maxtokens is the notebook-level cap on tokens kept per document.
    return str(row).split(" ")[:maxtokens]

**Use regular expressions to remove unnecessary characters**

Next, we define a function to remove punctuation marks and other nonword characters (using regular expressions) from the text with the help of the ubiquitous Python regex library. In the same step, we truncate all tokens to the hyperparameter maxtokenlen defined above.

In [6]:
import re

def reg_expressions(row):
    """Normalise every token: lower-case, strip unwanted characters, truncate.

    Parameters
    ----------
    row : iterable of str
        The tokens of one document.

    Returns
    -------
    list of str
        One cleaned token per input token (a token may become ``""``).
        If ``row`` is not iterable, or a token is not a string, processing
        stops there and a single ``""`` is appended to whatever was
        collected so far.
    """
    tokens = []
    try:
        for token in row:
            token = token.lower()  # make all characters lower case
            token = re.sub(r'[\W\d]', "", token)  # drop non-word characters and digits
            token = token[:maxtokenlen]  # truncate token
            tokens.append(token)
    except (AttributeError, TypeError):
        # Best-effort fallback for malformed input (e.g. None or non-string
        # tokens). Only the expected failure types are caught, so unrelated
        # problems such as a missing hyperparameter or a keyboard interrupt
        # are no longer silently swallowed.
        tokens.append("")
    return tokens

**Stop-word removal**

Stop-words are also removed. Stop-words are words that are very common in text but offer no useful information that can be used to classify the text. Words such as *is*, *and*, *the* and *are* are examples of stop-words. The NLTK library contains a list of 127 English stop-words and can be used to filter our tokenized strings.

In [7]:
import nltk

# Download the NLTK stop-word corpus before importing the corpus reader.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the corpus reader to the value
# returned by words('english'); stop_word_removal() reads this module-level name.
stopwords = stopwords.words('english')    

# print(stopwords) # see default stopwords
# it may be beneficial to drop negation words from the removal list, as they can change the positive/negative meaning
# of a sentence
# stopwords.remove("no")
# stopwords.remove("nor")
# stopwords.remove("not")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
def stop_word_removal(row):
    """Drop stop-words and empty tokens from one document.

    Parameters
    ----------
    row : iterable of str
        The tokens of one document.

    Returns
    -------
    list of str
        The tokens that are non-empty and not contained in the
        module-level ``stopwords`` collection, in their original order.
    """
    # Return a real list. The previous `filter(None, ...)` produced a lazy,
    # single-use iterator in Python 3, which is exhausted after one pass and
    # prints as `<filter object ...>` when inspected.
    return [token for token in row if token and token not in stopwords]

# Assemble Embedding Vectors
The following functions are used to extract sent2vec embedding vectors for each review

In [9]:
import time
import sent2vec

# Create the sent2vec model object and load the pretrained weights from disk,
# timing the load so the cost is visible in the output.
s2v_model = sent2vec.Sent2vecModel()
start=time.time()
s2v_model.load_model('../input/sent2vec/wiki_unigrams.bin')
end = time.time()
# %d truncates the elapsed time to whole seconds.
print("Loading the sent2vec embedding took %d seconds"%(end-start))

Loading the sent2vec embedding took 41 seconds


In [10]:
def assemble_embedding_vectors(data):
    """Embed every tokenised document with sent2vec and stack the results.

    Parameters
    ----------
    data : iterable of iterable of str
        One token sequence per document. The tokens are joined with single
        spaces and passed to ``s2v_model.embed_sentence``.

    Returns
    -------
    numpy.ndarray or None
        The per-document embeddings concatenated along axis 0, in input
        order. ``None`` is returned when no document produced an embedding.
    """
    # Collect first and concatenate once: growing the array with
    # np.concatenate inside the loop re-copies everything on each iteration.
    vectors = []
    for item in data:
        vec = s2v_model.embed_sentence(" ".join(item))
        # NOTE(review): a document whose embedding is None is skipped, which
        # would leave the result shorter than the label vector - confirm that
        # the model never returns None for this data.
        if vec is not None:
            vectors.append(vec)
    if not vectors:
        return None
    return np.concatenate(vectors, axis=0)

# Putting It All Together To Assemble Dataset
Now, putting all the preprocessing steps together we assemble our dataset...

In [11]:
import os
import numpy as np

# shuffle raw data first
def unison_shuffle_data(data, header):
    """Shuffle `data` and `header` with one shared random permutation.

    `data` must support NumPy-style integer-array indexing along its first
    axis; `header` is converted to a NumPy array. Returns the pair
    (shuffled data, shuffled header), which stay aligned element by element.
    """
    order = np.random.permutation(len(header))
    return data[order], np.asarray(header)[order]

# load data in appropriate form
def load_data(path):
    """Read, preprocess and shuffle the reviews stored under `path`.

    Parameters
    ----------
    path : str
        Directory containing a ``neg`` and a ``pos`` sub-folder with one
        review per file.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        A 1-D object array holding one token list per review, and the
        matching array of labels (0 for ``neg``, 1 for ``pos``), shuffled
        together.
    """
    data, sentiments = [], []
    for folder, sentiment in (('neg', 0), ('pos', 1)):
        folder = os.path.join(path, folder)
        for name in os.listdir(folder):
            # Decode explicitly as UTF-8 instead of relying on the platform's
            # default encoding, which differs between systems.
            with open(os.path.join(folder, name), 'r', encoding='utf-8') as reader:
                text = reader.read()
            text = tokenize(text)
            text = stop_word_removal(text)
            text = reg_expressions(text)
            data.append(text)
            sentiments.append(sentiment)
    # Build the 1-D object array element by element. np.array(data) on token
    # lists of differing lengths raises on recent NumPy releases, and would
    # silently produce a 2-D array if all lists happened to be equally long.
    data_np = np.empty(len(data), dtype=object)
    for position, tokens in enumerate(data):
        data_np[position] = tokens
    data, sentiments = unison_shuffle_data(data_np, sentiments)

    return data, sentiments

# Train/test folders of the extracted archive, relative to the working directory.
train_path = os.path.join('aclImdb', 'train')
test_path = os.path.join('aclImdb', 'test')
# Only the training folder is loaded in this cell.
raw_data, raw_header = load_data(train_path)

# Sanity check: number of loaded reviews and number of labels should agree.
print(raw_data.shape)
print(len(raw_header))

(25000,)
25000


In [12]:
# Draw 2*Nsamp distinct review positions at random and keep only those reviews
random_indices = np.random.choice(
    range(len(raw_header)),
    size=(Nsamp*2,),
    replace=False,
)
data_train, header = raw_data[random_indices], raw_header[random_indices]

# the full corpus is huge and no longer needed, so release it
del raw_data
del raw_header

print("DEBUG::data_train::", data_train, sep="\n")

DEBUG::data_train::
[list(['this', 'documentary', 'aired', 'rte', 'bbc', 'last', 'number', 'months', 'having', 'seen', 'twice', 'i', 'would', 'recommend', 'anyone', 'interest', 'media', 'documentary', 'film', 'makingbr', 'br', 'initially', 'documentary', 'meant', 'detail', 'political', 'life', 'venezuelan', 'president', 'hugo', 'chavez', 'the', 'irish', 'crew', 'set', 'intentions', 'what', 'happens', 'get', 'venezuela', 'startling', 'witness', 'first', 'hand', 'attempted', 'overthrow', 'rebel', 'factions', 'particularly', 'oil', 'concerns', 'venezuela', 'chavez', 'government', 'what', 'audience', 'witness', 'media', 'manipulates', 'situation', 'effect', 'backs', 'overthrow', 'chavez', 'distorting', 'events', 'transpire', 'coup', 'heightensbr', 'br', 'it', 'really', 'excellent', 'documentary', 'remarkable', 'piece', 'work', 'couple', 'novice', 'filmmakersbr', 'br', ''])
 list(['simply', 'one', 'funiest', 'movies', 'ive', 'ever', 'seen', 'its', 'parody', 'crimelife', 'parody', 'everythin

Display sentiments and their frequencies in the dataset, to ensure it is roughly balanced between classes

In [13]:
# Count how often each label occurs in the subsample; the two printed arrays
# line up position by position (label -> count).
unique_elements, counts_elements = np.unique(header, return_counts=True)
print("Sentiments and their frequencies:")
print(unique_elements)
print(counts_elements)

Sentiments and their frequencies:
[0 1]
[1008  992]


**Featurize and Create Labels**

In [14]:
# Turn every subsampled review into its sent2vec embedding (one row per review).
EmbeddingVectors = assemble_embedding_vectors(data_train)
print(EmbeddingVectors)

[[ 0.0102319  -0.25911504  0.32173342 ...  0.0382418  -0.28723043
   0.16901135]
 [ 0.01625078 -0.266056    0.11902541 ... -0.15257029 -0.06450036
   0.0200811 ]
 [ 0.10394059 -0.0421819   0.00287194 ...  0.00950469 -0.09494772
   0.0721978 ]
 ...
 [-0.00285244 -0.11204793 -0.04106293 ...  0.07098608  0.03362228
   0.07496912]
 [ 0.03992646 -0.06495458 -0.06775349 ...  0.13719364 -0.21264713
   0.1838117 ]
 [-0.22419588 -0.12976232  0.02702025 ...  0.03708775 -0.04663878
   0.0180573 ]]


In [15]:
# Continue under the shorter name `data` and drop the old binding
data = EmbeddingVectors
del EmbeddingVectors

# Row position of the 70/30 boundary
idx = int(0.7 * len(data))

# first 70% of the rows for training, remaining 30% for testing
train_x, test_x = data[:idx, :], data[idx:, :]
train_y, test_y = header[:idx], header[idx:]

print("train_x/train_y list details, to make sure it is of the right form:")
print(len(train_x), train_x, train_y[:5], len(train_y), sep="\n")

train_x/train_y list details, to make sure it is of the right form:
1400
[[ 0.0102319  -0.25911504  0.32173342 ...  0.0382418  -0.28723043
   0.16901135]
 [ 0.01625078 -0.266056    0.11902541 ... -0.15257029 -0.06450036
   0.0200811 ]
 [ 0.10394059 -0.0421819   0.00287194 ...  0.00950469 -0.09494772
   0.0721978 ]
 ...
 [-0.09276448 -0.21992254 -0.00569641 ... -0.14773265 -0.15770121
   0.15325668]
 [-0.07607748 -0.06993847 -0.11272059 ... -0.11972501 -0.01401911
  -0.03252426]
 [-0.00111649 -0.07526951  0.07201906 ...  0.12033443  0.00771973
   0.18872222]]
[1 1 1 1 1]
1400


# Single IMDB Task Baseline

In [16]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout

# Input width is the length of one embedding vector, read off the first training row.
input_shape = (len(train_x[0]),)
sent2vec_vectors = Input(shape=input_shape)
# One 512-unit ReLU layer with 30% dropout, then a single sigmoid unit for the
# binary sentiment label.
dense = Dense(512, activation='relu')(sent2vec_vectors)
dense = Dropout(0.3)(dense)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=sent2vec_vectors, outputs=output)

Using TensorFlow backend.


In [17]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
# `epochs` is the Keras 2 name for the number of training passes; the old
# Keras 1 keyword `nb_epoch` is deprecated and rejected by newer releases.
# The held-out 30% split is used as validation data during training.
history = model.fit(train_x, train_y, validation_data=(test_x, test_y), batch_size=32,
                    epochs=10, shuffle=True)

  after removing the cwd from sys.path.


Train on 1400 samples, validate on 600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Add Email Task, Train Single Email Task Baseline

Read Enron dataset and get a sense for the data by printing sample messages to screen

In [18]:
# Input data files are available in the "../input/" directory.
# NOTE(review): the path is relative to the working directory and presumably
# matches the hosting environment's dataset layout - adjust when running elsewhere.
filepath = "../input/enron-email-dataset/emails.csv"

# Read the enron data into a pandas.DataFrame called emails
emails = pd.read_csv(filepath)

# Report the frame's dimensions and preview its first rows.
print("Successfully loaded {} rows and {} columns!".format(emails.shape[0], emails.shape[1]))
print(emails.head())

Successfully loaded 517401 rows and 2 columns!
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


Separate headers from the message bodies

In [19]:
import email

def extract_messages(df):
    """Return the payload of every raw e-mail found in ``df["message"]``.

    Each entry is parsed from its string form into a message object and the
    object's payload (the part after the headers) is collected. A
    confirmation line is printed once all entries have been processed.
    """
    messages = [
        email.message_from_string(raw_text).get_payload()
        for raw_text in df["message"]
    ]
    print("Successfully retrieved message body from e-mails!")
    return messages

# Keep only the message bodies, then release the full DataFrame.
bodies = extract_messages(emails)

del emails

Successfully retrieved message body from e-mails!


In [20]:
# extract random 10000 enron email bodies for building dataset
# NOTE: no random seed is set in the visible cells, so the selection presumably
# differs from run to run.
import random
bodies_df = pd.DataFrame(random.sample(bodies, 10000))

del bodies # these are huge, no longer needed, get rid of them

# expand default pandas display options to make emails more clearly visible when printed
pd.set_option('display.max_colwidth', 300)

bodies_df.head() # you could do print(bodies_df.head()), but Jupyter displays this nicer for pandas DataFrames

Unnamed: 0,0
0,"Not much has happened with the federal filings since we submitted them (SEC \napplication was filed on February 4, FERC application was filed on March 3). \nAnn has been keeping everyone updated on the DOJ review of the HSR filing. \n\nWith regard to the FERC filing, FERC separated the section..."
1,"---------------------- Forwarded by Robin Rodrigue/HOU/ECT on 12/04/2000 \n04:22 PM ---------------------------\n\n\nChris Abel\n08/11/2000 11:38 AM\nTo: Michael Benien/Corp/Enron@ENRON, Daniel Falcone/Corp/Enron@ENRON, Michael \nE Moscoso/HOU/ECT@ECT, Gabriel Monroy/HOU/ECT@ECT, Robin \nRodrigu..."
2,"What a mess! \n\n\n\n\nsuzanneadams@att.net on 04/09/2001 06:50:22 PM\nTo: Kay.Mann@enron.com\ncc: \n\nSubject: Re: Doctor's Report\n\nYou know I forgot all about the conference, but the doc \npicked the day not me. Anyhow you look at it though, \nthe timing is good. Cool! I'm going to have..."
3,"----- Forwarded by Cindy Derecskey/Corp/Enron on 12/08/2000 09:45 AM -----\n\n\tEmma Facy@ECT\n\t12/08/2000 03:41 AM\n\t\t \n\t\t To: Jackie Gentle/LON/ECT@ECT, Mark Pickering/LON/ECT@ECT, John \nSherriff/LON/ECT@ECT, Karen Denne/Corp/Enron@ENRON, Cindy \nDerecskey/Corp/Enron@Enron, Ann M Schmid..."
4,"Kay,\n\nI am not sure what we decided on Friday. Are we going to send another (more \nfriendly) letter to the Westinghouse people evidencing a willingness to \ndiscuss the draw issue? Please let me know at your convenience.\n\nStuart\n"


Read and Preprocess Fraudulent "419" Email Corpus

In [21]:
# Read the whole fraudulent-email corpus into one string, decoding it as latin1.
filepath = "../input/fraudulent-email-corpus/fradulent_emails.txt"
with open(filepath, 'r',encoding="latin1") as file:
    data = file.read()

Split on the code word `From r` appearing close to the beginning of each email

In [22]:
# Cut the corpus into individual e-mails at every occurrence of the marker "From r".
# The first piece is whatever precedes the first marker.
fraud_emails = data.split("From r")

del data

print("Successfully loaded {} spam emails!".format(len(fraud_emails)))

Successfully loaded 3978 spam emails!


In [23]:
# Wrap the list in a one-column frame named "message", the column extract_messages() reads.
fraud_bodies = extract_messages(pd.DataFrame(fraud_emails,columns=["message"]))

del fraud_emails

# Skip the first entry (the text preceding the first split marker).
fraud_bodies_df = pd.DataFrame(fraud_bodies[1:])

del fraud_bodies

fraud_bodies_df.head() # you could do print(fraud_bodies_df.head()), but Jupyter displays this nicer for pandas DataFrames

Successfully retrieved message body from e-mails!


Unnamed: 0,0
0,"FROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233-27-587908.\nE-MAIL: (james_ngola2002@maktoob.com).\n\nURGENT BUSINESS ASSISTANCE AND PARTNERSHIP.\n\n\nDEAR FRIEND,\n\nI AM ( DR.) JAMES NGOLA, THE PERSONAL ASSISTANCE TO THE LATE CONGOLESE (PRESIDENT LAURENT KABILA) WHO WAS ASSASSINATED BY HIS BODY G..."
1,"Dear Friend,\n\nI am Mr. Ben Suleman a custom officer and work as Assistant controller of the Customs and Excise department Of the Federal Ministry of Internal Affairs stationed at the Murtala Mohammed International Airport, Ikeja, Lagos-Nigeria.\n\nAfter the sudden death of the former Head of s..."
2,"FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ELEME KINGDOM \nCHIEF DANIEL ELEME, PHD, EZE 1 OF ELEME.E-MAIL \nADDRESS:obong_715@epatra.com \n\nATTENTION:PRESIDENT,CEO Sir/ Madam. \n\nThis letter might surprise you because we have met\nneither in person nor by correspondence. But I believe\nit is..."
3,"FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ELEME KINGDOM \nCHIEF DANIEL ELEME, PHD, EZE 1 OF ELEME.E-MAIL \nADDRESS:obong_715@epatra.com \n\nATTENTION:PRESIDENT,CEO Sir/ Madam. \n\nThis letter might surprise you because we have met\nneither in person nor by correspondence. But I believe\nit is..."
4,"Dear sir, \n \nIt is with a heart full of hope that I write to seek your help in respect of the context below. I am Mrs. Maryam Abacha the former first lady of the former Military Head of State of Nigeria General Sani Abacha whose sudden death occurred on 8th of June 1998 as a result of cardiac ..."


Convert everything to lower-case, truncate to maxtokens and truncate each token to maxtokenlen

In [24]:
# Sample Nsamp e-mails per class FIRST and only then tokenize/clean them.
# The cleaning functions work element by element and use no randomness, so the
# selected e-mails are the same as when sampling after cleaning - but only
# Nsamp documents per class are processed instead of the whole corpus.
EnronEmails = bodies_df.iloc[:,0].sample(Nsamp)
EnronEmails = EnronEmails.apply(tokenize)
EnronEmails = EnronEmails.apply(stop_word_removal)
EnronEmails = EnronEmails.apply(reg_expressions)

del bodies_df

SpamEmails = fraud_bodies_df.iloc[:,0].sample(Nsamp)
SpamEmails = SpamEmails.apply(tokenize)
SpamEmails = SpamEmails.apply(stop_word_removal)
SpamEmails = SpamEmails.apply(reg_expressions)

del fraud_bodies_df

# Spam rows come first, then the Enron rows; labels created for this array
# must follow the same order.
raw_data = pd.concat([SpamEmails,EnronEmails], axis=0).values

In [25]:
# Inspect the combined spam + non-spam array.
print("Shape of combined data is:")
print(raw_data.shape)
print("Data is:")
print(raw_data)

# create corresponding labels
# `Categories` is not read by any other visible cell; it documents the label meaning.
Categories = ['spam','notspam']
# First Nsamp labels are 1, the next Nsamp labels are 0.
header = ([1]*Nsamp)
header.extend(([0]*Nsamp))

Shape of combined data is:
(2000,)
Data is:
[list(['', 'netherlotto', 'corporation', 'netherlotto', 'corporationc', '', 'c', 'nl', 'db', 'amsterdamc', 'the', 'netherlandse', '', 'froma', 'the', 'desk', 'of', 'the', 'director', 'promotionsc', 'international', 'promotionsfprize', 'award', 'departmentc', 'refa', 'eguyis', '', '', '', 'we', 'pleased', 'inform', 'announcement', 'thecth', 'septembere', 'c', 'winners', 'netherlotto', 'corpef', 'international', 'programs', 'held', 'th', 'june', 'e', 'you', 'f', 'companyc', 'attached', 'ticket', 'number', 'c', 'with', 'serial', 'number', 'a', 'drew', 'lucky', 'numbers', 'cand', 'consequently', 'category', 'ce', 'you', 'therefore', 'approved', 'a', 'lump', 'sum', 'pay', 'uscce', 'cash', 'credited', 'file', 'ref', 'noe', 'eguyis', 'this', 'total', 'prize', 'money', 'us', 'cce', 'shared', 'among', 'nineteen', 'international', 'winners', 'the', 'category', 'ce', 'all', 'participants', 'selected', 'througha', 'computer', 'ballot', 'system', 'drawn',

We are now ready to convert these into numerical vectors!!

**Featurize and Create Labels**

In [26]:
# Turn every preprocessed e-mail into its sent2vec embedding (one row per e-mail).
EmbeddingVectors = assemble_embedding_vectors(raw_data)
print(EmbeddingVectors)

[[-0.03317764  0.07362798  0.19308645 ... -0.04788522 -0.01147885
  -0.00251804]
 [-1.1175807  -0.751737    0.55355763 ... -0.6607673   0.21230516
  -0.30602062]
 [-0.0673231   0.01148732 -0.03144034 ... -0.0213981  -0.03132029
   0.01879456]
 ...
 [-0.13396886 -0.03065061  0.11255352 ... -0.1317871  -0.23691481
   0.02130968]
 [-0.07162081  0.07984769 -0.12441923 ... -0.3069535  -0.2733425
  -0.00550599]
 [-0.20956394  0.1029416  -0.01892696 ...  0.11135242 -0.15208471
   0.18950589]]


In [27]:
# shuffle raw data first
def unison_shuffle_data(data, header):
    """Shuffle `data` and `header` with one shared random permutation.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose first axis has one entry per label (any number of
        further axes, including none).
    header : sequence
        The labels; converted to a NumPy array.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The shuffled data and labels, still aligned element by element.
    """
    p = np.random.permutation(len(header))
    # Index the leading axis only. For 2-D input this equals data[p, :], and
    # unlike that form it also works for 1-D arrays, so this definition can
    # stand in for any other definition of the same name in the notebook
    # without breaking callers that pass 1-D data.
    data = data[p]
    header = np.asarray(header)[p]
    return data, header

# Shuffle embeddings and labels together before splitting
data, header = unison_shuffle_data(EmbeddingVectors, header)

# Row position of the 70/30 boundary
idx = int(0.7 * len(data))

# first 70% of the rows for training, remaining 30% for testing
train_x2, test_x2 = data[:idx, :], data[idx:, :]
train_y2, test_y2 = header[:idx], header[idx:]

print("train_x2/train_y2 (emails) list details, to make sure it is of the right form:")
print(len(train_x2), train_x2, train_y2[:5], len(train_y2), sep="\n")

train_x2/train_y2 (emails) list details, to make sure it is of the right form:
1400
[[-0.07138915 -0.02298339 -0.01392924 ...  0.01740107 -0.13331647
  -0.06678196]
 [ 0.12193154  0.2283633   0.08305462 ...  0.11863346 -0.25589672
  -0.09505108]
 [-1.1175805  -0.751737    0.55355763 ... -0.6607673   0.21230514
  -0.30602062]
 ...
 [-0.05288233 -0.09583424 -0.09330093 ...  0.18604138 -0.22734706
  -0.13026707]
 [-0.15514517 -0.01878629  0.05485336 ...  0.06092846 -0.19817528
   0.17559372]
 [ 0.06505737 -0.21145517 -0.19747108 ... -0.02044362 -0.14387816
   0.0815329 ]]
[1 0 1 1 1]
1400


Train "just email" single-task shallow neural network

In [28]:
# Input width is the length of one embedding vector, read off the first training row.
input_shape = (len(train_x2[0]),)
sent2vec_vectors = Input(shape=input_shape)
# One 512-unit ReLU layer with 30% dropout, then a single sigmoid unit for the
# binary spam / not-spam label.
dense = Dense(512, activation='relu')(sent2vec_vectors)
dense = Dropout(0.3)(dense)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=sent2vec_vectors, outputs=output)

model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
# `epochs` is the Keras 2 name for the number of training passes; the old
# Keras 1 keyword `nb_epoch` is deprecated and rejected by newer releases.
history = model.fit(train_x2, train_y2, validation_data=(test_x2, test_y2), batch_size=32,
                    epochs=10, shuffle=True)

  # This is added back by InteractiveShellApp.init_path()


Train on 1400 samples, validate on 600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# "Double-Task" Email and IMDB System

In [29]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.layers.merge import concatenate

# One input per task: review embeddings (input 1) and e-mail embeddings (input 2).
input1_shape = (len(train_x[0]),)
input2_shape = (len(train_x2[0]),)
sent2vec_vectors1 = Input(shape=input1_shape)
sent2vec_vectors2 = Input(shape=input2_shape)
# The two inputs are concatenated and fed through one shared hidden layer.
combined = concatenate([sent2vec_vectors1,sent2vec_vectors2])
dense1 = Dense(512, activation='relu')(combined)
dense1 = Dropout(0.3)(dense1)
# Two separate sigmoid heads on top of the shared layer; their names are the
# keys used when specifying per-output losses.
output1 = Dense(1, activation='sigmoid',name='classification1')(dense1)
output2 = Dense(1, activation='sigmoid',name='classification2')(dense1)
model = Model(inputs=[sent2vec_vectors1,sent2vec_vectors2], outputs=[output1,output2])

In [30]:
# One binary cross-entropy loss per output head, keyed by the head's layer name.
model.compile(loss={'classification1': 'binary_crossentropy',
                    'classification2': 'binary_crossentropy'},
              optimizer='adam', metrics=['accuracy'])
# Inputs and targets are passed as lists in the same order as the model's
# inputs/outputs: review data first, e-mail data second.
# `epochs` is the Keras 2 name for the number of training passes; the old
# Keras 1 keyword `nb_epoch` is deprecated and rejected by newer releases.
history = model.fit([train_x,train_x2],[train_y,train_y2],
                    validation_data=([test_x,test_x2],[test_y,test_y2]),
                    batch_size=32, epochs=10, shuffle=True)

  


Train on 1400 samples, validate on 600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
from IPython.display import HTML
def create_download_link(title = "Download file", filename = "data.csv"):
    """Build an HTML anchor that links to `filename`.

    Parameters
    ----------
    title : str
        The link text, inserted into the markup as given.
    filename : str
        The link target, placed in the anchor's ``href`` attribute.

    Returns
    -------
    IPython.display.HTML
        The anchor element wrapped for rich display in the notebook.
    """
    # Imported locally under a distinct name so the function stays
    # self-contained within this cell.
    from html import escape

    # Quote and escape the attribute value: an unquoted href breaks as soon as
    # the file name contains a space, quote or angle bracket.
    markup = '<a href="{filename}">{title}</a>'.format(
        filename=escape(str(filename), quote=True),
        title=title,
    )
    return HTML(markup)

#create_download_link(filename='file.svg')

In [32]:
# Clean up: remove the extracted review folder and the downloaded archive.
!rm -rf aclImdb
!rm aclImdb_v1.tar.gz