# Preliminaries

In [1]:
# install sent2vec
!pip install git+https://github.com/epfml/sent2vec

Collecting git+https://github.com/epfml/sent2vec
  Cloning https://github.com/epfml/sent2vec to /tmp/pip-req-build-dcne7kr1
  Running command git clone -q https://github.com/epfml/sent2vec /tmp/pip-req-build-dcne7kr1
Building wheels for collected packages: sent2vec
  Building wheel for sent2vec (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / done
[?25h  Created wheel for sent2vec: filename=sent2vec-0.0.0-cp36-cp36m-linux_x86_64.whl size=1137369 sha256=e731ace1b828f01b8ff79fc319495c980451af429c14c91ee788f30c091afcab
  Stored in directory: /tmp/pip-ephem-wheel-cache-0nw9k9lq/wheels/f5/1a/52/b5f36e8120688b3f026ac0cefe9c6544905753c51d8190ff17
Successfully built sent2vec
Installing collected packages: sent2vec
Successfully installed sent2vec-0.0.0


Write requirements to file, anytime you run it, in case you have to go back and recover dependencies.

Latest known such requirements are hosted for each notebook in the companion github repo, and can be pulled down and installed here if needed. Companion github repo is located at https://github.com/azunre/transfer-learning-for-nlp

In [2]:
!pip freeze > kaggle_image_requirements.txt

# Download IMDB Movie Review Dataset
Download IMDB dataset

In [3]:
import random
import pandas as pd

## Read-in the reviews and print some basic descriptions of them

!wget -q "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
!tar xzf aclImdb_v1.tar.gz

# Define Tokenization, Stop-word and Punctuation Removal Functions
Before proceeding, we must decide how many samples to draw from each class. We must also decide the maximum number of tokens per email, and the maximum length of each token. This is done by setting the following overarching hyperparameters

In [4]:
Nsamp = 1000 # number of samples to generate in each class - 'spam', 'not spam'
maxtokens = 200 # the maximum number of tokens per document
maxtokenlen = 100 # the maximum length of each token

**Tokenization**

In [5]:
def tokenize(row):
    if row is None or row is '':
        tokens = ""
    else:
        tokens = str(row).split(" ")[:maxtokens]
    return tokens

**Use regular expressions to remove unnecessary characters**

Next, we define a function to remove punctuation marks and other nonword characters (using regular expressions) from the emails with the help of the ubiquitous python regex library. In the same step, we truncate all tokens to hyperparameter maxtokenlen defined above.

In [6]:
import re

def reg_expressions(row):
    tokens = []
    try:
        for token in row:
            token = token.lower() # make all characters lower case
            token = re.sub(r'[\W\d]', "", token)
            token = token[:maxtokenlen] # truncate token
            tokens.append(token)
    except:
        token = ""
        tokens.append(token)
    return tokens

**Stop-word removal**

Stop-words are also removed. Stop-words are words that are very common in text but offer no useful information that can be used to classify the text. Words such as is, and, the, are are examples of stop-words. The NLTK library contains a list of 127 English stop-words and can be used to filter our tokenized strings.

In [7]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')    

# print(stopwords) # see default stopwords
# it may be beneficial to drop negation words from the removal list, as they can change the positive/negative meaning
# of a sentence
# stopwords.remove("no")
# stopwords.remove("nor")
# stopwords.remove("not")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    token = filter(None, token)
    return token

# Assemble Embedding Vectors
The following functions are used to extract sent2vec embedding vectors for each review

In [9]:
import time
import sent2vec

s2v_model = sent2vec.Sent2vecModel()
start=time.time()
s2v_model.load_model('../input/sent2vec/wiki_unigrams.bin')
end = time.time()
print("Loading the sent2vec embedding took %d seconds"%(end-start))

Loading the sent2vec embedding took 7 seconds


In [10]:
def assemble_embedding_vectors(data):
    out = None
    for item in data:
        vec = s2v_model.embed_sentence(" ".join(item))
        if vec is not None:
            if out is not None:
                out = np.concatenate((out,vec),axis=0)
            else:
                out = vec                                            
        else:
            pass
        
        
    return out

# Putting It All Together To Assemble Dataset
Now, putting all the preprocessing steps together we assemble our dataset...

In [11]:
import os
import numpy as np

# shuffle raw data first
def unison_shuffle_data(data, header):
    p = np.random.permutation(len(header))
    data = data[p]
    header = np.asarray(header)[p]
    return data, header

# load data in appropriate form
def load_data(path):
    data, sentiments = [], []
    for folder, sentiment in (('neg', 0), ('pos', 1)):
        folder = os.path.join(path, folder)
        for name in os.listdir(folder):
            with open(os.path.join(folder, name), 'r') as reader:
                  text = reader.read()
            text = tokenize(text)
            text = stop_word_removal(text)
            text = reg_expressions(text)
            data.append(text)
            sentiments.append(sentiment)
    data_np = np.array(data)
    data, sentiments = unison_shuffle_data(data_np, sentiments)
    
    return data, sentiments

train_path = os.path.join('aclImdb', 'train')
test_path = os.path.join('aclImdb', 'test')
raw_data, raw_header = load_data(train_path)

print(raw_data.shape)
print(len(raw_header))

(25000,)
25000


In [12]:
# Subsample required number of samples
random_indices = np.random.choice(range(len(raw_header)),size=(Nsamp*2,),replace=False)
data_train = raw_data[random_indices]
header = raw_header[random_indices]

del raw_data, raw_header # huge and no longer needed, get rid of it

print("DEBUG::data_train::")
print(data_train)

DEBUG::data_train::
[list(['its', 'sad', 'romanian', 'audiences', 'still', 'populated', 'vulgar', 'uneducated', 'individuals', 'relish', 'kind', 'cheap', 'demonstrative', 'shows', 'superficial', 'brutal', 'garcea', 'series', 'vacanta', 'mare', 'childplays', 'the', 'difference', 'mugur', 'mihäescu', 'doru', 'octavian', 'dumitru', 'subartisans', 'never', 'presume', 'claim', 'shows', 'art', 'pintilie', '', 'years', 'ago', 'made', 'good', 'movie', 'duminicä', 'la', 'ora', 'sase', 'followed', 'another', 'one', 'nice', 'enough', 'reconstituirea', 'tries', 'declare', 'filmlenghts', 'art', 'works', '', 'but', 'unfortunately', 'masters', 'way', 'limited', 'level', 'specifically', 'cinematographic', 'means', 'expression', 'as', 'such', 'niki', 'ardelean', 'offers', 'sample', 'how', 'not', '', 'merit'])
 list(['one', 'previous', 'reviewers', 'wrote', 'appeared', 'middle', 'ground', 'opinions', 'love', 'story', 'one', 'loved', 'hated', 'it', 'but', 'seems', 'remarkable', 'distribution', 'opinions'

Display sentiments and their frequencies in the dataset, to ensure it is roughly balanced between classes

In [13]:
unique_elements, counts_elements = np.unique(header, return_counts=True)
print("Sentiments and their frequencies:")
print(unique_elements)
print(counts_elements)

Sentiments and their frequencies:
[0 1]
[1018  982]


**Featurize and Create Labels**

In [14]:
EmbeddingVectors = assemble_embedding_vectors(data_train)
print(EmbeddingVectors)

[[-0.02599724 -0.08517347 -0.16715904 ... -0.08310642 -0.14614873
  -0.01413972]
 [-0.04134955 -0.14229763 -0.07826066 ... -0.06103064 -0.09036611
  -0.01086542]
 [-0.2049824  -0.1873562  -0.00921628 ...  0.05132209 -0.07271242
   0.10262631]
 ...
 [-0.00128172 -0.08218187  0.07703825 ... -0.03221888 -0.10659809
   0.10856076]
 [ 0.32903692  0.02277061  0.17804019 ...  0.0747768   0.14600788
   0.05995347]
 [ 0.02151151 -0.05121614  0.00920535 ... -0.33328614  0.17787023
   0.18338902]]


In [15]:
data = EmbeddingVectors
del EmbeddingVectors

idx = int(0.7*data.shape[0])

# 70% of data for training
train_x = data[:idx,:]
train_y = header[:idx]
# # remaining 30% for testing
test_x = data[idx:,:]
test_y = header[idx:] 

print("train_x/train_y list details, to make sure it is of the right form:")
print(len(train_x))
print(train_x)
print(train_y[:5])
print(len(train_y))

train_x/train_y list details, to make sure it is of the right form:
1400
[[-2.59972438e-02 -8.51734728e-02 -1.67159036e-01 ... -8.31064209e-02
  -1.46148726e-01 -1.41397202e-02]
 [-4.13495526e-02 -1.42297626e-01 -7.82606602e-02 ... -6.10306412e-02
  -9.03661102e-02 -1.08654182e-02]
 [-2.04982400e-01 -1.87356204e-01 -9.21628159e-03 ...  5.13220876e-02
  -7.27124214e-02  1.02626309e-01]
 ...
 [-4.10042219e-02 -2.07076460e-01  6.78418064e-03 ... -5.20211458e-02
  -3.28047015e-02 -2.64326066e-01]
 [-3.85371745e-02 -7.63729662e-02 -9.11653042e-05 ...  1.52520820e-01
  -1.71623066e-01  1.00249372e-01]
 [ 1.03981026e-01  8.97879079e-02 -1.11229941e-01 ...  4.81342077e-02
  -1.26129508e-01  1.89728320e-01]]
[0 0 1 1 1]
1400


# Single IMDB Task Baseline

In [16]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout

input_shape = (len(train_x[0]),)
sent2vec_vectors = Input(shape=input_shape)
dense = Dense(512, activation='relu')(sent2vec_vectors)
dense = Dropout(0.3)(dense)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=sent2vec_vectors, outputs=output)

Using TensorFlow backend.


In [17]:
model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
history = model.fit(train_x, train_y, validation_data=(test_x, test_y), batch_size=32,
                    nb_epoch=10, shuffle=True)

  after removing the cwd from sys.path.


Train on 1400 samples, validate on 600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Add Email Task, Train Single Email Task Baseline

Read Enron dataset and get a sense for the data by printing sample messages to screen

In [18]:
# Input data files are available in the "../input/" directory.
filepath = "../input/enron-email-dataset/emails.csv"

# Read the enron data into a pandas.DataFrame called emails
emails = pd.read_csv(filepath)

print("Successfully loaded {} rows and {} columns!".format(emails.shape[0], emails.shape[1]))
print(emails.head())

Successfully loaded 517401 rows and 2 columns!
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


Separate headers from the message bodies

In [19]:
import email

def extract_messages(df):
    messages = []
    for item in df["message"]:
        # Return a message object structure from a string
        e = email.message_from_string(item)    
        # get message body  
        message_body = e.get_payload()
        messages.append(message_body)
    print("Successfully retrieved message body from e-mails!")
    return messages

bodies = extract_messages(emails)

del emails

Successfully retrieved message body from e-mails!


In [20]:
# extract random 10000 enron email bodies for building dataset
import random
bodies_df = pd.DataFrame(random.sample(bodies, 10000))

del bodies # these are huge, no longer needed, get rid of them

# expand default pandas display options to make emails more clearly visible when printed
pd.set_option('display.max_colwidth', 300)

bodies_df.head() # you could do print(bodies_df.head()), but Jupyter displays this nicer for pandas DataFrames

Unnamed: 0,0
0,"Vince,\n\nI have added a lot of material to ""fill in the wholes"" and would like your\nreaction to the current draft. I am still not very happy with the risk\nmanagement segment (primarily as a result of my own lack of knowledge) so\nplease read it carefully and get me your comments.\n\nI plan t..."
1,"The Portland Web Server will be going down for 20 min at 1:45 instead of \n12:00 noon.\n\nRegards,\nPaul"
2,"I don't understand the netting that you are referring to. Does it mean that they will pay negative ctc's up to some just and reasonable amount and anything above the j&r level will be paid out when all of the other people are paid. How do they define the ""ESP's share of the undercollection?""\n..."
3,"Group,\n\nEffective January 31st, Collin will be leaving the real time group and \ntransferring to IT. \nWe wish Collin success in his new endeavors.\n\nBill"
4,"John --\n\nThanks for the update. As long as the FERC is fully in control and the ERO is not a ""self-regulating"" organization, I assume we'd be ok. The ERO must be fully under the control of FERC.\n\nJim\n\n\n\n\n -----Original Message-----\nFrom: \tShelk, John \nSent:\tFriday, September 14, ..."


Read and Preprocess Fraudulent "419" Email Corpus

In [21]:
filepath = "../input/fraudulent-email-corpus/fradulent_emails.txt"
with open(filepath, 'r',encoding="latin1") as file:
    data = file.read()

Split on the code word `From r` appearing close to the beginning of each email

In [22]:
fraud_emails = data.split("From r")

del data

print("Successfully loaded {} spam emails!".format(len(fraud_emails)))

Successfully loaded 3978 spam emails!


In [23]:
fraud_bodies = extract_messages(pd.DataFrame(fraud_emails,columns=["message"]))

del fraud_emails

fraud_bodies_df = pd.DataFrame(fraud_bodies[1:])

del fraud_bodies

fraud_bodies_df.head() # you could do print(fraud_bodies_df.head()), but Jupyter displays this nicer for pandas DataFrames

Successfully retrieved message body from e-mails!


Unnamed: 0,0
0,"FROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233-27-587908.\nE-MAIL: (james_ngola2002@maktoob.com).\n\nURGENT BUSINESS ASSISTANCE AND PARTNERSHIP.\n\n\nDEAR FRIEND,\n\nI AM ( DR.) JAMES NGOLA, THE PERSONAL ASSISTANCE TO THE LATE CONGOLESE (PRESIDENT LAURENT KABILA) WHO WAS ASSASSINATED BY HIS BODY G..."
1,"Dear Friend,\n\nI am Mr. Ben Suleman a custom officer and work as Assistant controller of the Customs and Excise department Of the Federal Ministry of Internal Affairs stationed at the Murtala Mohammed International Airport, Ikeja, Lagos-Nigeria.\n\nAfter the sudden death of the former Head of s..."
2,"FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ELEME KINGDOM \nCHIEF DANIEL ELEME, PHD, EZE 1 OF ELEME.E-MAIL \nADDRESS:obong_715@epatra.com \n\nATTENTION:PRESIDENT,CEO Sir/ Madam. \n\nThis letter might surprise you because we have met\nneither in person nor by correspondence. But I believe\nit is..."
3,"FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ELEME KINGDOM \nCHIEF DANIEL ELEME, PHD, EZE 1 OF ELEME.E-MAIL \nADDRESS:obong_715@epatra.com \n\nATTENTION:PRESIDENT,CEO Sir/ Madam. \n\nThis letter might surprise you because we have met\nneither in person nor by correspondence. But I believe\nit is..."
4,"Dear sir, \n \nIt is with a heart full of hope that I write to seek your help in respect of the context below. I am Mrs. Maryam Abacha the former first lady of the former Military Head of State of Nigeria General Sani Abacha whose sudden death occurred on 8th of June 1998 as a result of cardiac ..."


Convert everything to lower-case, truncate to maxtokens and truncate each token to maxtokenlen

In [24]:
EnronEmails = bodies_df.iloc[:,0].apply(tokenize)
EnronEmails = EnronEmails.apply(stop_word_removal)
EnronEmails = EnronEmails.apply(reg_expressions)
EnronEmails = EnronEmails.sample(Nsamp)

del bodies_df

SpamEmails = fraud_bodies_df.iloc[:,0].apply(tokenize)
SpamEmails = SpamEmails.apply(stop_word_removal)
SpamEmails = SpamEmails.apply(reg_expressions)
SpamEmails = SpamEmails.sample(Nsamp)

del fraud_bodies_df

raw_data = pd.concat([SpamEmails,EnronEmails], axis=0).values

In [25]:
print("Shape of combined data is:")
print(raw_data.shape)
print("Data is:")
print(raw_data)

# create corresponding labels
Categories = ['spam','notspam']
header = ([1]*Nsamp)
header.extend(([0]*Nsamp))

Shape of combined data is:
(2000,)
Data is:
[list(['emailmessagemessage', 'object', 'xfaeef', 'emailmessagemessage', 'object', 'xfab'])
 list(['my', 'deare', 'greetings', 'miss', 'fatoumata', 'saleebye', 'i', 'writting', 'letter', 'due', 'respect', 'heartful', 'tears', 'since', 'known', 'met', 'previouslyc', 'i', 'asking', 'love', 'love', 'well', 'i', 'gone', 'profile', 'pick', 'interest', 'youe', 'i', 'fair', 'complexionc', 'i', 'fit', 'in', 'tallc', 'i', 'love', 'sport', 'eventsc', 'going', 'outdoors', 'activitiesc', 'watching', 'movingc', 'shoppingc', 'walking', 'etcein', 'nutshellc', 'my', 'name', 'miss', 'fatoumata', 'saleeby', '', 'years', 'old', 'republic', 'liberia', 'west', 'africac', 'seeking', 'refugee', 'dakarsenegal', 'unhcrei', 'therefore', 'write', 'inform', 'i', 'surviving', 'child', 'daughter', 'deceased', 'dre', 'elie', 'ee', 'saleeby', 'former', 'minister', 'of', 'finance', 'f', 'executive', 'director', 'central', 'bank', 'of', 'liberia', 'year', 'july', 'febe', 'e',

We are now ready to convert these into numerical vectors!!

**Featurize and Create Labels**

In [26]:
EmbeddingVectors = assemble_embedding_vectors(raw_data)
print(EmbeddingVectors)

[[-1.11758053e+00 -7.51736999e-01  5.53557634e-01 ... -6.60767317e-01
   2.12305143e-01 -3.06020617e-01]
 [ 1.92008745e-02 -1.37052849e-01  1.80321768e-01 ...  5.39303459e-02
  -6.54170811e-02  5.28319943e-05]
 [ 3.39616202e-02 -1.59206495e-01  7.21582845e-02 ...  1.01201266e-01
  -1.98403805e-01 -6.55441135e-02]
 ...
 [-9.33180302e-02  7.55004538e-03  1.99574754e-01 ...  2.60701776e-01
  -1.69033915e-01 -3.35301040e-04]
 [-3.09811551e-02 -2.08594531e-01  9.56941098e-02 ...  6.73941001e-02
  -4.42153923e-02  5.69376582e-03]
 [-3.62663530e-02  8.49860311e-02  3.34817946e-01 ...  2.58871138e-01
  -3.68074000e-01  3.20163995e-01]]


In [27]:
# shuffle raw data first
def unison_shuffle_data(data, header):
    p = np.random.permutation(len(header))
    data = data[p,:]
    header = np.asarray(header)[p]
    return data, header

data, header = unison_shuffle_data(EmbeddingVectors, header)

idx = int(0.7*data.shape[0])

# 70% of data for training
train_x2 = data[:idx,:]
train_y2 = header[:idx]
# # remaining 30% for testing
test_x2 = data[idx:,:]
test_y2 = header[idx:] 

print("train_x2/train_y2 (emails) list details, to make sure it is of the right form:")
print(len(train_x2))
print(train_x2)
print(train_y2[:5])
print(len(train_y2))

train_x2/train_y2 (emails) list details, to make sure it is of the right form:
1400
[[ 0.03507754  0.06470443  0.07848432 ...  0.05978753 -0.23956941
  -0.02912823]
 [-0.00287206 -0.0416818  -0.09273023 ...  0.15026653 -0.02471375
   0.1012675 ]
 [-0.03710909 -0.21000084  0.2173935  ... -0.07776881 -0.15623932
   0.21470238]
 ...
 [-0.0377222  -0.07692129 -0.0833061  ...  0.21315701 -0.12499469
   0.07864657]
 [ 0.04721983 -0.05480211 -0.04854781 ... -0.04861398 -0.00052448
   0.02917188]
 [-0.21808201 -0.10511982 -0.02095625 ... -0.2459277  -0.09837516
  -0.01308274]]
[1 1 0 1 1]
1400


Train "just email" single-task shallow neural network

In [28]:
input_shape = (len(train_x2[0]),)
sent2vec_vectors = Input(shape=input_shape)
dense = Dense(512, activation='relu')(sent2vec_vectors)
dense = Dropout(0.3)(dense)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=sent2vec_vectors, outputs=output)

model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
history = model.fit(train_x2, train_y2, validation_data=(test_x2, test_y2), batch_size=32,
                    nb_epoch=10, shuffle=True)

  # This is added back by InteractiveShellApp.init_path()


Train on 1400 samples, validate on 600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# "Double-Task" Email and IMDB System

In [29]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.layers.merge import concatenate

input1_shape = (len(train_x[0]),)
input2_shape = (len(train_x2[0]),)
sent2vec_vectors1 = Input(shape=input1_shape)
sent2vec_vectors2 = Input(shape=input2_shape)
combined = concatenate([sent2vec_vectors1,sent2vec_vectors2])
dense1 = Dense(512, activation='relu')(combined)
dense1 = Dropout(0.3)(dense1)
output1 = Dense(1, activation='sigmoid',name='classification1')(dense1)
output2 = Dense(1, activation='sigmoid',name='classification2')(dense1)
model = Model(inputs=[sent2vec_vectors1,sent2vec_vectors2], outputs=[output1,output2])

In [30]:
model.compile(loss={'classification1': 'binary_crossentropy', 
                    'classification2': 'binary_crossentropy'},
              optimizer='adam', metrics=['accuracy'])
history = model.fit([train_x,train_x2],[train_y,train_y2],
                    validation_data=([test_x,test_x2],[test_y,test_y2]),
                                     batch_size=32, nb_epoch=10, shuffle=True)

  


Train on 1400 samples, validate on 600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
from IPython.display import HTML
def create_download_link(title = "Download file", filename = "data.csv"):  
    html = '<a href={filename}>{title}</a>'
    html = html.format(title=title,filename=filename)
    return HTML(html)

#create_download_link(filename='file.svg')

In [32]:
!rm -rf aclImdb
!rm aclImdb_v1.tar.gz