# Credit Card Fraud Detection_Text Mining

Use text data, text mining and topic modeling to detect fraudulent behavior.

In [3]:
# Import library and read csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns

The text data containing emails from Enron employees. 
Enron employees covered up the bad financial position of the company, thereby keeping the stock price artificially high. 
Enron employees sold their own stock options, and when the truth came out, Enron investors were left with nothing. 
The goal is to find all emails that mention specific words, such as "sell enron stock".


In [4]:
df = pd.read_csv('data/enron_emails_clean.csv',header=0)
mask = df['clean_content'].str.contains('sell enron stock', na=False)
df[mask]

Unnamed: 0,Message-ID,From,To,Date,content,clean_content
154,<6336501.1075841154311.JavaMail.evans@thyme>,('sarah.palmer@enron.com'),('sarah.palmer@enron.com'),2002-02-01 14:53:35,\nJoint Venture: A 1997 Enron Meeting Belies O...,joint venture enron meeting belies officers cl...


### Create a list to search and flag the terms
Create an actual flag variable that gives a 1 when the emails get a hit on the search terms of interest, and 0 otherwise.

In [5]:
import numpy as np
# Create a list of terms to search for
searchfor = ['enron stock', 'sell stock', 'stock bonus', 'sell enron stock']

# Filter the cleaned emails on search for list and select from df 
filtered_emails = df[df.clean_content.str.contains('|'.join(searchfor), na=False)]

# Create flag variable where the emails match the searchfor terms
df['flag'] = np.where((df['clean_content'].str.contains('|'.join(searchfor)) == True), 1, 0)

# Count the values of the flag variable
count = df['flag'].value_counts()
print(count)

0    1776
1     314
Name: flag, dtype: int64


In [6]:
filtered_emails.head()

Unnamed: 0,Message-ID,From,To,Date,content,clean_content
0,<8345058.1075840404046.JavaMail.evans@thyme>,('advdfeedback@investools.com'),('advdfeedback@investools.com'),2002-01-29 23:20:55,INVESTools Advisory\nA Free Digest of Trusted ...,investools advisory free digest trusted invest...
1,<1512159.1075863666797.JavaMail.evans@thyme>,('richard.sanders@enron.com'),('richard.sanders@enron.com'),2000-09-20 19:07:00,----- Forwarded by Richard B Sanders/HOU/ECT o...,forwarded richard b sanders hou ect pm justin ...
2,<26118676.1075862176383.JavaMail.evans@thyme>,('m..love@enron.com'),('m..love@enron.com'),2001-10-30 16:15:17,hey you are not wearing your target purple shi...,hey wearing target purple shirt today mine wan...
3,<10369289.1075860831062.JavaMail.evans@thyme>,('leslie.milosevich@kp.org'),('leslie.milosevich@kp.org'),2002-01-30 17:54:18,Leslie Milosevich\n1042 Santa Clara Avenue\nAl...,leslie milosevich santa clara avenue alameda c...
4,<26728895.1075860815046.JavaMail.evans@thyme>,('rtwait@graphicaljazz.com'),('rtwait@graphicaljazz.com'),2002-01-30 19:36:01,"Rini Twait\n1010 E 5th Ave\nLongmont, CO 80501...",rini twait e th ave longmont co rtwait graphic...


### Text Mining
Removing stopwords; Cleaning text data


In [8]:
# Define stopwords to exclude
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop = set(stopwords.words('english'))
stop.update(("to", "cc", "subject", "http", "from", "sent", "ect", "u", "fwd", "www", "com", 'html'))
# Define punctuations to exclude and lemmatizer
exclude = set(string.punctuation)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chunx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

# Import the lemmatizer from nltk
lemma = WordNetLemmatizer()

def clean(text, stop):
    text = str(text).rstrip()
    stop_free = " ".join([i for i in text.lower().split() if((i not in stop) and (not i.isdigit()))])
    punc_free = ''.join(i for i in stop_free if i not in exclude)
    normalized = " ".join(lemma.lemmatize(i) for i in punc_free.split())      
    return normalized
# Clean the emails in df and print results
text_clean=[]
for text in df['clean_content']:
    text_clean.append(clean(text, stop).split())    
text_clean[0][:10]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chunx\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['investools',
 'advisory',
 'free',
 'digest',
 'trusted',
 'investment',
 'advice',
 'unsubscribe',
 'free',
 'newsletter']

### Latent Dirichlet Allocation (LDA) topic modeling

[reference](https://www.datacamp.com/community/tutorials/lda2vec-topic-model)

In [10]:
# Topic Modeling
from gensim import corpora
import gensim

# Define the dictionary
dictionary = corpora.Dictionary(text_clean)
# Filter out (non)frequent words 
dictionary.filter_extremes(no_below=10, keep_n=500000)
# Define the corpus 
corpus = [dictionary.doc2bow(text) for text in text_clean]
print(dictionary)
corpus[0][:10]

Dictionary(3730 unique tokens: ['account', 'accurate', 'address', 'advice', 'advise']...)


[(0, 2),
 (1, 1),
 (2, 1),
 (3, 6),
 (4, 1),
 (5, 2),
 (6, 4),
 (7, 1),
 (8, 2),
 (9, 1)]

In [11]:
# Define the LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10)

# Save the topics and top 10 words
topics = ldamodel.print_topics(num_words=3)

# Print the results
for topic in topics:
    print(topic)

(0, '0.065*"image" + 0.017*"se" + 0.015*"ne"')
(1, '0.012*"company" + 0.008*"hou" + 0.006*"development"')
(2, '0.065*"net" + 0.057*"money" + 0.054*"tr"')
(3, '0.018*"pm" + 0.015*"message" + 0.012*"e"')
(4, '0.017*"message" + 0.013*"thanks" + 0.013*"original"')


### Visualize the topics

In [12]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

### Assign topics to the original data 
Determine what topic a given text is about

In [13]:
def get_topic_details(ldamodel, corpus):
    topic_details_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]), ignore_index=True)
    topic_details_df.columns = ['Dominant_Topic', '% Score']
    return topic_details_df


contents = pd.DataFrame({'Original text':text_clean})
topic_details = pd.concat([get_topic_details(ldamodel,
                           corpus), contents], axis=1)

topic_details.head(5)

Unnamed: 0,Dominant_Topic,% Score,Original text
0,1.0,0.846023,"[investools, advisory, free, digest, trusted, ..."
1,1.0,0.743088,"[forwarded, richard, b, sander, hou, pm, justi..."
2,1.0,0.422423,"[hey, wearing, target, purple, shirt, today, m..."
3,1.0,0.992657,"[leslie, milosevich, santa, clara, avenue, ala..."
4,1.0,0.992535,"[rini, twait, e, th, ave, longmont, co, rtwait..."


### Detect fraud based on topic

In [14]:
def get_topic_details(ldamodel, corpus):
    topic_details_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]), ignore_index=True)
    topic_details_df.columns = ['Dominant_Topic', '% Score']
    return topic_details_df
# Run get_topic_details function and check the results
topic_details_df = get_topic_details(ldamodel, corpus)


# Add original text to topic details in a dataframe
contents = pd.DataFrame({'Original text': text_clean})
topic_details = pd.concat([get_topic_details(ldamodel, corpus), contents], axis=1)
topic_details.sort_values(by=['% Score'], ascending=False).head(10).head()

Unnamed: 0,Dominant_Topic,% Score,Original text
2081,2.0,0.999415,"[unsubscribe, mailing, please, go, money, net,..."
2087,0.0,0.999051,"[image, image, image, image, image, image, ima..."
1108,1.0,0.99875,"[thestreet, pf, market, detox, trade, wind, bl..."
1411,3.0,0.99852,"[inline, attachment, follows, scasey, tfsbroke..."
49,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,..."


### Create flag for text highest associated with topic 1
The topic 1 seems include the information about 'stock', 'market','price', et al.

In [15]:
topic_details['flag'] = np.where((topic_details['Dominant_Topic'] == 1.0), 1, 0)
topic_details_1 = topic_details[topic_details.flag == 1]
topic_details_1.sort_values(by=['% Score'], ascending=False).head(10)

Unnamed: 0,Dominant_Topic,% Score,Original text,flag
1108,1.0,0.99875,"[thestreet, pf, market, detox, trade, wind, bl...",1
49,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,...",1
129,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,...",1
1473,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,...",1
38,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,...",1
155,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,...",1
13,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,...",1
293,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,...",1
96,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,...",1
181,1.0,0.998179,"[today, announced, plan, merge, dynegy, major,...",1
