# Import Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords

from gensim import models
from gensim.corpora import Dictionary

import re
import string
import sys
import os
import pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_fileslist(dirname):
    '''
    Input: A directory containing SEC articles
    Output: A list of filepaths for the original data and the cleaned data
    
    Saves the two files lists into pickle files to be loaded later
    If pickle files already exist then just open instead of creating new pickle files
    '''
    if "fileslist.pkl" not in os.listdir("."):
        files_list = list()
        clean_files_list = list()
        for data_dir in os.listdir(f"./{dirname}"):
            if data_dir.startswith("Year_"):
                for fname in os.listdir(f"{dirname}/{data_dir}"):
                    files_list.append(f"{dirname}/{data_dir}/{fname}")
                    clean_files_list.append(f"Clean_{dirname}/{data_dir}/{fname}")
        filename = f"fileslist_{dirname}.pkl"
        with open(filename, "wb") as f:
            pickle.dump(files_list, f)
        clean_filename = f"cleanfileslist_{dirname}.pkl"
        with open(clean_filename, "wb") as f:
            pickle.dump(clean_files_list, f)
    else:
        with open(f"fileslist_{dirname}.pkl", "rb") as f:
            files_list = pickle.load(f)
        with open(f"cleanfileslist_{dirname}.pkl", "rb") as f:
            clean_files_list = pickle.load(f)
    return files_list, clean_files_list

# Cleaning

In [3]:
stop = stopwords.words('english')
punkt = list(string.punctuation)

lem = WordNetLemmatizer()

def stopword_remover(x): return ' '.join(
    [word for word in x.split() if word not in (stop)])


def cleanText(text: str):
    '''
    Input: Uncleaned text as string
    Output: Cleaned text as string
    '''
    
    #Remove URL
    url_remove = re.sub(
        r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})", '', text)
    lem = WordNetLemmatizer()
    sentences = sent_tokenize(text)
    document = list()
    doc_word_list = list()
    for sent in sentences:
        lemmas = list()
        #Text -> List of words
        words = word_tokenize(sent)
        pos_tags = pos_tag(words)
        word_new = list()
        for tag in pos_tags:
            if tag[1] not in ["NNP", "NNPS"]:
                #List of words -> List of words without proper nouns
                word_new.append(tag[0])
        for w in word_new:
            #List of words -> List of lemmas
            lem_word = lem.lemmatize(w)
            lemmas.append(lem_word)
        # List of lemmas -> Sentence
        sentence = ' '.join(lemmas)
        # Sentence -> List of sentences
        document.append(sentence)
    # List of sentences -> Text string
    clean_text = ' '.join(document)
    # To lower case
    clean_text = clean_text.lower()
    clean_text = re.sub(r'[^\w\s]', '', clean_text)
    clean_text = stopword_remover(clean_text)
    return clean_text

In [4]:
df = pd.DataFrame(columns=["Title", "Article_ID", "Date_Place", "Text", "Clean_Text"])

In [5]:
files_list, clean_files_list = load_fileslist("Data")
lem = WordNetLemmatizer()

if not os.path.exists("Clean_Data"):
    os.mkdir("Clean_Data")
    for dirname in os.listdir("Data"):
        os.mkdir(f"Clean_Data/{dirname}")

counter = 0
    
for fname, clean_fname in zip(files_list, clean_files_list):
    #print(f"Cleaning {fname}")
    with open(fname, "r") as f:
        content = f.read()
    if content != '':
        content = content.split("\n")
        title, art_id, place_date = content[:3]
        text = content[3:]
        text = '\n'.join(text)
        clean_text = cleanText(text)        
        sents = sent_tokenize(clean_text)
        document = list()
        for sent in sents:
            lem_words = list()
            words = word_tokenize(sent)
            for w in words:
                lem_word = lem.lemmatize(w)
                lem_words.append(lem_word)
            document += lem_words
            #print("Clean text of length:", len(clean_text))
        row = {"Title":title, "Article_ID":art_id, "Date_Place": place_date, 
               "Text":clean_text, "Clean_Text":str(document)}
        df = df.append(row, ignore_index = True)
        f1 = open(clean_fname, "w")
        f1.write(f"{title}\n{art_id}\n{place_date}\n{clean_text}\n")
        f1.close()
        counter += 1
    else:
        pass
print(f"Generated {counter} cleaned files")

Generated 2648 cleaned files


In [6]:
labels = []
fraud_words = set(["fraud", "misleading", "misled", "litigation"])

for _, row in df.iterrows():
    title = row["Title"]
    text = set(eval(row["Clean_Text"]))
    if "fraud" in title.lower(): #and text.intersection(fraud_words) != set():
        labels.append(1)
    else:
        labels.append(0)

df.insert(5, "Fraud", labels, True)

In [7]:
clean_texts = list()

for text_clean in df["Clean_Text"]:
    text_clean = eval(text_clean)
    clean_texts.append(text_clean)

In [8]:
dictionary = Dictionary()
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in clean_texts]
tfidf = models.TfidfModel(BoW_corpus, smartirs='ntc')
tfidf_corpus = tfidf[BoW_corpus]

In [9]:
lda_model_tfidf = models.LdaMulticore(tfidf_corpus, num_topics=3, id2word=dictionary, passes=2, workers=4)
lda_model_tfidf.save("lda.model")

In [10]:
print("Perplexity Score:", lda_model_tfidf.log_perplexity(tfidf_corpus))

Perplexity Score: -9.515253219595987


In [11]:
from pyLDAvis import gensim_models
import pyLDAvis

lda_display = gensim_models.prepare(lda_model_tfidf, BoW_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [12]:
top_list = []
def get_topic_details(ldamodel, corpus):
    topic_details_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]), ignore_index=True)
                top_list.append(topic_num)
    topic_details_df.columns = ['Dominant_Topic', '% Score']
    return topic_details_df, top_list

  and should_run_async(code)


In [13]:
contents = pd.DataFrame({'Original text': clean_texts})
t_df, toplist = get_topic_details(lda_model_tfidf, BoW_corpus)
topic_details = pd.concat([t_df, contents], axis=1)

# Create flag for text highest associated with topic 3
topic_details['flag'] = np.where((topic_details['Dominant_Topic'] == 1.0), 1, 0)
print(topic_details.head())

  and should_run_async(code)


   Dominant_Topic   % Score  \
0             1.0  0.521080   
1             0.0  0.499972   
2             0.0  0.994432   
3             0.0  0.994784   
4             0.0  0.845197   

                                       Original text  flag  
0  [first, joint, open, meeting, vote, rulemaking...     1  
1  [today, charged, swedish, national, living, co...     0  
2  [today, announced, filed, emergency, enforceme...     0  
3  [today, released, updated, roster, executive, ...     0  
4  [today, announced, charge, virginiabased, chie...     0  


In [14]:
from sklearn.metrics import classification_report, confusion_matrix, r2_score, roc_auc_score, f1_score

y_true = df['Fraud']
y_predict = topic_details['flag']

print(f"F1 Score: {f1_score(y_true, y_predict)}")

print('Classifcation report:\n', classification_report(y_true, y_predict))

class_names = np.array(['Non-Fraud', 'Fraud'])

F1 Score: 0.24076809453471198
Classifcation report:
               precision    recall  f1-score   support

           0       0.86      0.65      0.74      2257
           1       0.17      0.42      0.24       391

    accuracy                           0.61      2648
   macro avg       0.52      0.53      0.49      2648
weighted avg       0.76      0.61      0.67      2648



  and should_run_async(code)
