# Import Modules

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords

from gensim import models
from gensim.corpora import Dictionary

import re
import string
import sys
import os
import pickle

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  and should_run_async(code)


In [25]:
def load_fileslist(dirname):
    '''
    Input: A directory containing SEC articles
    Output: A list of filepaths for the original data and the cleaned data

    Walks every "Year_*" entry inside `dirname` and records, for each file
    found there, the path "{dirname}/{year}/{file}" together with the
    matching output path "Clean_{dirname}/{year}/{file}".

    Saves the two files lists into pickle files ("fileslist_{dirname}.pkl" and
    "cleanfileslist_{dirname}.pkl" in the current working directory) to be
    loaded later.
    If both pickle files already exist then they are loaded instead of
    scanning the directory again, so files added to `dirname` afterwards are
    only picked up once the pickle files are deleted.
    '''
    filename = f"fileslist_{dirname}.pkl"
    clean_filename = f"cleanfileslist_{dirname}.pkl"

    # The cache is keyed on the very file names written below; checking any
    # other name means the cache is never hit (or a missing file is opened).
    if os.path.isfile(filename) and os.path.isfile(clean_filename):
        # NOTE: pickle.load can execute arbitrary code. These files are
        # expected to be the ones written by this function; do not point it
        # at pickle files from an untrusted source.
        with open(filename, "rb") as f:
            files_list = pickle.load(f)
        with open(clean_filename, "rb") as f:
            clean_files_list = pickle.load(f)
        return files_list, clean_files_list

    files_list = list()
    clean_files_list = list()
    for data_dir in os.listdir(f"./{dirname}"):
        # Only the per-year sub-directories hold articles.
        if not data_dir.startswith("Year_"):
            continue
        for fname in os.listdir(f"{dirname}/{data_dir}"):
            files_list.append(f"{dirname}/{data_dir}/{fname}")
            clean_files_list.append(f"Clean_{dirname}/{data_dir}/{fname}")

    with open(filename, "wb") as f:
        pickle.dump(files_list, f)
    with open(clean_filename, "wb") as f:
        pickle.dump(clean_files_list, f)
    return files_list, clean_files_list

# Cleaning

In [26]:
# English stop words from the NLTK stopwords corpus; stopword_remover filters against this list.
stop = stopwords.words("english")
# Each character of string.punctuation as its own list entry.
punkt = [symbol for symbol in string.punctuation]

# Module-level WordNet lemmatizer instance.
lem = WordNetLemmatizer()

def stopword_remover(x):
    '''
    Input: Text as string
    Output: The same text without the words found in the module-level `stop` list

    The text is split on whitespace and the surviving words are joined back
    together with single spaces.
    '''
    kept_words = list()
    for token in x.split():
        if token in stop:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)


def cleanText(text: str):
    '''
    Input: Uncleaned text as string
    Output: Cleaned text as string

    Steps, in order:
      1. Strip URLs (http(s)://... and www....) from the text.
      2. Split into sentences, tokenize each sentence and POS-tag the tokens.
      3. Drop tokens tagged as proper nouns (NNP / NNPS).
      4. Lemmatize the remaining tokens with WordNetLemmatizer.
      5. Join everything back into one string and lower-case it.
      6. Remove every character that is neither a word character nor whitespace.
      7. Remove stop words via stopword_remover.
    '''
    # Remove URL
    text_without_urls = re.sub(
        r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})", '', text)
    lemmatizer = WordNetLemmatizer()
    document = list()
    # Tokenize the URL-free text; tokenizing the raw `text` here would
    # silently throw the URL removal above away.
    for sent in sent_tokenize(text_without_urls):
        # Text -> List of (word, POS tag) pairs
        tagged_words = pos_tag(word_tokenize(sent))
        # List of words -> List of words without proper nouns
        common_words = [word for word, tag in tagged_words
                        if tag not in ("NNP", "NNPS")]
        # List of words -> List of lemmas
        lemmas = [lemmatizer.lemmatize(word) for word in common_words]
        # List of lemmas -> Sentence -> List of sentences
        document.append(' '.join(lemmas))
    # List of sentences -> Text string, lower-cased
    clean_text = ' '.join(document).lower()
    # Keep only word characters and whitespace (drops punctuation)
    clean_text = re.sub(r'[^\w\s]', '', clean_text)
    clean_text = stopword_remover(clean_text)
    return clean_text

In [27]:
# Empty frame with the per-article columns; rows are added by the cleaning loop.
df = pd.DataFrame(
    columns=[
        "Title",
        "Article_ID",
        "Date_Place",
        "Text",
        "Clean_Text",
    ]
)

In [28]:
files_list, clean_files_list = load_fileslist("Data")
lem = WordNetLemmatizer()

# Rows are collected in a plain list and added to `df` in one step after the
# loop: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every iteration.
rows = list()

for fname, clean_fname in zip(files_list, clean_files_list):
    print(f"Cleaning {fname}")
    with open(fname, "r") as f:
        content = f.read()

    if content == '':
        print(f"WARNING: {fname} is empty!")
        continue

    lines = content.split("\n")
    # The first three lines are the title, article id and place/date header;
    # everything after that is the article body.
    if len(lines) < 3:
        print(f"WARNING: {fname} has fewer than 3 lines, skipping!")
        continue
    title, art_id, place_date = lines[:3]
    text = '\n'.join(lines[3:])
    clean_text = cleanText(text)

    # Tokenize the cleaned text and lemmatize it once more, giving a flat
    # list of tokens for the whole article.
    document = list()
    for sent in sent_tokenize(clean_text):
        document += [lem.lemmatize(w) for w in word_tokenize(sent)]
    print("Clean text of length:", len(clean_text))

    # NOTE(review): "Text" stores the cleaned string and "Clean_Text" the
    # stringified token list; the raw article text is not kept. Kept as in
    # the original - confirm this is what downstream cells expect.
    rows.append({"Title": title, "Article_ID": art_id, "Date_Place": place_date,
                 "Text": clean_text, "Clean_Text": str(document)})

    # Make sure the target folder exists before writing the cleaned file.
    clean_dir = os.path.dirname(clean_fname)
    if clean_dir:
        os.makedirs(clean_dir, exist_ok=True)
    with open(clean_fname, "w") as f1:
        f1.write(f"{title}\n{art_id}\n{place_date}\n{clean_text}\n")
    # Only reported when a cleaned file was actually written.
    print(f"Generated {clean_fname}")

# Append the new rows to whatever `df` already holds, keeping its column order.
new_rows = pd.DataFrame(rows, columns=df.columns)
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)

Cleaning Data/Year_2020/2020-264.txt
Generated Clean_Data/Year_2020/2020-264.txt
Cleaning Data/Year_2020/2020-232.txt
Clean text of length: 1275
Generated Clean_Data/Year_2020/2020-232.txt
Cleaning Data/Year_2020/2020-10.txt
Clean text of length: 1566
Generated Clean_Data/Year_2020/2020-10.txt
Cleaning Data/Year_2020/2020-108.txt
Clean text of length: 1204
Generated Clean_Data/Year_2020/2020-108.txt
Cleaning Data/Year_2020/2020-181.txt
Clean text of length: 1533
Generated Clean_Data/Year_2020/2020-181.txt
Cleaning Data/Year_2020/2020-99.txt
Clean text of length: 790
Generated Clean_Data/Year_2020/2020-99.txt
Cleaning Data/Year_2020/2020-86.txt
Clean text of length: 1433
Generated Clean_Data/Year_2020/2020-86.txt
Cleaning Data/Year_2020/2020-192.txt
Clean text of length: 2473
Generated Clean_Data/Year_2020/2020-192.txt
Cleaning Data/Year_2020/2020-41.txt
Clean text of length: 1767
Generated Clean_Data/Year_2020/2020-41.txt
Cleaning Data/Year_2020/2020-219.txt
Clean text of length: 6303


Clean text of length: 3822
Generated Clean_Data/Year_2020/2020-93.txt
Cleaning Data/Year_2020/2020-166.txt
Clean text of length: 1214
Generated Clean_Data/Year_2020/2020-166.txt
Cleaning Data/Year_2020/2020-169.txt
Clean text of length: 2491
Generated Clean_Data/Year_2020/2020-169.txt
Cleaning Data/Year_2020/2020-51.txt
Clean text of length: 860
Generated Clean_Data/Year_2020/2020-51.txt
Cleaning Data/Year_2020/2020-168.txt
Clean text of length: 1292
Generated Clean_Data/Year_2020/2020-168.txt
Cleaning Data/Year_2020/2020-344.txt
Clean text of length: 1281
Generated Clean_Data/Year_2020/2020-344.txt
Cleaning Data/Year_2020/2020-297.txt
Clean text of length: 1304
Generated Clean_Data/Year_2020/2020-297.txt
Cleaning Data/Year_2020/2020-228.txt
Clean text of length: 1355
Generated Clean_Data/Year_2020/2020-228.txt
Cleaning Data/Year_2020/2020-53.txt
Clean text of length: 3822
Generated Clean_Data/Year_2020/2020-53.txt
Cleaning Data/Year_2020/2020-276.txt
Clean text of length: 898
Generate

Clean text of length: 1609
Generated Clean_Data/Year_2020/2020-21.txt
Cleaning Data/Year_2020/2020-18.txt
Clean text of length: 1388
Generated Clean_Data/Year_2020/2020-18.txt
Cleaning Data/Year_2020/2020-17.txt
Clean text of length: 1306
Generated Clean_Data/Year_2020/2020-17.txt
Cleaning Data/Year_2020/2020-149.txt
Clean text of length: 1339
Generated Clean_Data/Year_2020/2020-149.txt
Cleaning Data/Year_2020/2020-193.txt
Clean text of length: 1597
Generated Clean_Data/Year_2020/2020-193.txt
Cleaning Data/Year_2020/2020-129.txt
Clean text of length: 789
Generated Clean_Data/Year_2020/2020-129.txt
Cleaning Data/Year_2020/2020-201.txt
Clean text of length: 1243
Generated Clean_Data/Year_2020/2020-201.txt
Cleaning Data/Year_2020/2020-157.txt
Clean text of length: 1416
Generated Clean_Data/Year_2020/2020-157.txt
Cleaning Data/Year_2020/2020-335.txt
Clean text of length: 6746
Generated Clean_Data/Year_2020/2020-335.txt
Cleaning Data/Year_2020/2020-230.txt
Clean text of length: 1108
Generat

Clean text of length: 5850
Generated Clean_Data/Year_2020/2020-161.txt
Cleaning Data/Year_2020/2020-83.txt
Clean text of length: 6016
Generated Clean_Data/Year_2020/2020-83.txt
Cleaning Data/Year_2020/2020-121.txt
Clean text of length: 1240
Generated Clean_Data/Year_2020/2020-121.txt
Cleaning Data/Year_2020/2020-274.txt
Clean text of length: 2301
Generated Clean_Data/Year_2020/2020-274.txt
Cleaning Data/Year_2020/2020-199.txt
Clean text of length: 1074
Generated Clean_Data/Year_2020/2020-199.txt
Cleaning Data/Year_2020/2020-271.txt
Clean text of length: 1344
Generated Clean_Data/Year_2020/2020-271.txt
Cleaning Data/Year_2020/2020-43.txt
Clean text of length: 1864
Generated Clean_Data/Year_2020/2020-43.txt
Cleaning Data/Year_2020/2020-205.txt
Clean text of length: 1849
Generated Clean_Data/Year_2020/2020-205.txt
Cleaning Data/Year_2020/2020-182.txt
Clean text of length: 1771
Generated Clean_Data/Year_2020/2020-182.txt
Cleaning Data/Year_2020/2020-133.txt
Clean text of length: 1223
Gener

Clean text of length: 3528
Generated Clean_Data/Year_2020/2020-245-0.txt
Cleaning Data/Year_2020/2020-2.txt
Clean text of length: 1694
Generated Clean_Data/Year_2020/2020-2.txt
Cleaning Data/Year_2020/2020-330.txt
Clean text of length: 1710
Generated Clean_Data/Year_2020/2020-330.txt
Cleaning Data/Year_2020/2020-59.txt
Clean text of length: 1266
Generated Clean_Data/Year_2020/2020-59.txt
Cleaning Data/Year_2020/2020-3.txt
Clean text of length: 945
Generated Clean_Data/Year_2020/2020-3.txt
Cleaning Data/Year_2020/2020-227.txt
Clean text of length: 6540
Generated Clean_Data/Year_2020/2020-227.txt
Cleaning Data/Year_2020/2020-144.txt
Clean text of length: 1276
Generated Clean_Data/Year_2020/2020-144.txt
Cleaning Data/Year_2020/2020-300.txt
Clean text of length: 1399
Generated Clean_Data/Year_2020/2020-300.txt
Cleaning Data/Year_2020/2020-159.txt
Clean text of length: 1897
Generated Clean_Data/Year_2020/2020-159.txt
Cleaning Data/Year_2020/2020-102.txt
Clean text of length: 1141
Generated 

Clean text of length: 2156
Generated Clean_Data/Year_2018/2018-71.txt
Cleaning Data/Year_2018/2018-202.txt
Clean text of length: 916
Generated Clean_Data/Year_2018/2018-202.txt
Cleaning Data/Year_2018/2018-178.txt
Clean text of length: 1281
Generated Clean_Data/Year_2018/2018-178.txt
Cleaning Data/Year_2018/2018-74.txt
Clean text of length: 1674
Generated Clean_Data/Year_2018/2018-74.txt
Cleaning Data/Year_2018/2018-293.txt
Clean text of length: 2376
Generated Clean_Data/Year_2018/2018-293.txt
Cleaning Data/Year_2018/2018-281.txt
Clean text of length: 1459
Generated Clean_Data/Year_2018/2018-281.txt
Cleaning Data/Year_2018/2018-135.txt
Clean text of length: 2118
Generated Clean_Data/Year_2018/2018-135.txt
Cleaning Data/Year_2018/2018-20.txt
Clean text of length: 1125
Generated Clean_Data/Year_2018/2018-20.txt
Cleaning Data/Year_2018/2018-290.txt
Clean text of length: 1533
Generated Clean_Data/Year_2018/2018-290.txt
Cleaning Data/Year_2018/2018-232.txt
Clean text of length: 1647
Generat

Clean text of length: 1034
Generated Clean_Data/Year_2018/2018-288.txt
Cleaning Data/Year_2018/2018-98.txt
Clean text of length: 1312
Generated Clean_Data/Year_2018/2018-98.txt
Cleaning Data/Year_2018/2018-195.txt
Clean text of length: 1160
Generated Clean_Data/Year_2018/2018-195.txt
Cleaning Data/Year_2018/2018-84.txt
Clean text of length: 921
Generated Clean_Data/Year_2018/2018-84.txt
Cleaning Data/Year_2018/2018-113.txt
Clean text of length: 1159
Generated Clean_Data/Year_2018/2018-113.txt
Cleaning Data/Year_2018/2018-68.txt
Clean text of length: 4741
Generated Clean_Data/Year_2018/2018-68.txt
Cleaning Data/Year_2018/2018-156.txt
Clean text of length: 1602
Generated Clean_Data/Year_2018/2018-156.txt
Cleaning Data/Year_2018/2018-118.txt
Clean text of length: 3314
Generated Clean_Data/Year_2018/2018-118.txt
Cleaning Data/Year_2018/2018-243.txt
Clean text of length: 2087
Generated Clean_Data/Year_2018/2018-243.txt
Cleaning Data/Year_2018/2018-55.txt
Clean text of length: 1188
Generated

Generated Clean_Data/Year_2018/2018-161.txt
Cleaning Data/Year_2018/2018-25.txt
Clean text of length: 1005
Generated Clean_Data/Year_2018/2018-25.txt
Cleaning Data/Year_2018/2018-152.txt
Clean text of length: 1379
Generated Clean_Data/Year_2018/2018-152.txt
Cleaning Data/Year_2018/2018-86.txt
Clean text of length: 983
Generated Clean_Data/Year_2018/2018-86.txt
Cleaning Data/Year_2018/2018-129.txt
Clean text of length: 953
Generated Clean_Data/Year_2018/2018-129.txt
Cleaning Data/Year_2018/2018-228.txt
Clean text of length: 1806
Generated Clean_Data/Year_2018/2018-228.txt
Cleaning Data/Year_2018/2018-23.txt
Clean text of length: 1436
Generated Clean_Data/Year_2018/2018-23.txt
Cleaning Data/Year_2018/2018-117.txt
Clean text of length: 1487
Generated Clean_Data/Year_2018/2018-117.txt
Cleaning Data/Year_2018/2018-175.txt
Clean text of length: 1148
Generated Clean_Data/Year_2018/2018-175.txt
Cleaning Data/Year_2018/2018-38.txt
Clean text of length: 1250
Generated Clean_Data/Year_2018/2018-3

Clean text of length: 1373
Generated Clean_Data/Year_2018/2018-276.txt
Cleaning Data/Year_2018/2018-157.txt
Clean text of length: 2000
Generated Clean_Data/Year_2018/2018-157.txt
Cleaning Data/Year_2018/2018-292.txt
Clean text of length: 2139
Generated Clean_Data/Year_2018/2018-292.txt
Cleaning Data/Year_2021/2021-1.txt
Clean text of length: 1763
Generated Clean_Data/Year_2021/2021-1.txt
Cleaning Data/Year_2021/2021-44.txt
Clean text of length: 1059
Generated Clean_Data/Year_2021/2021-44.txt
Cleaning Data/Year_2021/2021-2.txt
Clean text of length: 2036
Generated Clean_Data/Year_2021/2021-2.txt
Cleaning Data/Year_2021/2021-49.txt
Clean text of length: 1811
Generated Clean_Data/Year_2021/2021-49.txt
Cleaning Data/Year_2021/2021-40.txt
Clean text of length: 899
Generated Clean_Data/Year_2021/2021-40.txt
Cleaning Data/Year_2021/2021-36.txt
Clean text of length: 615
Generated Clean_Data/Year_2021/2021-36.txt
Cleaning Data/Year_2021/2021-47.txt
Clean text of length: 940
Generated Clean_Data/

Generated Clean_Data/Year_2019/2019-111.txt
Cleaning Data/Year_2019/2019-179.txt
Clean text of length: 1375
Generated Clean_Data/Year_2019/2019-179.txt
Cleaning Data/Year_2019/2019-142.txt
Clean text of length: 1642
Generated Clean_Data/Year_2019/2019-142.txt
Cleaning Data/Year_2019/2019-267.txt
Clean text of length: 1404
Generated Clean_Data/Year_2019/2019-267.txt
Cleaning Data/Year_2019/2019-236.txt
Clean text of length: 1355
Generated Clean_Data/Year_2019/2019-236.txt
Cleaning Data/Year_2019/2019-204.txt
Clean text of length: 1961
Generated Clean_Data/Year_2019/2019-204.txt
Cleaning Data/Year_2019/2019-16.txt
Clean text of length: 1395
Generated Clean_Data/Year_2019/2019-16.txt
Cleaning Data/Year_2019/2019-6.txt
Clean text of length: 1163
Generated Clean_Data/Year_2019/2019-6.txt
Cleaning Data/Year_2019/2019-177.txt
Clean text of length: 1402
Generated Clean_Data/Year_2019/2019-177.txt
Cleaning Data/Year_2019/2019-250.txt
Clean text of length: 581
Generated Clean_Data/Year_2019/2019

Clean text of length: 1292
Generated Clean_Data/Year_2019/2019-244.txt
Cleaning Data/Year_2019/2019-26.txt
Clean text of length: 1539
Generated Clean_Data/Year_2019/2019-26.txt
Cleaning Data/Year_2019/2019-194.txt
Clean text of length: 1893
Generated Clean_Data/Year_2019/2019-194.txt
Cleaning Data/Year_2019/2019-9.txt
Clean text of length: 1938
Generated Clean_Data/Year_2019/2019-9.txt
Cleaning Data/Year_2019/2019-7.txt
Clean text of length: 917
Generated Clean_Data/Year_2019/2019-7.txt
Cleaning Data/Year_2019/2019-76.txt
Clean text of length: 1748
Generated Clean_Data/Year_2019/2019-76.txt
Cleaning Data/Year_2019/2019-11.txt
Clean text of length: 410
Generated Clean_Data/Year_2019/2019-11.txt
Cleaning Data/Year_2019/2019-159.txt
Clean text of length: 1187
Generated Clean_Data/Year_2019/2019-159.txt
Cleaning Data/Year_2019/2019-2.txt
Clean text of length: 1552
Generated Clean_Data/Year_2019/2019-2.txt
Cleaning Data/Year_2019/2019-27.txt
Clean text of length: 1485
Generated Clean_Data/Y

Clean text of length: 1590
Generated Clean_Data/Year_2019/2019-166.txt
Cleaning Data/Year_2019/2019-262.txt
Clean text of length: 2841
Generated Clean_Data/Year_2019/2019-262.txt
Cleaning Data/Year_2019/2019-144.txt
Clean text of length: 1582
Generated Clean_Data/Year_2019/2019-144.txt
Cleaning Data/Year_2019/2019-261.txt
Clean text of length: 1301
Generated Clean_Data/Year_2019/2019-261.txt
Cleaning Data/Year_2019/2019-153.txt
Clean text of length: 1394
Generated Clean_Data/Year_2019/2019-153.txt
Cleaning Data/Year_2019/2019-91.txt
Clean text of length: 1477
Generated Clean_Data/Year_2019/2019-91.txt
Cleaning Data/Year_2019/2019-42.txt
Clean text of length: 1274
Generated Clean_Data/Year_2019/2019-42.txt
Cleaning Data/Year_2019/2019-203.txt
Clean text of length: 1663
Generated Clean_Data/Year_2019/2019-203.txt
Cleaning Data/Year_2019/2019-187.txt
Clean text of length: 1062
Generated Clean_Data/Year_2019/2019-187.txt
Cleaning Data/Year_2019/2019-18.txt
Clean text of length: 874
Generat

Clean text of length: 1539
Generated Clean_Data/Year_2019/2019-195.txt
Cleaning Data/Year_2019/2019-173.txt
Clean text of length: 1022
Generated Clean_Data/Year_2019/2019-173.txt
Cleaning Data/Year_2019/2019-10.txt
Clean text of length: 1307
Generated Clean_Data/Year_2019/2019-10.txt
Cleaning Data/Year_2019/2019-99.txt
Clean text of length: 1294
Generated Clean_Data/Year_2019/2019-99.txt
Cleaning Data/Year_2019/2019-221.txt
Clean text of length: 848
Generated Clean_Data/Year_2019/2019-221.txt
Cleaning Data/Year_2019/2019-133.txt
Clean text of length: 1570
Generated Clean_Data/Year_2019/2019-133.txt
Cleaning Data/Year_2012/2012-2012-40htm.txt
Clean text of length: 1964
Generated Clean_Data/Year_2012/2012-2012-40htm.txt
Cleaning Data/Year_2012/2012-2012-134htm.txt
Clean text of length: 1686
Generated Clean_Data/Year_2012/2012-2012-134htm.txt
Cleaning Data/Year_2012/2012-2012-46htm.txt
Clean text of length: 1907
Generated Clean_Data/Year_2012/2012-2012-46htm.txt
Cleaning Data/Year_2012/20

Clean text of length: 1724
Generated Clean_Data/Year_2012/2012-2012-38htm.txt
Cleaning Data/Year_2012/2012-2012-51htm.txt
Clean text of length: 977
Generated Clean_Data/Year_2012/2012-2012-51htm.txt
Cleaning Data/Year_2012/2012-2012-142htm.txt
Clean text of length: 2378
Generated Clean_Data/Year_2012/2012-2012-142htm.txt
Cleaning Data/Year_2012/2012-2012-61htm.txt
Clean text of length: 2879
Generated Clean_Data/Year_2012/2012-2012-61htm.txt
Cleaning Data/Year_2012/2012-2012-233htm.txt
Clean text of length: 6145
Generated Clean_Data/Year_2012/2012-2012-233htm.txt
Cleaning Data/Year_2012/2012-2012-220htm.txt
Clean text of length: 807
Generated Clean_Data/Year_2012/2012-2012-220htm.txt
Cleaning Data/Year_2012/2012-2012-151htm.txt
Clean text of length: 1200
Generated Clean_Data/Year_2012/2012-2012-151htm.txt
Cleaning Data/Year_2012/2012-2012-271htm.txt
Clean text of length: 2780
Generated Clean_Data/Year_2012/2012-2012-271htm.txt
Cleaning Data/Year_2012/2012-2012-98htm.txt
Clean text of le

Clean text of length: 2965
Generated Clean_Data/Year_2012/2012-2012-159htm.txt
Cleaning Data/Year_2012/2012-2012-148htm.txt
Clean text of length: 3018
Generated Clean_Data/Year_2012/2012-2012-148htm.txt
Cleaning Data/Year_2012/2012-2012-48htm.txt
Clean text of length: 1627
Generated Clean_Data/Year_2012/2012-2012-48htm.txt
Cleaning Data/Year_2012/2012-2012-167htm.txt
Clean text of length: 2082
Generated Clean_Data/Year_2012/2012-2012-167htm.txt
Cleaning Data/Year_2012/2012-2012-111htm.txt
Clean text of length: 1161
Generated Clean_Data/Year_2012/2012-2012-111htm.txt
Cleaning Data/Year_2012/2012-2012-161htm.txt
Clean text of length: 1417
Generated Clean_Data/Year_2012/2012-2012-161htm.txt
Cleaning Data/Year_2012/2012-2012-22htm.txt
Clean text of length: 2741
Generated Clean_Data/Year_2012/2012-2012-22htm.txt
Cleaning Data/Year_2012/2012-2012-157htm.txt
Clean text of length: 2065
Generated Clean_Data/Year_2012/2012-2012-157htm.txt
Cleaning Data/Year_2012/2012-2012-112htm.txt
Clean text o

Generated Clean_Data/Year_2012/2012-2012-50htm.txt
Cleaning Data/Year_2012/2012-2012-87htm.txt
Clean text of length: 2161
Generated Clean_Data/Year_2012/2012-2012-87htm.txt
Cleaning Data/Year_2012/2012-2012-145htm.txt
Clean text of length: 1947
Generated Clean_Data/Year_2012/2012-2012-145htm.txt
Cleaning Data/Year_2012/2012-2012-274htm.txt
Clean text of length: 1356
Generated Clean_Data/Year_2012/2012-2012-274htm.txt
Cleaning Data/Year_2012/2012-2012-99htm.txt
Clean text of length: 2637
Generated Clean_Data/Year_2012/2012-2012-99htm.txt
Cleaning Data/Year_2012/2012-2012-195htm.txt
Clean text of length: 2363
Generated Clean_Data/Year_2012/2012-2012-195htm.txt
Cleaning Data/Year_2012/2012-2012-107htm.txt
Clean text of length: 2706
Generated Clean_Data/Year_2012/2012-2012-107htm.txt
Cleaning Data/Year_2012/2012-2012-32htm.txt
Clean text of length: 2755
Generated Clean_Data/Year_2012/2012-2012-32htm.txt
Cleaning Data/Year_2012/2012-2012-118htm.txt
Clean text of length: 1519
Generated Clean

Clean text of length: 2365
Generated Clean_Data/Year_2012/2012-2012-156htm.txt
Cleaning Data/Year_2012/2012-2012-227htm.txt
Clean text of length: 4999
Generated Clean_Data/Year_2012/2012-2012-227htm.txt
Cleaning Data/Year_2012/2012-2012-96htm.txt
Clean text of length: 1597
Generated Clean_Data/Year_2012/2012-2012-96htm.txt
Cleaning Data/Year_2012/2012-2012-181htm.txt
Clean text of length: 2992
Generated Clean_Data/Year_2012/2012-2012-181htm.txt
Cleaning Data/Year_2012/2012-2012-128htm.txt
Clean text of length: 1278
Generated Clean_Data/Year_2012/2012-2012-128htm.txt
Cleaning Data/Year_2012/2012-2012-29htm.txt
Clean text of length: 1529
Generated Clean_Data/Year_2012/2012-2012-29htm.txt
Cleaning Data/Year_2012/2012-2012-54htm.txt
Clean text of length: 1422
Generated Clean_Data/Year_2012/2012-2012-54htm.txt
Cleaning Data/Year_2012/2012-2012-244htm.txt
Clean text of length: 1645
Generated Clean_Data/Year_2012/2012-2012-244htm.txt
Cleaning Data/Year_2012/2012-2012-205htm.txt
Clean text of 

Generated Clean_Data/Year_2015/2015-196.txt
Cleaning Data/Year_2015/2015-61.txt
Clean text of length: 1325
Generated Clean_Data/Year_2015/2015-61.txt
Cleaning Data/Year_2015/2015-263.txt
Clean text of length: 1184
Generated Clean_Data/Year_2015/2015-263.txt
Cleaning Data/Year_2015/2015-184.txt
Clean text of length: 1716
Generated Clean_Data/Year_2015/2015-184.txt
Cleaning Data/Year_2015/2015-18.txt
Clean text of length: 1585
Generated Clean_Data/Year_2015/2015-18.txt
Cleaning Data/Year_2015/2015-37.txt
Clean text of length: 1396
Generated Clean_Data/Year_2015/2015-37.txt
Cleaning Data/Year_2015/2015-38.txt
Clean text of length: 1390
Generated Clean_Data/Year_2015/2015-38.txt
Cleaning Data/Year_2015/2015-123.txt
Clean text of length: 1594
Generated Clean_Data/Year_2015/2015-123.txt
Cleaning Data/Year_2015/2015-240.txt
Clean text of length: 1544
Generated Clean_Data/Year_2015/2015-240.txt
Cleaning Data/Year_2015/2015-286.txt
Clean text of length: 715
Generated Clean_Data/Year_2015/2015-2

Clean text of length: 1431
Generated Clean_Data/Year_2015/2015-187.txt
Cleaning Data/Year_2015/2015-21.txt
Clean text of length: 795
Generated Clean_Data/Year_2015/2015-21.txt
Cleaning Data/Year_2015/2015-129.txt
Clean text of length: 1024
Generated Clean_Data/Year_2015/2015-129.txt
Cleaning Data/Year_2015/2015-75.txt
Clean text of length: 1886
Generated Clean_Data/Year_2015/2015-75.txt
Cleaning Data/Year_2015/2015-79.txt
Clean text of length: 702
Generated Clean_Data/Year_2015/2015-79.txt
Cleaning Data/Year_2015/2015-169.txt
Clean text of length: 1175
Generated Clean_Data/Year_2015/2015-169.txt
Cleaning Data/Year_2015/2015-276.txt
Clean text of length: 4756
Generated Clean_Data/Year_2015/2015-276.txt
Cleaning Data/Year_2015/2015-202.txt
Clean text of length: 1434
Generated Clean_Data/Year_2015/2015-202.txt
Cleaning Data/Year_2015/2015-219.txt
Clean text of length: 946
Generated Clean_Data/Year_2015/2015-219.txt
Cleaning Data/Year_2015/2015-46.txt
Clean text of length: 1774
Generated C

Clean text of length: 761
Generated Clean_Data/Year_2015/2015-83.txt
Cleaning Data/Year_2015/2015-234.txt
Clean text of length: 1443
Generated Clean_Data/Year_2015/2015-234.txt
Cleaning Data/Year_2015/2015-50.txt
Clean text of length: 1741
Generated Clean_Data/Year_2015/2015-50.txt
Cleaning Data/Year_2015/2015-70.txt
Clean text of length: 1179
Generated Clean_Data/Year_2015/2015-70.txt
Cleaning Data/Year_2015/2015-5.txt
Clean text of length: 1006
Generated Clean_Data/Year_2015/2015-5.txt
Cleaning Data/Year_2015/2015-65.txt
Clean text of length: 2411
Generated Clean_Data/Year_2015/2015-65.txt
Cleaning Data/Year_2015/2015-227.txt
Clean text of length: 1669
Generated Clean_Data/Year_2015/2015-227.txt
Cleaning Data/Year_2015/2015-74.txt
Clean text of length: 1298
Generated Clean_Data/Year_2015/2015-74.txt
Cleaning Data/Year_2015/2015-116.txt
Clean text of length: 1413
Generated Clean_Data/Year_2015/2015-116.txt
Cleaning Data/Year_2015/2015-287.txt
Clean text of length: 1764
Generated Clean

Clean text of length: 1494
Generated Clean_Data/Year_2016/2016-83.txt
Cleaning Data/Year_2016/2016-96.txt
Clean text of length: 1056
Generated Clean_Data/Year_2016/2016-96.txt
Cleaning Data/Year_2016/2016-149.txt
Clean text of length: 2187
Generated Clean_Data/Year_2016/2016-149.txt
Cleaning Data/Year_2016/2016-151.txt
Clean text of length: 1368
Generated Clean_Data/Year_2016/2016-151.txt
Cleaning Data/Year_2016/2016-168.txt
Clean text of length: 1224
Generated Clean_Data/Year_2016/2016-168.txt
Cleaning Data/Year_2016/2016-264.txt
Clean text of length: 1783
Generated Clean_Data/Year_2016/2016-264.txt
Cleaning Data/Year_2016/2016-25.txt
Clean text of length: 3881
Generated Clean_Data/Year_2016/2016-25.txt
Cleaning Data/Year_2016/2016-161.txt
Clean text of length: 1257
Generated Clean_Data/Year_2016/2016-161.txt
Cleaning Data/Year_2016/2016-155.txt
Clean text of length: 2425
Generated Clean_Data/Year_2016/2016-155.txt
Cleaning Data/Year_2016/2016-275.txt
Clean text of length: 931
Generat

Generated Clean_Data/Year_2016/2016-203.txt
Cleaning Data/Year_2016/2016-267.txt
Clean text of length: 1863
Generated Clean_Data/Year_2016/2016-267.txt
Cleaning Data/Year_2016/2016-248.txt
Clean text of length: 807
Generated Clean_Data/Year_2016/2016-248.txt
Cleaning Data/Year_2016/2016-65.txt
Clean text of length: 2058
Generated Clean_Data/Year_2016/2016-65.txt
Cleaning Data/Year_2016/2016-37.txt
Clean text of length: 1607
Generated Clean_Data/Year_2016/2016-37.txt
Cleaning Data/Year_2016/2016-194.txt
Clean text of length: 1516
Generated Clean_Data/Year_2016/2016-194.txt
Cleaning Data/Year_2016/2016-38.txt
Clean text of length: 1146
Generated Clean_Data/Year_2016/2016-38.txt
Cleaning Data/Year_2016/2016-263.txt
Clean text of length: 2454
Generated Clean_Data/Year_2016/2016-263.txt
Cleaning Data/Year_2016/2016-36.txt
Clean text of length: 1071
Generated Clean_Data/Year_2016/2016-36.txt
Cleaning Data/Year_2016/2016-214.txt
Clean text of length: 1102
Generated Clean_Data/Year_2016/2016-2

Generated Clean_Data/Year_2016/2016-136.txt
Cleaning Data/Year_2016/2016-230.txt
Clean text of length: 1118
Generated Clean_Data/Year_2016/2016-230.txt
Cleaning Data/Year_2016/2016-47.txt
Clean text of length: 1818
Generated Clean_Data/Year_2016/2016-47.txt
Cleaning Data/Year_2016/2016-147.txt
Clean text of length: 1620
Generated Clean_Data/Year_2016/2016-147.txt
Cleaning Data/Year_2016/2016-241.txt
Clean text of length: 1432
Generated Clean_Data/Year_2016/2016-241.txt
Cleaning Data/Year_2016/2016-242.txt
Clean text of length: 1466
Generated Clean_Data/Year_2016/2016-242.txt
Cleaning Data/Year_2016/2016-45.txt
Clean text of length: 1493
Generated Clean_Data/Year_2016/2016-45.txt
Cleaning Data/Year_2016/2016-123.txt
Clean text of length: 1679
Generated Clean_Data/Year_2016/2016-123.txt
Cleaning Data/Year_2016/2016-81.txt
Clean text of length: 2218
Generated Clean_Data/Year_2016/2016-81.txt
Cleaning Data/Year_2016/2016-68.txt
Clean text of length: 2134
Generated Clean_Data/Year_2016/2016

Clean text of length: 1553
Generated Clean_Data/Year_2016/2016-269.txt
Cleaning Data/Year_2016/2016-282.txt
Clean text of length: 1420
Generated Clean_Data/Year_2016/2016-282.txt
Cleaning Data/Year_2016/2016-108.txt
Clean text of length: 941
Generated Clean_Data/Year_2016/2016-108.txt
Cleaning Data/Year_2016/2016-164.txt
Clean text of length: 1220
Generated Clean_Data/Year_2016/2016-164.txt
Cleaning Data/Year_2016/2016-229.txt
Clean text of length: 1503
Generated Clean_Data/Year_2016/2016-229.txt
Cleaning Data/Year_2016/2016-245.txt
Clean text of length: 2572
Generated Clean_Data/Year_2016/2016-245.txt
Cleaning Data/Year_2016/2016-103.txt
Clean text of length: 2416
Generated Clean_Data/Year_2016/2016-103.txt
Cleaning Data/Year_2016/2016-120.txt
Clean text of length: 916
Generated Clean_Data/Year_2016/2016-120.txt
Cleaning Data/Year_2016/2016-82.txt
Clean text of length: 170
Generated Clean_Data/Year_2016/2016-82.txt
Cleaning Data/Year_2016/2016-217.txt
Clean text of length: 991
Generat

Clean text of length: 963
Generated Clean_Data/Year_2013/2013-2013-70htm.txt
Cleaning Data/Year_2013/2013-193.txt
Clean text of length: 4726
Generated Clean_Data/Year_2013/2013-193.txt
Cleaning Data/Year_2013/2013-2013-17htm.txt
Clean text of length: 1334
Generated Clean_Data/Year_2013/2013-2013-17htm.txt
Cleaning Data/Year_2013/2013-241.txt
Clean text of length: 1886
Generated Clean_Data/Year_2013/2013-241.txt
Cleaning Data/Year_2013/2013-2013-53htm.txt
Clean text of length: 2516
Generated Clean_Data/Year_2013/2013-2013-53htm.txt
Cleaning Data/Year_2013/2013-2013-122htm.txt
Clean text of length: 1547
Generated Clean_Data/Year_2013/2013-2013-122htm.txt
Cleaning Data/Year_2013/2013-254.txt
Clean text of length: 1981
Generated Clean_Data/Year_2013/2013-254.txt
Cleaning Data/Year_2013/2013-2013-13htm.txt
Clean text of length: 2399
Generated Clean_Data/Year_2013/2013-2013-13htm.txt
Cleaning Data/Year_2013/2013-255.txt
Clean text of length: 1940
Generated Clean_Data/Year_2013/2013-255.txt
C

Generated Clean_Data/Year_2013/2013-152.txt
Cleaning Data/Year_2013/2013-214.txt
Clean text of length: 650
Generated Clean_Data/Year_2013/2013-214.txt
Cleaning Data/Year_2013/2013-199.txt
Clean text of length: 3315
Generated Clean_Data/Year_2013/2013-199.txt
Cleaning Data/Year_2013/2013-2013-27htm.txt
Clean text of length: 309
Generated Clean_Data/Year_2013/2013-2013-27htm.txt
Cleaning Data/Year_2013/2013-244.txt
Clean text of length: 2710
Generated Clean_Data/Year_2013/2013-244.txt
Cleaning Data/Year_2013/2013-211.txt
Clean text of length: 2386
Generated Clean_Data/Year_2013/2013-211.txt
Cleaning Data/Year_2013/2013-267.txt
Clean text of length: 491
Generated Clean_Data/Year_2013/2013-267.txt
Cleaning Data/Year_2013/2013-2013-35htm.txt
Clean text of length: 988
Generated Clean_Data/Year_2013/2013-2013-35htm.txt
Cleaning Data/Year_2013/mark-kronforst-named-chief-accountant-division-corporation.txt
Clean text of length: 755
Generated Clean_Data/Year_2013/mark-kronforst-named-chief-accou

Clean text of length: 1826
Generated Clean_Data/Year_2013/2013-2013-79htm.txt
Cleaning Data/Year_2013/2013-125-sec-halts-texas-based-forex-trading-scheme.txt
Clean text of length: 2334
Generated Clean_Data/Year_2013/2013-125-sec-halts-texas-based-forex-trading-scheme.txt
Cleaning Data/Year_2013/2013-2013-7htm.txt
Clean text of length: 2057
Generated Clean_Data/Year_2013/2013-2013-7htm.txt
Cleaning Data/Year_2013/2013-154.txt
Clean text of length: 2245
Generated Clean_Data/Year_2013/2013-154.txt
Cleaning Data/Year_2013/2013-2013-61htm.txt
Clean text of length: 1649
Generated Clean_Data/Year_2013/2013-2013-61htm.txt
Cleaning Data/Year_2013/2013-2013-49htm.txt
Clean text of length: 2144
Generated Clean_Data/Year_2013/2013-2013-49htm.txt
Cleaning Data/Year_2013/2013-2013-75htm.txt
Clean text of length: 2426
Generated Clean_Data/Year_2013/2013-2013-75htm.txt
Cleaning Data/Year_2013/2013-189.txt
Clean text of length: 2695
Generated Clean_Data/Year_2013/2013-189.txt
Cleaning Data/Year_2013/20

Clean text of length: 1971
Generated Clean_Data/Year_2013/2013-259.txt
Cleaning Data/Year_2013/2013-183.txt
Clean text of length: 2547
Generated Clean_Data/Year_2013/2013-183.txt
Cleaning Data/Year_2013/2013-2013-65htm.txt
Clean text of length: 2229
Generated Clean_Data/Year_2013/2013-2013-65htm.txt
Cleaning Data/Year_2013/2013-274.txt
Clean text of length: 189
Generated Clean_Data/Year_2013/2013-274.txt
Cleaning Data/Year_2013/2013-2013-33htm.txt
Clean text of length: 1571
Generated Clean_Data/Year_2013/2013-2013-33htm.txt
Cleaning Data/Year_2013/2013-167.txt
Clean text of length: 1419
Generated Clean_Data/Year_2013/2013-167.txt
Cleaning Data/Year_2013/2013-205.txt
Clean text of length: 1824
Generated Clean_Data/Year_2013/2013-205.txt
Cleaning Data/Year_2014/2014-32.txt
Clean text of length: 504
Generated Clean_Data/Year_2014/2014-32.txt
Cleaning Data/Year_2014/2014-76.txt
Clean text of length: 1335
Generated Clean_Data/Year_2014/2014-76.txt
Cleaning Data/Year_2014/2014-80.txt
Clean t

Clean text of length: 1236
Generated Clean_Data/Year_2014/2014-9.txt
Cleaning Data/Year_2014/2014-176.txt
Clean text of length: 1082
Generated Clean_Data/Year_2014/2014-176.txt
Cleaning Data/Year_2014/2014-291.txt
Clean text of length: 1173
Generated Clean_Data/Year_2014/2014-291.txt
Cleaning Data/Year_2014/2014-243.txt
Clean text of length: 938
Generated Clean_Data/Year_2014/2014-243.txt
Cleaning Data/Year_2014/2014-280.txt
Clean text of length: 2217
Generated Clean_Data/Year_2014/2014-280.txt
Cleaning Data/Year_2014/2014-172.txt
Clean text of length: 2796
Generated Clean_Data/Year_2014/2014-172.txt
Cleaning Data/Year_2014/2014-247.txt
Clean text of length: 1827
Generated Clean_Data/Year_2014/2014-247.txt
Cleaning Data/Year_2014/2014-208.txt
Clean text of length: 1426
Generated Clean_Data/Year_2014/2014-208.txt
Cleaning Data/Year_2014/2014-42.txt
Clean text of length: 538
Generated Clean_Data/Year_2014/2014-42.txt
Cleaning Data/Year_2014/2014-115.txt
Clean text of length: 2308
Generat

Generated Clean_Data/Year_2014/2014-109.txt
Cleaning Data/Year_2014/2014-262.txt
Clean text of length: 1476
Generated Clean_Data/Year_2014/2014-262.txt
Cleaning Data/Year_2014/2014-264.txt
Clean text of length: 2024
Generated Clean_Data/Year_2014/2014-264.txt
Cleaning Data/Year_2014/2014-106.txt
Clean text of length: 2170
Generated Clean_Data/Year_2014/2014-106.txt
Cleaning Data/Year_2014/2014-180.txt
Clean text of length: 1220
Generated Clean_Data/Year_2014/2014-180.txt
Cleaning Data/Year_2014/2014-266.txt
Clean text of length: 1915
Generated Clean_Data/Year_2014/2014-266.txt
Cleaning Data/Year_2014/2014-229.txt
Clean text of length: 2579
Generated Clean_Data/Year_2014/2014-229.txt
Cleaning Data/Year_2014/2014-56.txt
Clean text of length: 942
Generated Clean_Data/Year_2014/2014-56.txt
Cleaning Data/Year_2014/2014-232.txt
Clean text of length: 2560
Generated Clean_Data/Year_2014/2014-232.txt
Cleaning Data/Year_2014/2014-125.txt
Clean text of length: 3106
Generated Clean_Data/Year_2014/

Clean text of length: 1409
Generated Clean_Data/Year_2014/2014-169.txt
Cleaning Data/Year_2014/2014-240.txt
Clean text of length: 1652
Generated Clean_Data/Year_2014/2014-240.txt
Cleaning Data/Year_2014/2014-191.txt
Clean text of length: 2119
Generated Clean_Data/Year_2014/2014-191.txt
Cleaning Data/Year_2014/2014-94.txt
Clean text of length: 1493
Generated Clean_Data/Year_2014/2014-94.txt
Cleaning Data/Year_2014/2014-58.txt
Clean text of length: 1052
Generated Clean_Data/Year_2014/2014-58.txt
Cleaning Data/Year_2014/2014-256.txt
Clean text of length: 2442
Generated Clean_Data/Year_2014/2014-256.txt
Cleaning Data/Year_2014/2014-177.txt
Clean text of length: 5282
Generated Clean_Data/Year_2014/2014-177.txt
Cleaning Data/Year_2014/2014-160.txt
Clean text of length: 1895
Generated Clean_Data/Year_2014/2014-160.txt
Cleaning Data/Year_2014/2014-271.txt
Clean text of length: 2609
Generated Clean_Data/Year_2014/2014-271.txt
Cleaning Data/Year_2014/2014-52.txt
Clean text of length: 1976
Genera

Clean text of length: 2088
Generated Clean_Data/Year_2017/2017-123.txt
Cleaning Data/Year_2017/2017-14.txt
Clean text of length: 1182
Generated Clean_Data/Year_2017/2017-14.txt
Cleaning Data/Year_2017/2017-173.txt
Clean text of length: 1108
Generated Clean_Data/Year_2017/2017-173.txt
Cleaning Data/Year_2017/2017-90.txt
Clean text of length: 1134
Generated Clean_Data/Year_2017/2017-90.txt
Cleaning Data/Year_2017/2017-169.txt
Clean text of length: 585
Generated Clean_Data/Year_2017/2017-169.txt
Cleaning Data/Year_2017/2017-165.txt
Clean text of length: 1321
Generated Clean_Data/Year_2017/2017-165.txt
Cleaning Data/Year_2017/2017-25.txt
Clean text of length: 1000
Generated Clean_Data/Year_2017/2017-25.txt
Cleaning Data/Year_2017/2017-130.txt
Clean text of length: 1192
Generated Clean_Data/Year_2017/2017-130.txt
Cleaning Data/Year_2017/2017-214.txt
Clean text of length: 875
Generated Clean_Data/Year_2017/2017-214.txt
Cleaning Data/Year_2017/2017-8.txt
Clean text of length: 1108
Generated C

Generated Clean_Data/Year_2017/2017-200-0.txt
Cleaning Data/Year_2017/2017-204.txt
Clean text of length: 1183
Generated Clean_Data/Year_2017/2017-204.txt
Cleaning Data/Year_2017/2017-91.txt
Clean text of length: 955
Generated Clean_Data/Year_2017/2017-91.txt
Cleaning Data/Year_2017/2017-220.txt
Clean text of length: 1416
Generated Clean_Data/Year_2017/2017-220.txt
Cleaning Data/Year_2017/2017-31.txt
Clean text of length: 1228
Generated Clean_Data/Year_2017/2017-31.txt
Cleaning Data/Year_2017/2017-5.txt
Clean text of length: 2073
Generated Clean_Data/Year_2017/2017-5.txt
Cleaning Data/Year_2017/2017-223.txt
Clean text of length: 1521
Generated Clean_Data/Year_2017/2017-223.txt
Cleaning Data/Year_2017/2017-29.txt
Clean text of length: 1661
Generated Clean_Data/Year_2017/2017-29.txt
Cleaning Data/Year_2017/2017-80.txt
Clean text of length: 1471
Generated Clean_Data/Year_2017/2017-80.txt
Cleaning Data/Year_2017/2017-53.txt
Clean text of length: 803
Generated Clean_Data/Year_2017/2017-53.tx

Clean text of length: 1375
Generated Clean_Data/Year_2017/2017-219.txt
Cleaning Data/Year_2017/2017-102.txt
Clean text of length: 1197
Generated Clean_Data/Year_2017/2017-102.txt
Cleaning Data/Year_2017/2017-148.txt
Clean text of length: 2137
Generated Clean_Data/Year_2017/2017-148.txt
Cleaning Data/Year_2017/2017-46.txt
Clean text of length: 1066
Generated Clean_Data/Year_2017/2017-46.txt
Cleaning Data/Year_2017/2017-24.txt
Clean text of length: 2206
Generated Clean_Data/Year_2017/2017-24.txt
Cleaning Data/Year_2017/2017-78.txt
Clean text of length: 2357
Generated Clean_Data/Year_2017/2017-78.txt
Cleaning Data/Year_2017/2017-135.txt
Clean text of length: 2031
Generated Clean_Data/Year_2017/2017-135.txt
Cleaning Data/Year_2017/2017-152.txt
Clean text of length: 789
Generated Clean_Data/Year_2017/2017-152.txt
Cleaning Data/Year_2017/2017-1.txt
Clean text of length: 1186
Generated Clean_Data/Year_2017/2017-1.txt
Cleaning Data/Year_2017/2017-199.txt
Clean text of length: 1048
Generated Cl

In [29]:
# Derive a binary fraud label for every article from its title alone:
# 1 when the title mentions "fraud" (case-insensitive), otherwise 0.
#fraud_words = set(["fraud", "misleading", "misled", "litigation"])
labels = [int("fraud" in title.lower()) for title in df["Title"]]

# The original insert used allow_duplicates=True, so re-running the cell added a
# second "Fraud" column and df["Fraud"] then returned a DataFrame instead of a
# Series. Overwrite the column when it already exists so the cell is idempotent.
if "Fraud" in df.columns:
    df["Fraud"] = labels
else:
    df.insert(5, "Fraud", labels)

In [30]:
# Preview the first rows; the newly added "Fraud" column should be visible here.
df.head()

Unnamed: 0,Title,Article_ID,Date_Place,Text,Clean_Text,Fraud
0,SEC.gov | SEC Charges Swedish National with Gl...,2020-232,"Washington D.C., Sept. 29, 2020 —",today charged swedish national living conducti...,"['today', 'charged', 'swedish', 'national', 'l...",1
1,"SEC.gov | SEC Obtains Emergency Asset Freeze, ...",2020-10,"Washington D.C., Jan. 14, 2020 —",today announced filed emergency enforcement ac...,"['today', 'announced', 'filed', 'emergency', '...",0
2,SEC.gov | Chairman Jay Clayton Announces Addit...,2020-108,"Washington D.C., May 11, 2020 —",today released updated roster executive staff ...,"['today', 'released', 'updated', 'roster', 'ex...",0
3,SEC.gov | SEC Charges Issuer and CEO With Misr...,2020-181,"Washington D.C., Aug. 13, 2020 —",today announced charge virginiabased chief exe...,"['today', 'announced', 'charge', 'virginiabase...",1
4,SEC.gov | Small Business Capital Formation Adv...,2020-99,"Washington D.C., April 28, 2020 —",today released agenda 8 meeting hosted via vid...,"['today', 'released', 'agenda', '8', 'meeting'...",0


In [31]:
import ast

# "Clean_Text" stores each token list as its string repr (e.g. "['today', 'charged']").
# ast.literal_eval parses Python literals only, so unlike eval() it cannot execute
# arbitrary code that might be embedded in the loaded data.
clean_texts = [ast.literal_eval(text_clean) for text_clean in df["Clean_Text"]]

In [32]:
# Build the vocabulary and the bag-of-words corpus in one pass: doc2bow with
# allow_update=True registers unseen tokens in the dictionary as it goes.
dictionary = Dictionary()
BoW_corpus = []
for tokens in clean_texts:
    BoW_corpus.append(dictionary.doc2bow(tokens, allow_update=True))

# Re-weight the bag-of-words counts with TF-IDF (SMART scheme 'ntc').
tfidf = models.TfidfModel(corpus=BoW_corpus, smartirs='ntc')
tfidf_corpus = tfidf[BoW_corpus]

In [33]:
# Train a 5-topic LDA model on the TF-IDF weighted corpus and persist it to disk.
# NOTE(review): no random seed is set, so topic numbering can differ between runs;
# any hard-coded topic index used later must be re-checked after retraining.
lda_model_tfidf = models.LdaMulticore(
    corpus=tfidf_corpus,
    num_topics=5,
    id2word=dictionary,
    passes=2,
    workers=4,
)
lda_model_tfidf.save("lda.model")

In [34]:
# gensim's log_perplexity returns a per-word likelihood bound (a log-scale value),
# not the perplexity itself, which is why the printed number is negative.
perplexity_bound = lda_model_tfidf.log_perplexity(tfidf_corpus)
print("Perplexity Score:", perplexity_bound)

Perplexity Score: -10.02292939362171


In [35]:
import pyLDAvis
from pyLDAvis import gensim_models

# Prepare the interactive topic visualisation; sort_topics=False keeps the
# model's own topic order instead of letting pyLDAvis re-sort the topics.
lda_display = gensim_models.prepare(
    lda_model_tfidf,
    BoW_corpus,
    dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display)

In [36]:
top_list = []  # module-level list kept for compatibility; holds the latest call's dominant topics


def get_topic_details(ldamodel, corpus):
    '''
    Input: A topic model that, when indexed with a corpus, yields for each document
           a list of (topic_num, prop_topic) pairs; and the corpus to score
    Output: A DataFrame with float columns 'Dominant_Topic' and '% Score'
            (one row per document that has at least one topic), and the list of
            dominant topic numbers in the same order

    Documents for which the model returns no topics are skipped.
    The module-level top_list is reset on every call, so re-running the
    calling cell no longer appends duplicate entries to it.
    '''
    records = []
    dominant_topics = []
    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            continue
        # max() returns the first pair with the highest proportion, matching what a
        # stable descending sort followed by taking element 0 would select.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        records.append((topic_num, prop_topic))
        dominant_topics.append(topic_num)

    # Build the frame once instead of appending row by row: DataFrame.append is
    # quadratic and was removed in pandas 2.0. Cast to float to keep the dtypes
    # the row-wise Series append used to produce.
    topic_details_df = pd.DataFrame(
        records, columns=['Dominant_Topic', '% Score']
    ).astype(float)

    top_list[:] = dominant_topics
    return topic_details_df, top_list

In [37]:
# Pair every document's dominant topic and score with its token list.
t_df, toplist = get_topic_details(lda_model_tfidf, BoW_corpus)
contents = pd.DataFrame({'Original text': clean_texts})
topic_details = pd.concat([t_df, contents], axis=1)

# Flag the documents whose dominant topic is topic 4; this flag is later used
# as the predicted label when scoring against the title-based "Fraud" column.
FLAGGED_TOPIC = 4.0
is_flagged = topic_details['Dominant_Topic'] == FLAGGED_TOPIC
topic_details['flag'] = np.where(is_flagged, 1, 0)
print(topic_details.head())

   Dominant_Topic   % Score  \
0             4.0  0.824884   
1             4.0  0.995902   
2             3.0  0.994467   
3             1.0  0.499016   
4             2.0  0.716019   

                                       Original text  flag  
0  [today, charged, swedish, national, living, co...     1  
1  [today, announced, filed, emergency, enforceme...     1  
2  [today, released, updated, roster, executive, ...     0  
3  [today, announced, charge, virginiabased, chie...     0  
4  [today, released, agenda, 8, meeting, hosted, ...     0  


In [38]:
from sklearn.metrics import classification_report, confusion_matrix, r2_score, roc_auc_score

# Ground truth comes from the title-based labels; the prediction is the
# dominant-topic flag computed for each document.
y_true = df['Fraud']
y_predict = topic_details['flag']

roc_auc = roc_auc_score(y_true, y_predict)
print(f"Roc_Auc: {roc_auc:0.3f}\n")

class_report = classification_report(y_true, y_predict)
print('Classifcation report:\n', class_report)

# Display names for the two classes (only needed by the disabled plot below).
class_names = np.array(['Non-Fraud', 'Fraud'])
#myML_functions.plot_confusion_matrix(y_true, y_predict, classes=class_names)
#plt.show()

Roc_Auc: 0.683

Classifcation report:
               precision    recall  f1-score   support

           0       0.91      0.84      0.88      2249
           1       0.36      0.52      0.43       390

    accuracy                           0.79      2639
   macro avg       0.64      0.68      0.65      2639
weighted avg       0.83      0.79      0.81      2639



In [39]:
from sklearn.metrics import f1_score

In [40]:
# F1 for the positive (fraud) class, comparing the topic flag to the title labels.
f1_Score = f1_score(y_true=y_true, y_pred=y_predict)
print("F1 Score:", f1_Score)

F1 Score: 0.4299262381454162
