In [108]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
from sklearn.pipeline import Pipeline
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
nlp=spacy.load('en_core_web_sm')
doc=nlp('We just opened our wings, the flying part is coming soon')
for token in doc:
    if token.is_stop:
        print(token)

We
just
our
the
part
is


In [3]:
def preprocess(text):
    doc=nlp(text)
    no_stop_words=[token.text for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(no_stop_words)

In [4]:
preprocess('We just opened our wings, the flying part is coming soon')

'opened wings flying coming soon'

In [5]:
preprocess('The other is not other but your divine brother')

'divine brother'

In [6]:
df=pd.read_json('combined.json',lines=True)

In [7]:
df.shape

(13087, 6)

In [8]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [9]:
df.topics[0]

[]

In [10]:
type(df.topics[0])

list

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13087 entries, 0 to 13086
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          12810 non-null  object
 1   title       13087 non-null  object
 2   contents    13087 non-null  object
 3   date        13087 non-null  object
 4   topics      13087 non-null  object
 5   components  13087 non-null  object
dtypes: object(6)
memory usage: 613.6+ KB


In [12]:
df.describe()

Unnamed: 0,id,title,contents,date,topics,components
count,12810,13087,13087,13087,13087,13087
unique,12672,12887,13080,2400,253,810
top,13-526,Northern California Real Estate Investor Agree...,"WASHINGTON – ING Bank N.V., a financial inst...",2018-04-13T00:00:00-04:00,[],[Criminal Division]
freq,3,8,2,20,8399,2680


In [13]:
df=df[df['topics'].str.len()!=0]

In [14]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [15]:
df.shape

(4688, 6)

In [16]:
df['contents']

4        The U.S. Department of Justice, the U.S. Envir...
7        A 131-count criminal indictment was unsealed t...
19       The United States Attorney’s Office for the Mi...
22       21st Century Oncology LLC, has agreed to pay $...
23       21st Century Oncology Inc. and certain of its ...
                               ...                        
13081    Anthony Merrell Tyler, 34, of Yuba City, Calif...
13082    The Department of Justice and the U.S. Environ...
13084    Subsidiary Agrees to Plead Guilty to Violating...
13085    ZTE Corporation has agreed to enter a guilty p...
13086    ZTE Corporation pleaded guilty today to conspi...
Name: contents, Length: 4688, dtype: object

In [17]:
df['contents'].iloc[4]

'21st Century Oncology Inc. and certain of its subsidiaries and affiliates have agreed to pay $26 million to the government to resolve a self-disclosure relating to the submission of false attestations regarding the company’s use of electronic health records software and separate allegations that they violated the False Claims Act by submitting, or causing the submission of, claims for certain services provided pursuant to referrals from physicians with whom they had improper financial relationships. \xa0 “The Justice Department is committed to zealously investigating improper financial relationships that have the potential to compromise physicians’ medical judgment,” said Acting Assistant Attorney General Chad A. Readler of the Justice Department’s Civil Division.\xa0 “However, we will work with companies that accept responsibility for their past compliance failures and promptly take corrective action.”  \xa0 21st Century Oncology, which is headquartered in Fort Myers, Florida, owns a

In [18]:
len(df['contents'].iloc[4])

5504

In [19]:
df['contents_new']=df['contents'].apply(preprocess)

In [20]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],U.S. Department Justice U.S. Environmental Pro...
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 count criminal indictment unsealed today B...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],21st Century Oncology LLC agreed pay $ 19.75 m...
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiarie...


In [21]:
len(df['contents'].iloc[4])

5504

In [22]:
len(df['contents_new'].iloc[4])

4217

In [23]:
df['contents'].iloc[4][:300]

'21st Century Oncology Inc. and certain of its subsidiaries and affiliates have agreed to pay $26 million to the government to resolve a self-disclosure relating to the submission of false attestations regarding the company’s use of electronic health records software and separate allegations that the'

In [24]:
df['contents_new'].iloc[4][:300]

'21st Century Oncology Inc. certain subsidiaries affiliates agreed pay $ 26 million government resolve self disclosure relating submission false attestations company use electronic health records software separate allegations violated False Claims Act submitting causing submission claims certain serv'

In [26]:
sw=stopwords.words('english')

In [28]:
lemmatizer=WordNetLemmatizer()

In [29]:
def preprocessing(text):
    text=text.lower()
    text=re.sub('[^a-z]',' ',text)
    text=re.sub('\s+[a-z]\s+',' ',text)
    text=re.sub('\s+',' ',text)
    words=text.split()
    words=[word for word in words if word not in sw]
    words=[lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [30]:
df['contents_new']=df['contents_new'].apply(preprocessing)

In [31]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],u department justice environmental protection ...
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],count criminal indictment unsealed today bosto...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",united state attorney office middle district g...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],st century oncology llc agreed pay million gov...
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",st century oncology inc certain subsidiary aff...


In [32]:
df['contents'].iloc[4][:300]

'21st Century Oncology Inc. and certain of its subsidiaries and affiliates have agreed to pay $26 million to the government to resolve a self-disclosure relating to the submission of false attestations regarding the company’s use of electronic health records software and separate allegations that the'

In [33]:
df['contents_new'].iloc[4][:300]

'st century oncology inc certain subsidiary affiliate agreed pay million government resolve self disclosure relating submission false attestation company use electronic health record software separate allegation violated false claim act submitting causing submission claim certain service provided pur'

In [34]:
df['topics'].value_counts()

topics
[Tax]                                                                        706
[Consumer Protection]                                                        335
[Civil Rights]                                                               305
[Antitrust]                                                                  292
[Hate Crimes]                                                                246
                                                                            ... 
[Tax, Health Care Fraud]                                                       1
[Prescription Drugs, Consumer Protection, Health Care Fraud]                   1
[Civil Rights, Firearms Offenses]                                              1
[Antitrust, Financial Fraud, Securities, Commodities, & Investment Fraud]      1
[Hate Crimes, Civil Rights]                                                    1
Name: count, Length: 252, dtype: int64

In [73]:
df['topics'][4][0]

'Environment'

In [80]:
s=df['topics']

In [81]:
s

4                                            [Environment]
7                                    [Consumer Protection]
19                                           [Environment]
22                   [False Claims Act, Health Care Fraud]
23                   [Health Care Fraud, False Claims Act]
                               ...                        
13081                                        [Hate Crimes]
13082                                        [Environment]
13084                                 [Foreign Corruption]
13085    [Asset Forfeiture, Counterintelligence and Exp...
13086             [Counterintelligence and Export Control]
Name: topics, Length: 4688, dtype: object

In [90]:
label=[]
for i in s:
    label.append(i[0])

In [92]:
label[:10]

['Environment',
 'Consumer Protection',
 'Environment',
 'False Claims Act',
 'Health Care Fraud',
 'Consumer Protection',
 'Consumer Protection',
 'Opioids',
 'False Claims Act',
 'Drug Trafficking']

In [93]:
df['label']=label

In [94]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components,contents_new,label
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],u department justice environmental protection ...,Environment
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],count criminal indictment unsealed today bosto...,Consumer Protection
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",united state attorney office middle district g...,Environment
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],st century oncology llc agreed pay million gov...,False Claims Act
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",st century oncology inc certain subsidiary aff...,Health Care Fraud


In [95]:
le=LabelEncoder()
df['label']=le.fit_transform(df['label'])

In [96]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components,contents_new,label
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],u department justice environmental protection ...,13
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],count criminal indictment unsealed today bosto...,6
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",united state attorney office middle district g...,13
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],st century oncology llc agreed pay million gov...,14
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",st century oncology inc certain subsidiary aff...,20


In [98]:
X_train,X_test,y_train,y_test=train_test_split(df['contents_new'],df['label'],test_size=0.2,random_state=101)

In [99]:
X_train.shape

(3750,)

In [100]:
X_test.shape

(938,)

In [101]:
X_train

9378     jury sitting greensboro north carolina convict...
2974     federal court greenbelt maryland found marvin ...
9516     ohio man charged count indictment today allege...
683      washington attorney general eric holder today ...
4269     u district court judge marcia cooke sentenced ...
                               ...                        
11592    defendant prepared thousand false tax return f...
11791    grand jury miami florida indicted individual c...
1760     commonwealth pennsylvania pay million resolve ...
3879     jury convicted deputy jailer eastern kentucky ...
4515     president automotive part company pleaded guil...
Name: contents_new, Length: 3750, dtype: object

In [102]:
y_train

9378     39
2974     39
9516      9
683      33
4269      4
         ..
11592    39
11791     6
1760     14
3879      4
4515      1
Name: label, Length: 3750, dtype: int32

In [103]:
clf=Pipeline([
    ('cv',CountVectorizer()),
    ('nv',MultinomialNB())
])

In [104]:
clf.fit(X_train,y_train)

In [105]:
y_pred=clf.predict(X_test)

In [109]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.85      1.00      0.92        62
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.86      0.77      0.81        87
           5       0.00      0.00      0.00         0
           6       0.96      0.79      0.87        99
           7       0.95      0.67      0.78        27
           8       1.00      0.83      0.91        59
           9       0.67      0.86      0.75        14
          10       0.00      0.00      0.00         0
          11       0.42      0.56      0.48         9
          12       0.00      0.00      0.00         0
          13       0.90      0.90      0.90        48
          14       0.97      0.91      0.94        34
          15       0.61      0.48      0.54        23
          16       0.00      0.00      0.00         0
          17       0.14    

In [110]:
accuracy_score(y_pred,y_test)

0.8166311300639659