# Importing Libraries

In [1]:
#importing libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split


In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aastha78\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading the Data

In [3]:
data = pd.read_csv('final_data.csv')

In [4]:
text = data.iloc[:,-1]
text.head()

0    Agency move court invoke UAPA Suspecting accus...
1    Police found around 40 gram marijuana house Th...
2    MUMBAI The Narcotics Control Bureau NCB last t...
3    âTraffickersâ statement recordedâ Tollywood dr...
4    MUMBAI A Mumbai court Wednesday remanded Bolly...
Name: sentences, dtype: object

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentences
0,0,Agency move court invoke UAPA Suspecting accus...
1,1,Police found around 40 gram marijuana house Th...
2,2,MUMBAI The Narcotics Control Bureau NCB last t...
3,3,âTraffickersâ statement recordedâ Tollywood dr...
4,4,MUMBAI A Mumbai court Wednesday remanded Bolly...


In [6]:
# x_train,x_test = train_test_split(text,test_size=0.6,random_state=0)

# Data PreProcessing

In [7]:
#Removing Punctuations
def remove_punctuations(text):
    punctuations = '''<>/?:;"'{[-)(*&^%$#@!~`]}'"'''
    no_punct = ''
    for word in text:
        if(word not in punctuations):
            no_punct = no_punct+word
    return no_punct




In [8]:
data['sentences'].apply(remove_punctuations)

0     Agency move court invoke UAPA Suspecting accus...
1     Police found around 40 gram marijuana house Th...
2     MUMBAI The Narcotics Control Bureau NCB last t...
3     âTraffickersâ statement recordedâ Tollywood dr...
4     MUMBAI A Mumbai court Wednesday remanded Bolly...
                            ...                        
95    GUWAHATI Police recovered heroin weighing arou...
96    Mohali Police claimed arrested two habitual sn...
97    Bengaluru Central Crime Branch CCB police seiz...
98    Three narcotic smuggler arrested AntiNarcotics...
99    Hyderabad September 3 Actress Rakul Preet Sing...
Name: sentences, Length: 100, dtype: object

In [9]:
#Tokenization
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aastha78\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
#removing stopwords
from nltk.corpus import stopwords

In [11]:
stop_words = stopwords.words('english')

In [12]:
def remove_stopwords(text):
    words = nltk.word_tokenize(text)
    words = [w for w in words if not w in stop_words]
    return words

In [13]:
data = data['sentences'].apply(remove_stopwords)

In [14]:
data.to_csv('filtered_data2')

In [15]:
text = pd.read_csv('filtered_data2')
text.head()

Unnamed: 0.1,Unnamed: 0,sentences
0,0,"['Agency', 'move', 'court', 'invoke', 'UAPA', ..."
1,1,"['Police', 'found', 'around', '40', 'gram', 'm..."
2,2,"['MUMBAI', 'The', 'Narcotics', 'Control', 'Bur..."
3,3,"['âTraffickersâ', 'statement', 'recordedâ', 'T..."
4,4,"['MUMBAI', 'A', 'Mumbai', 'court', 'Wednesday'..."


In [16]:
#stemming
def stemmer(text):
    stemmer = PorterStemmer()
    text2 = ""
    for word in text:
        word = stemmer.stem(word)
        text2 = text2+word
    return text2
        
        

In [17]:
text['sentences'].apply(stemmer)

0     ['Agency', 'move', 'court', 'invoke', 'UAPA', ...
1     ['Police', 'found', 'around', '40', 'gram', 'm...
2     ['MUMBAI', 'The', 'Narcotics', 'Control', 'Bur...
3     ['âTraffickersâ', 'statement', 'recordedâ', 'T...
4     ['MUMBAI', 'A', 'Mumbai', 'court', 'Wednesday'...
                            ...                        
95    ['GUWAHATI', 'Police', 'recovered', 'heroin', ...
96    ['Mohali', 'Police', 'claimed', 'arrested', 't...
97    ['Bengaluru', 'Central', 'Crime', 'Branch', 'C...
98    ['Three', 'narcotic', 'smuggler', 'arrested', ...
99    ['Hyderabad', 'September', '3', 'Actress', 'Ra...
Name: sentences, Length: 100, dtype: object

In [18]:
x_train,x_test = train_test_split(text,test_size=0.6,random_state=0)

In [19]:
x_train.head()

Unnamed: 0.1,Unnamed: 0,sentences
10,10,"['Chandigarh', 'Punjab', 'Congress', 'presiden..."
31,31,"['Bijnor', 'A', '30yearold', 'man', 'friend', ..."
66,66,"['Patiala', 'Police', 'arrested', 'five', 'men..."
57,57,"['cannabis', 'smuggling', 'kitkat', 'chocolate..."
79,79,"['The', 'Enforcement', 'Directorate', 'ED', 'a..."


# Tf-Idf Vectorizer

In [20]:
# def tokenize(text):
#     tokens = [w for w in nltk.tokenize(text) if len(w)>3 and len(w.strip('Xx/')>2)]
#     return tokens

In [21]:
cv = TfidfVectorizer()

In [24]:
df = cv.fit_transform(x_train.sentences)


In [29]:
a = pd.DataFrame(df)

In [31]:
a.count()

0    40
dtype: int64

In [32]:
cv.get_feature_names()

['10',
 '100',
 '1000',
 '101',
 '101kg',
 '1030',
 '11',
 '114000',
 '1160kg',
 '12000',
 '120b',
 '13',
 '136',
 '14',
 '144th',
 '14day',
 '15',
 '150',
 '15000',
 '16',
 '161',
 '17',
 '18',
 '180',
 '18b',
 '18heera',
 '19',
 '1959',
 '1993',
 '1997',
 '1aa',
 '20',
 '20000',
 '2002',
 '201',
 '201112',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '21',
 '212',
 '2125',
 '215',
 '22',
 '22000',
 '23',
 '230',
 '233',
 '24',
 '240000',
 '25',
 '251b',
 '26',
 '27',
 '28',
 '29',
 '30',
 '30000',
 '302',
 '30day',
 '30yearold',
 '31',
 '35',
 '38',
 '39',
 '40',
 '400',
 '4000',
 '40yearold',
 '45',
 '45000',
 '48',
 '48kg',
 '49yearold',
 '4km',
 '50',
 '500',
 '50000',
 '50000police',
 '54',
 '55yearold',
 '570',
 '59',
 '5kg',
 '6000crore',
 '61',
 '611793',
 '76800',
 '85',
 '87kg',
 '90',
 '90250',
 '96',
 'absconding',
 'access',
 'accident',
 'accommodate',
 'accomplice',
 'according',
 'account',
 'accountant',
 'accused',
 'across',
 'act',
 'acted',
 'acting',
 'action',


# Training The Model

In [44]:
lda = decomposition.LatentDirichletAllocation(n_components=20,max_iter=3,learning_method='online',learning_offset=50,n_jobs=-1,random_state=111)

In [45]:
w1 = lda.fit_transform(df)


In [46]:
h1 = lda.components_

## Picking Top words for each Topics

In [47]:
num_words = 15
vocab = np.array(cv.get_feature_names())

In [48]:
top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_words-1:-1]]
topic_words = ([top_words(t) for t in h1])
topics = [' '.join(t) for t in topic_words]

In [57]:
topics

['handling surprised murder 1aa canada devanahalli hired lakhsawan accident arey 16 thomas currently thank popular',
 'include summoned 18 kamendra observation aggregator worked cinema during on motorcycle raj smoking cbic still',
 'rame resident drug 1000 ed trying nigerians police suspicious psychotropic thursday network sushant watch 15',
 'conducted vip it produced bail centre primarily nagar bassi originated unidentified boss identify gogian end',
 'madhya hoarding prem residence van the 1997 balaj singh plot originated disapproval identified savitri 76800',
 'living raj dealer tira airport serveall menace held checkpoint status utilised sushant 20 action one',
 'andro 114000 90 april prime attempt ahmad affidavit 101kg fir scheduled bassi deposit ipc efforts',
 'development mansukh edited brought intelligence chennai dealer bomb transfer south officers inaction vaccine drugpeddling wanted',
 'peddling citizen three zero remaining detail used involved scanner light presented impri

In [50]:
topic_words

[['handling',
  'surprised',
  'murder',
  '1aa',
  'canada',
  'devanahalli',
  'hired',
  'lakhsawan',
  'accident',
  'arey',
  '16',
  'thomas',
  'currently',
  'thank',
  'popular'],
 ['include',
  'summoned',
  '18',
  'kamendra',
  'observation',
  'aggregator',
  'worked',
  'cinema',
  'during',
  'on',
  'motorcycle',
  'raj',
  'smoking',
  'cbic',
  'still'],
 ['rame',
  'resident',
  'drug',
  '1000',
  'ed',
  'trying',
  'nigerians',
  'police',
  'suspicious',
  'psychotropic',
  'thursday',
  'network',
  'sushant',
  'watch',
  '15'],
 ['conducted',
  'vip',
  'it',
  'produced',
  'bail',
  'centre',
  'primarily',
  'nagar',
  'bassi',
  'originated',
  'unidentified',
  'boss',
  'identify',
  'gogian',
  'end'],
 ['madhya',
  'hoarding',
  'prem',
  'residence',
  'van',
  'the',
  '1997',
  'balaj',
  'singh',
  'plot',
  'originated',
  'disapproval',
  'identified',
  'savitri',
  '76800'],
 ['living',
  'raj',
  'dealer',
  'tira',
  'airport',
  'serveall',


In [59]:
topic_list = pd.DataFrame(topic_words)

In [60]:
topic_list

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,handling,surprised,murder,1aa,canada,devanahalli,hired,lakhsawan,accident,arey,16,thomas,currently,thank,popular
1,include,summoned,18,kamendra,observation,aggregator,worked,cinema,during,on,motorcycle,raj,smoking,cbic,still
2,rame,resident,drug,1000,ed,trying,nigerians,police,suspicious,psychotropic,thursday,network,sushant,watch,15
3,conducted,vip,it,produced,bail,centre,primarily,nagar,bassi,originated,unidentified,boss,identify,gogian,end
4,madhya,hoarding,prem,residence,van,the,1997,balaj,singh,plot,originated,disapproval,identified,savitri,76800
5,living,raj,dealer,tira,airport,serveall,menace,held,checkpoint,status,utilised,sushant,20,action,one
6,andro,114000,90,april,prime,attempt,ahmad,affidavit,101kg,fir,scheduled,bassi,deposit,ipc,efforts
7,development,mansukh,edited,brought,intelligence,chennai,dealer,bomb,transfer,south,officers,inaction,vaccine,drugpeddling,wanted
8,peddling,citizen,three,zero,remaining,detail,used,involved,scanner,light,presented,imprisonment,for,hiren,edited
9,special,encounter,maharashtra,the,net,gang,suresh,premium,all,55yearold,hear,bengaluru,flavour,gholumajra,courtthe


In [54]:
colnames = ["Topics"+ str(i) for i in range(lda.n_components)]
docnames = ["Doc" + str(i) for i in range(len(x_train.sentences))]
df_doc_topics = pd.DataFrame(np.round(w1,2), columns = colnames,index = docnames)


In [55]:
significant_topic = np.argmax(df_doc_topics.values,axis=1)
df_doc_topics['dominant_topic'] = significant_topic

In [56]:
df_doc_topics

Unnamed: 0,Topics0,Topics1,Topics2,Topics3,Topics4,Topics5,Topics6,Topics7,Topics8,Topics9,...,Topics11,Topics12,Topics13,Topics14,Topics15,Topics16,Topics17,Topics18,Topics19,dominant_topic
Doc0,0.0,0.0,0.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
Doc1,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.01,0.01,0.9,0.01,0.01,0.01,0.01,0.01,14
Doc2,0.0,0.0,0.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
Doc3,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.79,19
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
Doc5,0.0,0.0,0.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
Doc6,0.0,0.0,0.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
Doc7,0.01,0.01,0.9,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,2
Doc8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.91,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
Doc9,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.89,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,9
