# Installing the following Libraries


In [32]:
!pip install polyglot
!pip install pyicu
!pip install pycld2
!pip install morfessor
!pip install wordcloud



Importing process 

In [33]:
import re
import numpy as np
import pandas as pd
import folium
import nltk
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import polyglot

from os import path
from itertools import cycle
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords as sw
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from collections import Counter
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
from polyglot.text import Text
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

Downloading nltk files

In [34]:
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

!polyglot download embeddings2.en
!polyglot download ner2.en

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[polyglot_data] Downloading package embeddings2.en to
[polyglot_data]     /root/polyglot_data...
[polyglot_data]   Package embeddings2.en is already up-to-date!
[polyglot_data] Downloading package ner2.en 

# Data Preprocessing

In [35]:
def category_desc_to_txt(df, category_list):
  
  #Collect all the descriptions in a text file, one per category(11)
  
  text = ['']*len(category_list)
  df_filtered_list = []
  counter = 0

  for category in category_list:
    df_filtered_list.append(df_train[df_train['CATEGORY'].str.contains(category)==True])

  for index in range(len(df_filtered_list)):
    for description in df_filtered_list[index].DESCRIPTION:
      text[index] = text[index] + ' ' + str(description)
      counter+=1
      if counter == int(report_1.shape[0]*split_rate):
        break

  category_list_names = []
  bad_chars = ['/', ' '] 

  for category in category_list:
    for i in bad_chars:
      category = category.replace(i, '')
    category_list_names.append(category.replace(i, ''))   

  for category, description in zip(category_list_names, text):
    f = open(category+'.txt',"w+")
    f.write(description)
    
  return category_list_names

In [36]:
#Read dataset and remove NA values
report_1 = pd.read_csv('safecity_reports_28072019.csv')

#Check the number of rows without a location (value==NaN)
print('Shape of df : ', report_1.shape)
print('\nDropping rows without description or location or category.....\n')
report_1 = report_1[pd.notna(report_1['DESCRIPTION'])]
report_1 = report_1[pd.notna(report_1['LONGITUDE'])]
report_1 = report_1[pd.notna(report_1['CATEGORY'])]
print('New shape of df : ', report_1.shape)

categories=[]
for index, row in report_1.iterrows():
    cat_group_list = [i for i in row.CATEGORY.split(',')]
    del(cat_group_list[-1])
    for category in cat_group_list:
      category = [category.lstrip()]
      if category not in categories:
        categories.append(category) #add a new category
      else:
        next
    #replace in the original df with the list. iat[] is to set a single row and 5 is the CATEGORY column index    
    if index == 11604:
      break
    report_1.iat[index, 5] = cat_group_list
    
print("Total number of unique categories : {}\n".format(len(categories)))
counts = [0] * len(categories)
category_list = [cat[0] for cat in categories]

#Split in train and test and create a collection of descriptions for each category
split_rate = 0.8
df_train = report_1.iloc[0:int(report_1.shape[0]*split_rate), :].copy()
df_test = report_1.iloc[int(report_1.shape[0]*split_rate)+1:,: ].copy()
fileNames = category_desc_to_txt(df_train, category_list) 

Shape of df :  (12122, 9)

Dropping rows without description or location or category.....

New shape of df :  (11604, 9)
Total number of unique categories : 14



# Tokenization,Lemmization and stemming Process

In [37]:
def top20_and_allTokens(filename, stemming=True, lemmatization=True, plot=True):
  
  lemmatizer = WordNetLemmatizer()
  stemmer = PorterStemmer()
  category = open(filename, 'r')
  content = category.read().lower()
  pattern_wd_eng = (r'[A-Za-z]+') 
  
  print('\n', filename)
  
  #Tokenization
  tokens = re.findall(pattern_wd_eng, content)
  no_stops = [t for t in tqdm(tokens) if t not in sw.words('english')]
  
  print('Words and digits eng :\n', re.findall(pattern_wd_eng, content))
  print('\nSet() unique words and digits eng :\n', set(re.findall(pattern_wd_eng, content)))
  print('\nAfter nostop filter :\n', no_stops)
  
  top_20_a = Counter(no_stops).most_common(20)
  print('\nTop 20 most used words after non_stop:\n', top_20_a)
  
  #Lemmatization
  lemmatized = [lemmatizer.lemmatize(word, pos="v") for word in no_stops]
  print('\nAfter lemmatizer :\n', lemmatized) if lemmatization else None
  top_20_b = Counter(lemmatized).most_common(20)
  print('\nTop 20 most used words after lemmatizer:\n', top_20_b) if lemmatization else None
  
  #Stemming
  stemmed = [stemmer.stem(word) for word in lemmatized]
  print('\nAfter stemmer :\n', stemmed) if stemming else None
  top_20 = Counter(stemmed).most_common(20)
  print('\nTop 20 most used words after stemmer:\n', top_20) if stemming else None
  
  top20_token = [word[0] for word in top_20]
  top20_count = [word[1] for word in top_20]
  
  if plot:
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 5)
    plt.bar(labels, values, color='g')
    plt.show()
    
  total_tokens = lemmatized if lemmatization else no_stops
  total_tokens = stemmed if stemming else total_tokens
  
  return total_tokens

Tf_idf

In [38]:
def top20_tf_idf(category_tokens):
  
  # Build a dicitionary with all the unique tokens merging all categories
  dictionary = Dictionary(category_tokens)

  # Corpus: List(7 items) of list(for each collection) of tuples of word_id and count
  corpus = [dictionary.doc2bow(collection) for collection in category_tokens]

  print('\ncorpus: {}'.format(len(corpus)))
  
  # Create a new TfidfModel using the corpus: tfidf
  tfidf = TfidfModel(corpus)

  print('Top_20 after Term Frequency–Inverse Document Frequency')  
  
  for index, tokens_and_Cnt in enumerate(corpus):
    print('\n', fileNames[index])
    
  # Calculate the tfidf weights of doc: tfidf_weights
    tfidf_weights = tfidf[tokens_and_Cnt]

  # Sort the weights from highest to lowest: sorted_tfidf_weights
    sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

  # Print the top 5 weighted words
    for term_id, weight in sorted_tfidf_weights[0:20]:
      print(dictionary.get(term_id), weight)
      
  return corpus, dictionary, tfidf

In [39]:
#First I want to pass the list in CATEGORY to a vector of 1s and 0s
categories_bool = [[]]
category_bool = [0]*len(categories)

for index, row in report_1.iterrows():
  row_category = row['CATEGORY']
  for category in row_category:
    index_match = list(filter(lambda x: categories[x][0] == category.lstrip(), range(len(categories))))
    if len(index_match) == 1:
      category_bool[index_match[0]] = 1 if len(index_match) != 0 else None
  categories_bool.append(category_bool)
  category_bool = [0]*len(categories)
del(categories_bool[0])

#Expand the column CATEGORY by its boolean representation
category_list = [cat[0] for cat in categories]
df_categories = pd.DataFrame.from_records(categories_bool)
df_categories.columns = category_list
df_ready = pd.concat([report_1, df_categories], axis=1)

In [40]:
df_ready.head()

Unnamed: 0,#,INCIDENT TITLE,INCIDENT DATE,LOCATION,DESCRIPTION,CATEGORY,LATITUDE,LONGITUDE,More Info,Touching /Groping,Catcalls/Whistles,Sexual Invites,Stalking,Others,Commenting,Rape / Sexual Assault,North East India Report,Indecent exposure,Chain Snatching,Ogling/Facial Expressions/Staring,Taking pictures,Poor / No Street Lighting,Online Harassment
0,12152.0,touching,27-05-2019 15:00,"Chanakya Puri, Danapur Nizamat, Patna, Bihar 8...",My school auto driver always use to stare at m...,[Touching /Groping],25.625485,85.055803,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12149.0,whistling,17-05-2019 19:00,"Near Nissan Showroom, Bailey Rd, Patna, Bihar ...",I was in market with my mom. There I saw a man...,[Catcalls/Whistles],25.619769,85.045596,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12213.0,sexual invite,15-05-2019 15:03,"Makina, Nairobi, Kenya",there is a girl whom boys have been writing a ...,[Sexual Invites],-1.308482,36.789595,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12199.0,sexual invite,14-05-2019 14:13,"Makina, Nairobi, Kenya",there is a girl who is being disturbed y a bo...,[Sexual Invites],-1.308482,36.789595,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12206.0,sexual invite,13-05-2019 14:45,"Makina, Nairobi, Kenya",there are some boys who likes disturbing a gir...,[Sexual Invites],-1.308482,36.789595,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Loading the Data

In [41]:
#Load the data version
report_1 = pd.read_csv('safecity_reports_07082019.csv')
report_1 = report_1[pd.notna(report_1['DESCRIPTION'])]

x_report_1 = report_1[report_1.columns[4]]
y_report_1 = report_1[report_1.columns[14:28]]
X_train, X_test, y_train, y_test = train_test_split(x_report_1, y_report_1, test_size=0.2, random_state=17)
categories = y_train.columns

In [42]:
#CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
X_train = X_train[pd.notna(X_train)]
count_train = count_vectorizer.fit_transform(X_train.values)
X_test = X_test[pd.notna(X_test)]
count_test = count_vectorizer.transform(X_test.values)

#TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [43]:
print(X_train.values[0])
print(count_train[0])

two boys commented on me and my sister while we were returning home from the market
  (0, 1144)	1
  (0, 1687)	1
  (0, 7208)	1
  (0, 6563)	1
  (0, 3616)	1
  (0, 4839)	1


In [44]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

pattern_wd_eng = (r'[A-Za-z]+') 

lowers = [entry.lower() for entry in X_train.values] #1

def no_stopwords(dt_list, option=1):
  tokens = [re.findall(pattern_wd_eng, t) for t in dt_list] if option==1 else [word_tokenize(entry) for entry in dt_list]
  no_stops = [t for t in tokens if t not in sw.words('english')]
  return no_stops

In [45]:
no_stops = no_stopwords(lowers) 
alternative = no_stopwords(lowers, option=2)

In [46]:
lemm = []
stemm = []
for category in tqdm(range(len(no_stops))):
  lemmatized = [lemmatizer.lemmatize(word, pos="v") for word in no_stops[category]] #2
  lemm.append(lemmatized)
  stemmed = [stemmer.stem(word) for word in lemmatized] #3
  stemm.append(stemmed)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/8844 [00:00<?, ?it/s]

In [47]:
from sklearn.metrics import precision_recall_fscore_support
import joblib

category_list_save_model = ['TouchingGroping','CatcallsWhistles', 'SexualInvites', 'Stalking', 'Others', 'Commenting',
                 'RapeSexualAssault', 'NorthEastIndiaReport', 'IndecentExposure', 'ChainSnatching', 
                 'OglingFacialExpressionsStaring', 'TakingPictures', 'PoorNoStreetLighting', 'OnlineHarassment']

def execute(pipeline, X_train=X_train, X_test=X_test, confusion_matrix=False, verbose=True):

  accuracies=[]
  
  for category, cat_name in (tqdm(zip(categories, category_list_save_model)) if verbose else zip(categories, category_list_save_model)):
    pipeline.fit(X_train, y_train[category])
    prediction = pipeline.predict(X_test)
    
    if len(X_test) == 1:
      print('Prediction for {} is {}'.format(category, prediction)) if verbose else None
    else:
      print('Test accuracy for {} is {}'.format(category, metrics.accuracy_score(y_test[category], prediction))) if verbose else None
      accuracies.append(metrics.accuracy_score(y_test[category], prediction))
      print(metrics.confusion_matrix(y_test[category], prediction, labels=[0,1])) if confusion_matrix else None
      print('precision_recall_fscore_support_weighted', precision_recall_fscore_support(y_test[category], prediction, average='weighted')) if verbose else None

    filename = 'model_'+cat_name+'.sav'
    joblib.dump(pipeline, filename)
  print('mean: ', sum(accuracies)/len(accuracies)) if verbose and len(accuracies)!=0 else None
  return accuracies

def execute_mean(pipeline, X_train=X_train, X_test=X_test, confusion_matrix=False, verbose=True):

  accuracies=[]
  for category in (tqdm(categories) if verbose else categories):
    pipeline.fit(X_train, y_train[category])
    prediction = pipeline.predict(X_test)
    
    if len(X_test) == 1:
      print('Prediction for {} is {}'.format(category, prediction)) if verbose else None
    else:
      print('Test accuracy for {} is {}'.format(category, metrics.accuracy_score(y_test[category], prediction))) if verbose else None
      accuracies.append(metrics.accuracy_score(y_test[category], prediction))
      print(metrics.confusion_matrix(y_test[category], prediction, labels=[0,1])) if confusion_matrix else None
      print('precision_recall_fscore_support_weighted', precision_recall_fscore_support(y_test[category], prediction, average='weighted')) if verbose else None
  
  mean = sum(accuracies)/len(accuracies)
  print('mean: ', mean) if verbose else None
  return accuracies, mean

In [48]:
print(count_train.shape)
print(y_train.shape)
y_train.head()

(8844, 8894)
(8844, 14)


Unnamed: 0,Touching /Groping,Catcalls/Whistles,Sexual Invites,Stalking,Others,Commenting,Rape / Sexual Assault,North East India Report,Indecent Exposure/Masturbation in public,Chain Snatching,Ogling/Facial Expressions/Staring,Taking pictures,Poor / No Street Lighting,Online Harassment
3742,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2876,0,0,0,1,0,0,0,0,0,0,0,0,0,0
7655,0,1,0,0,1,1,0,0,0,0,1,0,0,0
6048,0,0,0,1,0,1,0,0,0,0,0,0,1,0
4403,0,0,0,0,0,0,1,0,0,0,0,0,0,0


# Multinomial Naive Bayes algorithm

In [49]:
#Initial test for one category, CountVectorizer()
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train.iloc[:,0])
pred = nb_classifier.predict(count_test)
score = metrics.accuracy_score(y_test.iloc[:,0], pred)
print('Test accuracy is {}'.format(score))
metrics.confusion_matrix(y_test.iloc[:,0], pred)

Test accuracy is 0.8611488014473089


array([[1497,   80],
       [ 227,  407]])

In [50]:
#Initial test for one category, TfidfVectorizer()
nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_train, y_train.iloc[:,0])
pred = nb_classifier.predict(tfidf_test)
score = metrics.accuracy_score(y_test.iloc[:,0], pred)
print('Test accuracy is {}'.format(score))
metrics.confusion_matrix(y_test.iloc[:,0], pred, labels=[0,1])

Test accuracy is 0.80958842152872


array([[1552,   25],
       [ 396,  238]])

In [51]:
#Inspecting the model
# Get the class labels: class_labels
class_labels = nb_classifier.classes_
# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names()
# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))
print(class_labels, feat_with_weights[300:350])

[0 1] [(-9.687577285651056, 'aren'), (-9.687577285651056, 'arena'), (-9.687577285651056, 'ares'), (-9.687577285651056, 'arested'), (-9.687577285651056, 'arguing'), (-9.687577285651056, 'arival'), (-9.687577285651056, 'armbushed'), (-9.687577285651056, 'armed'), (-9.687577285651056, 'army'), (-9.687577285651056, 'arre'), (-9.687577285651056, 'arrest'), (-9.687577285651056, 'arrested'), (-9.687577285651056, 'arrival'), (-9.687577285651056, 'arrives'), (-9.687577285651056, 'arriving'), (-9.687577285651056, 'arterial'), (-9.687577285651056, 'article'), (-9.687577285651056, 'artid'), (-9.687577285651056, 'artificial'), (-9.687577285651056, 'arts'), (-9.687577285651056, 'aruvathimuvar'), (-9.687577285651056, 'asgarali'), (-9.687577285651056, 'ashok'), (-9.687577285651056, 'asians'), (-9.687577285651056, 'aside'), (-9.687577285651056, 'asif'), (-9.687577285651056, 'aso'), (-9.687577285651056, 'aspx'), (-9.687577285651056, 'assassinating'), (-9.687577285651056, 'assaulter'), (-9.68757728565105

In [52]:
#Pipelines
NB_pipeline_CountV = Pipeline([('count', CountVectorizer(stop_words='english')), 
                        ('clf', OneVsRestClassifier(MultinomialNB())),])
NB_pipeline_TfidfV = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                        ('clf', OneVsRestClassifier(MultinomialNB())),])
NB_pipeline_CountV2 = Pipeline([('count2', CountVectorizer(stop_words='english')), 
                        ('clf', MultiOutputClassifier(MultinomialNB())),]) #do tuning for this one as well

In [53]:
#For OneVsRestClasifier strategy
from sklearn.metrics import precision_recall_fscore_support
import joblib

category_list_save_model = ['TouchingGroping','CatcallsWhistles', 'SexualInvites', 'Stalking', 'Others', 'Commenting',
                 'RapeSexualAssault', 'NorthEastIndiaReport', 'IndecentExposure', 'ChainSnatching', 
                 'OglingFacialExpressionsStaring', 'TakingPictures', 'PoorNoStreetLighting', 'OnlineHarassment']

def execute(pipeline, X_train=X_train, X_test=X_test, confusion_matrix=False, verbose=True):

  accuracies=[]
  
  for category, cat_name in (tqdm(zip(categories, category_list_save_model)) if verbose else zip(categories, category_list_save_model)):
    pipeline.fit(X_train, y_train[category])
    prediction = pipeline.predict(X_test)
    
    if len(X_test) == 1:
      print('Prediction for {} is {}'.format(category, prediction)) if verbose else None
    else:
      print('Test accuracy for {} is {}'.format(category, metrics.accuracy_score(y_test[category], prediction))) if verbose else None
      accuracies.append(metrics.accuracy_score(y_test[category], prediction))
      print(metrics.confusion_matrix(y_test[category], prediction, labels=[0,1])) if confusion_matrix else None
      print('precision_recall_fscore_support_weighted', precision_recall_fscore_support(y_test[category], prediction, average='weighted')) if verbose else None

    filename = 'model_'+cat_name+'.sav'
    joblib.dump(pipeline, filename)
  print('mean: ', sum(accuracies)/len(accuracies)) if verbose and len(accuracies)!=0 else None
  return accuracies

def execute_mean(pipeline, X_train=X_train, X_test=X_test, confusion_matrix=False, verbose=True):

  accuracies=[]
  for category in (tqdm(categories) if verbose else categories):
    pipeline.fit(X_train, y_train[category])
    prediction = pipeline.predict(X_test)
    
    if len(X_test) == 1:
      print('Prediction for {} is {}'.format(category, prediction)) if verbose else None
    else:
      print('Test accuracy for {} is {}'.format(category, metrics.accuracy_score(y_test[category], prediction))) if verbose else None
      accuracies.append(metrics.accuracy_score(y_test[category], prediction))
      print(metrics.confusion_matrix(y_test[category], prediction, labels=[0,1])) if confusion_matrix else None
      print('precision_recall_fscore_support_weighted', precision_recall_fscore_support(y_test[category], prediction, average='weighted')) if verbose else None
  
  mean = sum(accuracies)/len(accuracies)
  print('mean: ', mean) if verbose else None
  return accuracies, mean

In [54]:
#For MultOutputClasifier strategy
def execute2(pipeline, X_train=X_train, X_test=X_test, confusion_matrix=False, verbose=True):
  
  accuracies=[]
  pipeline.fit(X_train, y_train)
  y_pred = pipeline.predict(X_test)
      
  if confusion_matrix:
    y_test_list = y_test.values.tolist()
    y_test_array = np.array(y_test_list)
    y_pred_list = y_pred.tolist()
    y_pred_array = np.array(y_pred_list)
    conf_mat = metrics.multilabel_confusion_matrix(y_test_array, y_pred_array)
    print('Test accuracy is {}'.format(metrics.accuracy_score(y_test_array, y_pred_array)))
    print(conf_mat)
      
  return accuracies

In [55]:
acc = execute(NB_pipeline_CountV, confusion_matrix=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


0it [00:00, ?it/s]

Test accuracy for Touching /Groping is 0.8611488014473089
precision_recall_fscore_support_weighted (0.8589813317516827, 0.8611488014473089, 0.855136478107857, None)
Test accuracy for Catcalls/Whistles is 0.8132066938037087
precision_recall_fscore_support_weighted (0.7995591486149929, 0.8132066938037087, 0.8016860392788738, None)
Test accuracy for Sexual Invites is 0.8941655359565808
precision_recall_fscore_support_weighted (0.8645159629160971, 0.8941655359565808, 0.8690519930007954, None)
Test accuracy for Stalking is 0.9199457259158752
precision_recall_fscore_support_weighted (0.8983651218820121, 0.9199457259158752, 0.9006944884433291, None)
Test accuracy for Others is 0.8891904115784712
precision_recall_fscore_support_weighted (0.864232728825344, 0.8891904115784712, 0.8591009864459133, None)
Test accuracy for Commenting is 0.7815468113975577
precision_recall_fscore_support_weighted (0.7790976814997338, 0.7815468113975577, 0.7794759101031481, None)
Test accuracy for Rape / Sexual Assa

  _warn_prf(average, modifier, msg_start, len(result))


mean:  0.9114169412676875


In [56]:
acc = execute2(NB_pipeline_CountV2, confusion_matrix=True) #the sme onevsrest

Test accuracy is 0.30574400723654455
[[[1497   80]
  [ 227  407]]

 [[1559  133]
  [ 280  239]]

 [[1940   38]
  [ 196   37]]

 [[2002   27]
  [ 150   32]]

 [[1925   26]
  [ 219   41]]

 [[1166  206]
  [ 277  562]]

 [[2067   25]
  [  87   32]]

 [[2200    3]
  [   8    0]]

 [[2063    8]
  [ 120   20]]

 [[2116   11]
  [  36   48]]

 [[1653   81]
  [ 311  166]]

 [[2027   12]
  [ 113   59]]

 [[2119   11]
  [  55   26]]

 [[2209    2]
  [   0    0]]]


In [57]:
acc = execute(NB_pipeline_TfidfV)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


0it [00:00, ?it/s]

Test accuracy for Touching /Groping is 0.80958842152872
precision_recall_fscore_support_weighted (0.8277488609111777, 0.80958842152872, 0.7802314627029168, None)
Test accuracy for Catcalls/Whistles is 0.8018995929443691
precision_recall_fscore_support_weighted (0.8329350935745726, 0.8018995929443691, 0.7430148100895752, None)
Test accuracy for Sexual Invites is 0.8959746720940751
precision_recall_fscore_support_weighted (0.8773441346781542, 0.8959746720940751, 0.8498069032689688, None)
Test accuracy for Stalking is 0.919041157847128
precision_recall_fscore_support_weighted (0.909489804746973, 0.919041157847128, 0.8824182316545488, None)
Test accuracy for Others is 0.8864767073722297
precision_recall_fscore_support_weighted (0.899416919202189, 0.8864767073722297, 0.836943811991307, None)
Test accuracy for Commenting is 0.7697874265038445
precision_recall_fscore_support_weighted (0.7817173285583136, 0.7697874265038445, 0.7533023441468268, None)
Test accuracy for Rape / Sexual Assault is 

  _warn_prf(average, modifier, msg_start, len(result))


Test accuracy for North East India Report is 0.9963817277250113
precision_recall_fscore_support_weighted (0.9927765473442786, 0.9963817277250113, 0.9945758704930674, None)


  _warn_prf(average, modifier, msg_start, len(result))


Test accuracy for Indecent Exposure/Masturbation in public is 0.937584803256445
precision_recall_fscore_support_weighted (0.9414839871181973, 0.937584803256445, 0.9082625184653872, None)
Test accuracy for Chain Snatching is 0.9674355495251018
precision_recall_fscore_support_weighted (0.9685017798271448, 0.9674355495251018, 0.9554948746370279, None)
Test accuracy for Ogling/Facial Expressions/Staring is 0.798733604703754
precision_recall_fscore_support_weighted (0.8085067501314411, 0.798733604703754, 0.7269267247531233, None)
Test accuracy for Taking pictures is 0.9253731343283582
precision_recall_fscore_support_weighted (0.9309599913319068, 0.9253731343283582, 0.892429099628566, None)
Test accuracy for Poor / No Street Lighting is 0.9656264133876075
precision_recall_fscore_support_weighted (0.9668106348665476, 0.9656264133876075, 0.9507393289514605, None)
Test accuracy for Online Harassment is 1.0
precision_recall_fscore_support_weighted (1.0, 1.0, 1.0, None)
mean:  0.9014343865090133


# Testing the Model

Test case 1

In [58]:
test_string = [""""He touched me in private areas, and accussed me to do sexual things"""]

acc = execute(NB_pipeline_CountV, X_test = test_string)
acc = execute(NB_pipeline_TfidfV,  X_test = test_string)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


0it [00:00, ?it/s]

Prediction for Touching /Groping is [1]
Prediction for Catcalls/Whistles is [0]
Prediction for Sexual Invites is [0]
Prediction for Stalking is [0]
Prediction for Others is [0]
Prediction for Commenting is [0]
Prediction for Rape / Sexual Assault is [0]
Prediction for North East India Report is [0]
Prediction for Indecent Exposure/Masturbation in public is [0]
Prediction for Chain Snatching is [0]
Prediction for Ogling/Facial Expressions/Staring is [0]
Prediction for Taking pictures is [0]
Prediction for Poor / No Street Lighting is [0]
Prediction for Online Harassment is [0]


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


0it [00:00, ?it/s]

Prediction for Touching /Groping is [1]
Prediction for Catcalls/Whistles is [0]
Prediction for Sexual Invites is [0]
Prediction for Stalking is [0]
Prediction for Others is [0]
Prediction for Commenting is [0]
Prediction for Rape / Sexual Assault is [0]
Prediction for North East India Report is [0]
Prediction for Indecent Exposure/Masturbation in public is [0]
Prediction for Chain Snatching is [0]
Prediction for Ogling/Facial Expressions/Staring is [0]
Prediction for Taking pictures is [0]
Prediction for Poor / No Street Lighting is [0]
Prediction for Online Harassment is [0]


Test case 2

In [59]:
test_string = [""" He is stalking , sexual assualt and taking pictures"""]

acc = execute(NB_pipeline_CountV, X_test = test_string)
acc = execute(NB_pipeline_TfidfV,  X_test = test_string)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


0it [00:00, ?it/s]

Prediction for Touching /Groping is [0]
Prediction for Catcalls/Whistles is [0]
Prediction for Sexual Invites is [0]
Prediction for Stalking is [0]
Prediction for Others is [0]
Prediction for Commenting is [0]
Prediction for Rape / Sexual Assault is [0]
Prediction for North East India Report is [0]
Prediction for Indecent Exposure/Masturbation in public is [0]
Prediction for Chain Snatching is [0]
Prediction for Ogling/Facial Expressions/Staring is [0]
Prediction for Taking pictures is [1]
Prediction for Poor / No Street Lighting is [0]
Prediction for Online Harassment is [0]


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


0it [00:00, ?it/s]

Prediction for Touching /Groping is [0]
Prediction for Catcalls/Whistles is [0]
Prediction for Sexual Invites is [0]
Prediction for Stalking is [0]
Prediction for Others is [0]
Prediction for Commenting is [0]
Prediction for Rape / Sexual Assault is [0]
Prediction for North East India Report is [0]
Prediction for Indecent Exposure/Masturbation in public is [0]
Prediction for Chain Snatching is [0]
Prediction for Ogling/Facial Expressions/Staring is [0]
Prediction for Taking pictures is [0]
Prediction for Poor / No Street Lighting is [0]
Prediction for Online Harassment is [0]
