### Loading the data

In [1]:
# Required libraries

import pandas as pd
import numpy as np
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
from collections import Counter
import en_core_web_sm
import geotext
from geotext import GeoText
%matplotlib inline

In [2]:
# Import data
df = pd.read_csv("dataset_parse_drop.csv", index_col = [0])

In [3]:
# Columns
df.columns

Index(['Title', 'Date', 'Station', 'Text', 'Tags'], dtype='object')

In [4]:
# No of articles
len(df)

6374

In [5]:
# Check for null values
df.isna().sum()

Title      0
Date       5
Station    1
Text       0
Tags       0
dtype: int64

In [6]:
# Deal with null values
df['Station'].fillna(df['Station'].mode(), inplace = True)
df['Date'].fillna("Unknown", inplace = True)

In [7]:
# Remove punctuations and stop words

df["Processed_Text"] = df["Text"].str.replace("[^\w\s]"," ")
df["Processed_Text"] = df["Processed_Text"].str.replace(pat = "[\s]+", repl = " ", regex = True)
df["Processed_Text"] = df["Processed_Text"].apply(lambda row: word_tokenize(row))
stopwords = nltk.corpus.stopwords.words("english")
df["Processed_Text"] = df["Processed_Text"].apply(lambda x: [item for item in x if item not in stopwords])
df["Processed_Text"] = df["Processed_Text"].apply(lambda x: ' '.join([item for item in x]))

### Extraction of People

In [8]:
# Extract people involved in each article

def find_people(document):
    nlp = en_core_web_sm.load()
    tokens = nlp(document)
    all_people = []
    people = []
    
    for ent in tokens.ents:
        if ent.label_ == 'PERSON':
            all_people.append(ent.text)
    all_people = list(set(all_people))
    
    for person in all_people:
        if " " in person:
            people.append(person)
    
    if len(people) == 0:
        return "Unknown"
    return people

df["People"] = df["Processed_Text"].apply(lambda row: find_people(row))

In [9]:
df["People"].iloc[0]

['Marie Hamilton',
 'Armida Castro',
 'Jose Alejandro Navarette Castro',
 'Adam Barcenas']

### Extraction of Places

In [10]:
# Extract places involved in each article

def find_places(document):
    places = GeoText(document)
    cities = list(set(places.cities))

    if len(cities) == 0:
        return "Unknown"
    return cities

df["Places"] = df["Processed_Text"].apply(lambda row: find_places(row))

In [11]:
df["Places"].iloc[0]

['Castro', 'Oxnard']

### Extraction of Descriptions

In [12]:
# Tag parts of speech to words 
def preprocess(text):
    text = text.replace(".", ". ")
    text = nltk.word_tokenize(text)
    text = nltk.pos_tag(text)
    return text

# Extract descriptions based on a pattern
def find_descriptions(document):
    text = preprocess(document)
    pattern = 'NP: {<DT>?<JJ>+<NNP>*<NN>+}'
    chunker = nltk.RegexpParser(pattern)
    tagged_phrases = str(chunker.parse(text))
    tagged_phrases = tagged_phrases.split('\n')
    
    chunks = []
    for string in tagged_phrases:
        if '(NP' in string and 'NN)' in string:
            chunks.append(string)
    chunks = [x.replace('(NP', '').replace('/DT', '').replace('/NN)', '').replace('/NN', '').replace('/JJ', '').replace('/NNP', '').replace('   ', '') for x in chunks]
    
    document = document.replace(".", ". ")
    sentences = sent_tokenize(document)
    for sentence in sentences:
        if "describe" in sentence:
            chunks.append(sentence)
            
    return chunks

df['Descriptions'] = df['Text'].apply(lambda row: find_descriptions(row))

In [13]:
df['Descriptions'].iloc[14]

['a domestic violence incident',
 'a tumultuous dating relationship',
 'domestic violence',
 'initial investigation',
 'the rear cargo hatch',
 'domestic violence',
 'a white female',
 'brown hair',
 'a white male',
 'brown hair',
 'Anonymous information',
 'Custer is described as a white female, 5 feet 8 inches tall, weighing 140 pounds, with brown hair and green eyes.',
 'Camou is described as a white male, 6 feet tall, weighing 150 pounds, with brown hair and brown eyes.']

### Classification

In [14]:
# Perform lemmatization

lemmatizer = WordNetLemmatizer()

def lemmatize_word(tagged_token):
    """ Returns lemmatized word given its tag"""
    root = []
    for token in tagged_token:
        tag = token[1][0]
        word = token[0]
        if tag.startswith('J'):
            root.append(lemmatizer.lemmatize(word, wordnet.ADJ))
        elif tag.startswith('V'):
            root.append(lemmatizer.lemmatize(word, wordnet.VERB))
        elif tag.startswith('N'):
            root.append(lemmatizer.lemmatize(word, wordnet.NOUN))
        elif tag.startswith('R'):
            root.append(lemmatizer.lemmatize(word, wordnet.ADV))
        else:          
            root.append(word)
    return root

def lemmatize_doc(document):
    """ Tags words then returns sentence with lemmatized words"""
    lemmatized_list = []
    tokenized_sentence = sent_tokenize(document)
    for sentence in tokenized_sentence:
        tokenized_word = word_tokenize(sentence)
        tagged_token = pos_tag(tokenized_word)
        lemmatized = lemmatize_word(tagged_token)
        lemmatized_list.extend(lemmatized)
    return " ".join(lemmatized_list)

df["Processed_Text"] = df["Processed_Text"].apply(lambda row: lemmatize_doc(row))

In [15]:
# Create vocabulary to differentiate violent and non-violent crimes 
vocabulary = ["attack", "attacked", "attacking", "armed", "gun", "guns", "abduct", "abducted", "abducting", "kidnap", "kidnapped", "kidnapping", "shoot", "shot", "shooting", "kill", "killed", "killing", "murder", "murdered", "murdering", "assault", "assaulting", "assaulted", "beat", "beating", "kick", "kicked", "kicking"]

In [16]:
# Classification

def find_violence(document):
    return any(word in document for word in vocabulary)

df["Violent_Crime"] = df["Processed_Text"].apply(lambda row: find_violence(row))

In [17]:
df["Processed_Text"].iloc[0]

'OXNARD Calif KABC A transient smile attack 71 year old grandmother 80 year old man sentence six year prison Wednesday The grandmother Armida Castro die week hospitalize injury In emotional hearing Ventura County Superior Court Castro family ask strong penalty 56 year old defendant Adam Barcenas We hop see heinous act beat kick helpless ail woman This merely abuse This evil cowardly act say Castro son law Jose Alejandro Navarette Castro family understand Barcenas try homicide According medical examiner Castro die eight day attack blood clot cause injury But opinion conclusive The Ventura County District Attorney say Castro pre exist medical condition put risk strong evidence need prove beyond reasonable doubt Barcenas cause death Instead face potential life sentence Barcenas plead guilty two elder abuse felony misdemeanor resist arrest sentence six year state prison Barcenas suffers mental illness stabilizes treatment accord report cite court His sentence would less six year Barcenas p

In [18]:
df["Violent_Crime"].iloc[0]

True

### Violent and Non-violent Crimes

In [19]:
df_violent = df[df['Violent_Crime'] == True]
df_non_violent = df[df['Violent_Crime'] == False]

In [20]:
df_violent['Text'].iloc[10]

'MONROVIA, Calif. (KABC) -- Authorities are trying to find a woman they believe was kidnapped following a domestic violence incident in Monrovia.Investigators are trying to find Amanda Kathleen Custer, 31, after they say they found indications she had been assaulted and then placed in the back of a car in the 600 block of Vaquero Road Monday morning.The suspect has been identified as Robert Anthony Camou, 27.Investigators say Custer and Camou had been in a tumultuous dating relationship over the last two years, one marked by multiple allegations of domestic violence.Detectives believe Custer was kidnapped in a 2017 gray Toyota Prius, California license plate 8AOR167.Officers found blood at the scene and accounts from witnesses that Custer was placed in the back of the Prius."Officers learned during their initial investigation this morning that there was blood at the scene," said Lt. Scott Hoglund with the Los Angeles County Sheriff\'s Department, which is assisting Monrovia police. "Th

In [21]:
df_violent['Text'].iloc[100]

'HAWTHORNE, Calif.  (KABC) -- A woman is dead and a man seriously injured after a hit-and-run crash Saturday night in Hawthorne, authorities say.Police say the two victims were walking around 1:45 a.m. along Prairie Avenue, near El Segundo Boulevard, when a driver hit them and took off.Evidence at the scene led police to a home in Hawthorne, where they arrested a suspect for driving under the influence and hit-and-run.Another driver struck a Hawthorne police car during that investigation.Nobody was hurt in that crash and police say driving under the influence was not the cause for the second crash.'

In [22]:
df_non_violent['Text'].iloc[10]

'NORTHRIDGE, Calif.  (KABC) -- A search is underway for three burglary suspects who got away after authorities descended on a marijuana dispensary in Northridge early Thursday morning.When the SWAT team arrived at the Circle of Hope Alliance dispensary on Roscoe Boulevard around 6 a.m., they swept through the property, but nobody was found in the building.Police said the three possibly armed male suspects went inside the building and managed to get away.The scene was cleared within hours, but authorities have not released updates on their search.'

In [23]:
df_non_violent['Text'].iloc[100]

'PERRIS, Calif. (KABC) -- One shopper\'s horrific experience has her thinking twice about ever going back to a Perris Walmart.She says a man grabbed her leg and tried to take a picture up her skirt using a cellphone as she was buying groceries for her children.Now, she wants to warn other shoppers before he does something like this again."I can\'t sleep, I can\'t eat," said the woman, Rosa. "I think about not only my safety but the safety of my children, the safety of so many women who go in that store."The Mead Valley woman says she was shopping at the Walmart supercenter on Perris Boulevard in Perris Friday night around 6:30 p.m. An unknown man approached her in one of the food aisles."As he turned the corner he had this determined look in his eye, he was coming towards me."She says at first she thought he was just walking quickly, and was going to pass by.But she says the guy put his hand on her legs, and tried to take a picture up her skirt using a cellphone."As soon as I felt it i

### Types of Crimes

#### Crimes :
0. Ambiguous
1. Shootings
2. Assault / Domestic Violence / Abuse
3. Car Crashes / Chases / Carjackings
4. Homicides
5. Sexual Crimes
6. Robberies
7. Abductions
8. Drug-related Crimes
9. Fraud / Scam
10. Terrorism / Bombings / Bomb Threats
11. Vandalism
12. Protests
13. Police brutality

In [24]:
def TypeOfCrime(list_):
    
    if 'shoot' in list_:
        return 1
    elif 'assault' in list_ or 'abuse' in list_ or 'domestic violence' in list_:
        return 2
    elif 'crash' in list_ or 'drunk' in list_ or 'carjacking' in list_ or 'racing' in list_ or 'hit and run' in list_ or 'road rage' in list_ or 'dui' in list_ or 'chase' in list_:
        return 3
    elif 'murder' in list_ or 'homicide' in list_ or 'death' in list_ or 'killed' in list_:
        return 4
    elif 'sex' in list_ or 'nude' in list_ or 'rape' in list_ or 'porn' in list_ or 'naked' in list_ or 'peeping' in list_ or 'prostitution' in list_ or 'human trafficking' in list_ or 'molestation' in list_ or 'stalking' in list_:
        return 5
    elif 'robbery' in list_ or 'burglary' in list_ or 'theft' in list_ or 'stolen' in list_ or 'shoplifting' in list_:
        return 6
    elif 'abduct' in list_ or 'kidnap' in list_ or 'missing' in list_ or 'amber alert' in list_:
        return 7
    elif 'drug' in list_ or 'meth' in list_ or 'marijuana' in list_:
        return 8
    elif 'fraud' in list_ or 'identity theft' in list_ or 'scam' in  list_:
        return 9
    elif 'terror' in list_ or 'bomb' in list_:
        return 10
    elif 'vandalism' in list_:
        return 11
    elif 'protest' in list_:
        return 12
    elif 'police brutality' in list_:
        return 13
    else:
        return 0

In [25]:
df['TypeOfCrime'] = df['Tags'].apply(lambda row : TypeOfCrime(row))

In [26]:
df['TypeOfCrime'].value_counts().sort_index()

0     1576
1     1533
2      625
3      634
4      626
5      154
6      357
7      220
8       84
9       88
10      57
11     134
12     216
13      70
Name: TypeOfCrime, dtype: int64

### Output :

In [27]:
df.to_csv("NLP.csv")