## Production script for Classification

This script highlights how the trained linear SVM model is to be loaded and run against the data of interest to classify text into their appropriate categories. The codes also identifies the location of event and the activity causing it where possible. The code assumes that the data is extracted in bulk in a CSV or Excel format with the column of interest labelled CONTENT. If the column is named otherwise, change the name accordingly throughout this script or in the data file (preferable). The processed data after classification will be re-saved in the original extension as the original file in the main directory as this script.

In [1]:
import os
import pandas as pd
import numpy as np
import gensim
from sklearn import utils
import regex as re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import dill as pickle

### Text preprocessing

Below we define a function to convert text to lower-case and strip punctuation/symbols from words and so on.

In [2]:
def cleanText(text):
    text = re.sub(r'#', r'apartment unit ', text)
    text = re.sub(r'['+string.punctuation+']', r" ", text)
    text = re.sub(r'[0-9]+', r' ', text)
    text = re.sub('\s+', " ", text).strip()
    text = text.lower()
    text = re.sub(r"\bv deck\b", r"voiddeck", text)
    text = re.sub(r"\bvd\b", r"voiddeck", text)
    text = re.sub(r"\bv d\b", r"voiddeck", text)
    text = re.sub(r"\bvdeck\b", r"voiddeck", text)
    text = re.sub(r"\bvoid deck\b", r"voiddeck", text)
    text = re.sub(r"\bfbck\b", r"feedback", text)
    text = re.sub(r"\bfbk\b", r"feedback", text)
    text = re.sub(r"\bf back\b", r"feedback", text)
    text = re.sub(r"\bfback\b", r"feedback", text)
    text = re.sub(r"\bfup\b", r"feedback", text)
    text = re.sub(r"\bfb\b", r"feedback", text)
    text = re.sub(r"\bres\b", r"resident", text)
    text = re.sub(r"\bresd\b", r"resident", text)
    text = re.sub(r"\bresid\b", r"resident", text)
    text = re.sub(r"\breno\b", r"renovation", text)
    text = re.sub(r"\bneighbour\b", r"neighbor", text)
    text = re.sub(r"\bneighbours\b", r"neighbor", text)
    text = re.sub(r"\bneighbors\b", r"neighbor", text)
    text = re.sub(r"\breq\b", r"request", text)
    text = re.sub(r"\brq\b", r"request", text)
    text = re.sub(r"\binfd\b", r"informed", text)
    text = re.sub(r"\badv\b", r"advice", text)
    text = re.sub(r"\badvic\b", r"advice", text)
    text = re.sub(r"\badvis\b", r"advice", text)
    text = re.sub(r"\badvise\b", r"advice", text)
    text = re.sub(r"\badviceeise\b", r"advice", text)
    text = re.sub(r"\bppl\b", r"people", text)
    text = re.sub(r"\bindians\b", r"indian", text)
    text = re.sub(r"\bmalays\b", r"malay", text)
    text = re.sub(r"\bment\b", r"apartment", text)
    text = re.sub(r"\btcil\b", r"towncouncil", text)
    text = re.sub(r"\btown council\b", r"towncouncil", text)
    text = re.sub(r"\bcoffee shop\b", r"coffeeshop", text)
    text = re.sub(r"\breno\b", r"renovation", text)
    text = re.sub(r"\brenov\b", r"renovation", text)
    text = re.sub(r"\bcpark\b", r"carpark", text)
    text = re.sub(r"\bcar park\b", r"carpark", text)
    text = re.sub(r"\bmscp\b", r"multi-storey carpark", text)
    text = re.sub(r"\bmulti storey\b", r"multi-storey", text)
    text = re.sub(r"\bmultistorey\b", r"multi-storey", text)
    text = re.sub(r"\broad work\b", r"roadworks", text)
    text = re.sub(r"\broad works\b", r"roadworks", text)
    text = re.sub(r"\bfrm\b", r"from", text)
    text = re.sub(r"\bgrp\b", r"group", text)
    text = re.sub(r"\bhse\b", r"house", text)
    text = re.sub(r"\bcan t\b", r"cannot", text)
    text = re.sub(r"\bgrp\b", r"group", text)
    text = re.sub(r"\bsaket bord\b", r"skateboard", text)
    text = re.sub(r"\bw out\b", r"without", text)
    text = re.sub(r"\bongg\b", r"ongoing", text)
    text = re.sub(r"\bon gg\b", r"ongoing", text)
    text = re.sub(r"\bfwake\b", r"funeral wake", text)
    text = re.sub(r"\bcorr\b", r"corridor", text)
    text = re.sub(r"\bcorri\b", r"corridor", text)
    text = re.sub(r"\bcorridors\b", r"corridor", text)
    text = re.sub(r"\bpg\b", r"playground", text)
    text = re.sub(r"\bpgd\b", r"playground", text)
    text = re.sub(r"\bpgrd\b", r"playground", text)
    text = re.sub(r"\blway\b", r"linkway", text)
    text = re.sub(r"\bl way\b", r"linkway", text)
    text = re.sub(r"\bcontratcor\b", r"linkway", text)
    text = re.sub(r"\bos\b", r"outside", text) 
    text = re.sub(r"\bo s\b", r"outside", text) 
    text = re.sub(r"\binfd\b", r"informed", text)
    text = re.sub(r"\bfball\b", r"football", text)
    text = re.sub(r"\bfoot ball\b", r"football", text)
    text = re.sub(r"\brtop\b", r"rooftop", text)
    text = re.sub(r"\br top\b", r"rooftop", text)
    text = re.sub(r"\bramping\b", r"revving", text)
    text = re.sub(r"\bramming\b", r"revving", text)
    text = re.sub(r"\bcolln\b", r"collection", text)
    text = re.sub(r"\bcontruction\b", r"construction", text)
    text = re.sub(r"\bconctruction\b", r"construction", text)
    text = re.sub(r"\bpurposed\b", r"purpose", text)
    text = re.sub(r"\bmph\b", r"multi-purpose hall", text)
    text = re.sub(r"\bmphall\b", r"multi-purpose hall", text)
    text = re.sub(r"\bmulti purpose\b", r"multi-purpose", text)
    text = re.sub(r"\bmultipurpose\b", r"multi-purpose", text)
    text = re.sub(r"\bmulti purpos\b", r"multi-purpose", text)
    text = re.sub(r"\bfurnitures\b", r"furniture", text)
    text = re.sub(r"\bchairs\b", r"chair", text)
    text = re.sub(r"\btables\b", r"table", text)
    text = re.sub(r"\bcheers\b", r"cheer", text)
    text = re.sub(r"\bsongs\b", r"song", text)
    text = re.sub(r"\bsepak takraw\b", r"sepaktakraw", text)
    text = re.sub(r"\bbadmointon\b", r"badminton", text)
    text = re.sub(r"\bbadminton court\b", r"badmintoncourt", text)
    text = re.sub(r"\bb minton\b", r"badminton", text)
    text = re.sub(r"\bbasket ball\b", r"basketball", text)
    text = re.sub(r"\bbasketball court\b", r"basketballcourt", text)
    text = re.sub(r"\btakewondo\b", r"taekwondo", text)
    text = re.sub(r"\btaichi\b", r"tai chi", text)
    text = re.sub(r"\bgathing\b", r"gathering", text)
    text = re.sub(r"\bsoccer court\b", r"soccercourt", text)
    text = re.sub(r"\bfootball court\b", r"footballcourt", text)
    return text

The function below tokenizes the text data in each row (document) into individual words. Sentences smaller than 3 words and words that correspond to stopwords are discarded.

In [3]:
import nltk
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

def tokenize_text(text, stop_words):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 3 or word in stop_words:
                continue
            tokens.append(word.lower())
    return tokens

## Pre-processing Text Data for classification

In [4]:
# change below accordingly to where the file is located
filename = './Raw Data/Noise_Data-20180201-20180430_with_postal_code.xlsx'

if filename.lower().endswith('.csv'):
    df = pd.read_csv(filename, encoding='latin1')
elif filename.lower().endswith('.xlsx'):
    df = pd.read_excel(filename)
else:
    raise ValueError('Wrong file format! Please use CSV or Excel.')

# Display first 10 rows of the data
df.head(10)

Unnamed: 0,INFMT_TEL,CONTENT,INC_TYPE,REPORT_TIME,INC_LATITUDE,INC_LONGTITUDE,Postal Code
0,pOi2Y,A LOT OF DRUNK MAN MAKING NOISE. WANT TO SLEEP...,Noise Pollution,01/02/2018 00:01:53,1.445663,103.798129,730768
1,qvMak,A GROUP OF TEENAGERS HAVE BEEN MAKING NOISE FO...,Noise Pollution,01/02/2018 00:03:03,1.366885,103.954505,510121
2,+9sdG,A BUNCH OF DRUNK YOUNGSTERS IS MAKING SO MUCH ...,Noise Pollution,01/02/2018 00:03:04,1.436902,103.841032,761269
3,plWnf,GROUP OF MOTORCYCLIST REVVING THEIR ENGINE.,Noise Pollution,01/02/2018 00:04:29,1.354098,103.95875,520382
4,5D1pB,SOMEONE PLAYING LOUD MUSIC OUTSIDE MY CONDO,Noise Pollution,01/02/2018 00:06:15,1.314288,103.890696,398026
5,WuWwB,This KTV at Katong Plaza ground floor is very ...,Noise Pollution,01/02/2018 00:09:04,1.304163,103.903875,429979
6,p5tM4,SOME IS PLAYING THE REMOTE CONTROL CAR AT THE ...,Noise Pollution,01/02/2018 00:09:31,1.387174,103.907775,540160
7,oXXCA,"Top floor neighbor making alot of noise, It se...",Noise Pollution,01/02/2018 00:12:34,1.323714,103.738649,600020
8,+9sdG,OUR NEIGHBOR SON OPEN THE DOOR AND MAKING A LO...,Noise Pollution,01/02/2018 00:16:41,1.30555,103.863563,190010
9,HzzhM,"Group of people at the void deck very noisy, c...",Noise Pollution,01/02/2018 00:16:52,1.382495,103.764046,670543


In [5]:
# Process the data in CONTENT and put the processed data in a new column
df['CONTENT_clean'] = df['CONTENT'].apply(cleanText)

In [6]:
df['CONTENT'].iloc[30] # view example of original data

'Group of people causing noise near the mookata shop. Please advise'

In [7]:
df['CONTENT_clean'].iloc[30] # view examples of the cleaned data (before removal of stopwords)

'group of people causing noise near the mookata shop please advice'

Define locations of noise and the activities causing it.

In [8]:
# Tag locations and activity to complaints where possible
location = ["void deck",
            "multi-purpose hall",
            "multi-purpose court",
            "multi-purpose plaza",
            "corridor",
            "basketballcourt", 
            "badmintoncourt",
            "soccercourt",
            "footballcourt",
            "playground",
            "carpark",
            "multi-storey carpark",
            "coffeeshop"]

activity = ["mahjong",
            "babies", 
            "crying",
            "dismantling",
            "dismantle",
            "furniture", 
            "drag",
            "hack",
            "push", 
            "drag",
            "kick",            
            "bang",
            "table",
            "chairs", 
            "pull",
            "shout",
            "talk",
            "cheer",
            "play",
            "dance",
            "dancing"
            "music",
            "song",
            "dance",
            "sepaktakraw",
            "skateboard", 
            "wedding", 
            "funeral", 
            "party", 
            "taekwondo",
            "exercise",
            "exercising",
            "wushu",
            "tai chi"]

activity2 = ['basketball',
             'football',
             'soccer',
             'badminton']

In [9]:
df['Location'] = ""

location_lst = "|".join(location)

ind = df.index[df['CONTENT_clean'].str.contains(location_lst)]

for i in ind:
    tmp = []
    for word in location:
        if word in df['CONTENT_clean'].loc[i]:
            tmp.append(word)
    if tmp:
        df['Location'].loc[i] = ','.join(tmp)

df['Location'].head(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


0                                 
1                                 
2                                 
3                                 
4                                 
5                                 
6     carpark,multi-storey carpark
7                                 
8                                 
9                                 
10                                
11                      coffeeshop
12                      coffeeshop
13                                
14                                
15                                
16                                
17                                
18                                
19                                
20                                
21                                
22                                
23                                
24                                
25                                
26                                
27                                
28                  

In [10]:
df['Activity'] = ""

activity_lst = "|".join(activity)

ind = df.index[df['CONTENT_clean'].str.contains(activity_lst)]

for i in ind:
    tmp = []
    for word in activity:
        if word in df['CONTENT_clean'].loc[i]:
            tmp.append(word)
    for word in activity2:
        if re.search(r'\b' + word + r'\b', df['CONTENT_clean'].loc[i]):
            tmp.append(word)
    if tmp:
        df['Activity'].loc[i] = ','.join(tmp)

df['Activity'].head(30)

0          
1          
2          
3          
4      play
5          
6      play
7      bang
8      talk
9          
10         
11         
12         
13         
14         
15         
16         
17     talk
18         
19         
20         
21         
22     bang
23    party
24         
25         
26         
27         
28         
29         
Name: Activity, dtype: object

Tokenize the test data.

In [11]:
X_test = []
for i in range(df['CONTENT_clean'].shape[0]):
    X_test.append(tokenize_text(df['CONTENT_clean'].iloc[i], stop_words))

This is what a data entry looks like.

In [12]:
X_test[0]

['lot', 'drunk', 'man', 'making', 'noise', 'want', 'sleep', 'also']

## Running classifer on test data

In [13]:
# load the model from disk
svc_tfidf = pickle.load(open('noise_categories_model_svm_tfidf.pkl', 'rb'))

In [14]:
df['Category Prediction'] = svc_tfidf.predict(X_test)

In [15]:
df = df.replace(np.nan, '', regex=True)

#save data either to csv or excel depending on the original data file
filename_s = os.path.splitext(os.path.basename(filename))[0] + ' Predicted Categories'
if filename.lower().endswith('.csv'):
    df.to_csv(filename_s + '.csv', index = False)
else:
    writer = pd.ExcelWriter(filename_s + '.xlsx')
    df.to_excel(writer,'Sheet1',index=False)
    writer.save()