In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import radians, sin, cos, sqrt, atan2
import random
from datetime import datetime
import time
import re

from IPython.display import clear_output
%config NotebookApp.iopub_msg_rate_limit=100000000
%config NotebookApp.rate_limit_window=20.0

In [2]:
df_1 = pd.read_csv('crime.csv')

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

df_1['Descript'] = df_1['Descript'].apply(remove_punctuation)
df_1['Descript'] = df_1['Descript'].astype(str).str.lower()
df_1.head(20)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,warrant arrest,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,traffic violation arrest,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,traffic violation arrest,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,grand theft from locked auto,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,grand theft from locked auto,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
5,2015-05-13 23:30:00,LARCENY/THEFT,grand theft from unlocked auto,Wednesday,INGLESIDE,NONE,0 Block of TEDDY AV,-122.403252,37.713431
6,2015-05-13 23:30:00,VEHICLE THEFT,stolen automobile,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138
7,2015-05-13 23:30:00,VEHICLE THEFT,stolen automobile,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564
8,2015-05-13 23:00:00,LARCENY/THEFT,grand theft from locked auto,Wednesday,RICHMOND,NONE,600 Block of 47TH AV,-122.508194,37.776601
9,2015-05-13 23:00:00,LARCENY/THEFT,grand theft from locked auto,Wednesday,CENTRAL,NONE,JEFFERSON ST / LEAVENWORTH ST,-122.419088,37.807802


In [3]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [4]:
categories = df_1['Category'].unique().tolist()
descripts = df_1['Descript'].unique().tolist()
daysOfWeek = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
districts = df_1['PdDistrict'].unique().tolist()
resolutions = df_1['Resolution'].unique().tolist()

In [5]:
random.seed(7)

def randIDX(df, category, num):
    filtered_df = df.loc[df.loc[:,'Category'] == category]
    indices = filtered_df.index.tolist()
    random_indices = random.sample(indices, min(num, len(indices)))
    return random_indices

In [6]:
random.seed(7)

trainNum = 10000
testNum = 3500

trainIDX = []
testIDX = []

staticIDX = []
for i in range(len(categories)):
    staticIDX.append(randIDX(df_1, categories[i], 5))

while len(trainIDX) < trainNum:
    for i in range(len(staticIDX)):
        for j in range(len(staticIDX[i])):
            trainIDX.append(staticIDX[i][j])
    num = random.randint(0, len(df_1)-1)
    if not num in trainIDX:
        trainIDX.append(num)
        
while len(testIDX) < testNum:
    num = random.randint(0, len(df_1)-1)
    if not num in testIDX and not num in trainIDX:
        testIDX.append(num)

testDF = df_1.iloc[testIDX].reset_index(drop=True)
trainDF = df_1.iloc[trainIDX].reset_index(drop=True)

In [7]:
#common words that do not help clasify category
wordsToExclude = ['of', 'or', 'a', 'by', 'the', 'on', 'from']

def findWords(df, category, current, total):
    locDF = df.loc[df.loc[:,'Category'] == category]
    
    currIteration = 1
    allWords = []
    for descript in locDF['Descript']:
        words = descript.split()
        for word in words:
            if not word in wordsToExclude:
                allWords.append(word)
        percentageDone = round((((currIteration/len(locDF))*33.33333)*(1/total))+(((current-1)/total)*100),2)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
        time.sleep(0.001)
        currIteration += 1

    dictionary = {'category':[], 'topWord':[], 'count':[], 'proportionOfAllWords':[]}
    word_counts = pd.DataFrame(dictionary)
    
    currIteration = 1

    for word in allWords:
        if word in word_counts['topWord'].tolist():
            index = word_counts[word_counts['topWord'] == word].index[0]
            word_counts.loc[index, 'count'] += 1
        else:
            new_word = pd.DataFrame({'category':[category], 'topWord':[word], 'count':[1]})
            word_counts = pd.concat([word_counts, new_word], ignore_index=True)
        percentageDone = round(((((currIteration/len(allWords))*33.33333)+33.33333)*(1/total))+(((current-1)/total)*100),2)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
        time.sleep(0.001)
        currIteration += 1
    
    for j in range(len(word_counts)):
        word_counts.loc[j, 'proportionOfAllWords'] = word_counts.loc[j, 'count']/len(allWords)
        percentageDone = round((((((j+1)/len(word_counts))*33.33333)+66.66667)*(1/total))+(((current-1)/total)*100),2)
        time.sleep(0.001)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
         
    word_counts = word_counts.sort_values(by='count', ascending=False).reset_index(drop=True)
    word_counts = word_counts.drop('count', axis=1).reset_index(drop=True)  
#     return word_counts.reset_index(drop=True).loc[[0]]
    if len(word_counts) > 2:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': [word_counts.loc[1, 'topWord']],
        'thirdWord': [word_counts.loc[2, 'topWord']],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}
    elif len(word_counts) > 1:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': [word_counts.loc[1, 'topWord']],
        'thirdWord': ['null'],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}
    else:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': ['null'],
        'thirdWord': ['null'],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}

In [8]:
df = {'category':[], 'topWord':[], 'secondWord':[], 'thirdWord':[], 'proportionOfAllWords':[]}
df = pd.DataFrame(df)

for i in range(len(categories)):
    row = pd.DataFrame(findWords(trainDF, categories[i], (i+1), len(categories)))
    df = pd.concat([df, row], ignore_index=True)

df.head()

100.0% done, category: 39/39


Unnamed: 0,category,topWord,secondWord,thirdWord,proportionOfAllWords
0,WARRANTS,warrant,arrest,enroute,0.332278
1,OTHER OFFENSES,violation,license,police,0.163202
2,LARCENY/THEFT,theft,grand,locked,0.27732
3,VEHICLE THEFT,stolen,vehicle,recovered,0.307692
4,VANDALISM,malicious,mischief,vandalism,0.277895


In [14]:
df

Unnamed: 0,category,topWord,secondWord,thirdWord,proportionOfAllWords
0,WARRANTS,warrant,arrest,enroute,0.332278
1,OTHER OFFENSES,violation,license,police,0.163202
2,LARCENY/THEFT,theft,grand,locked,0.27732
3,VEHICLE THEFT,stolen,vehicle,recovered,0.307692
4,VANDALISM,malicious,mischief,vandalism,0.277895
5,NON-CRIMINAL,aided,case,property,0.154179
6,ROBBERY,robbery,street,with,0.293919
7,ASSAULT,battery,threats,against,0.952727
8,WEAPON LAWS,firearm,carrying,weapon,0.095628
9,BURGLARY,entry,burglary,unlawful,0.237965


In [15]:
len(df['topWord'].unique())

38

In [90]:
def row_Matches(df, rowToCheck):
    numOfMatches = 0
    for index, row in df.iterrows():  # Iterate over rows
        if rowToCheck.equals(row):
            numOfMatches += 1
    if numOfMatches > 0:
        return True
    else:
        return False

def clasify(unknown, tree):
    words = unknown['Descript'].split()
    possibleCats = pd.DataFrame()
    for word in words:
        row1 = tree.loc[tree.loc[:, 'topWord'] == word]
        possibleCats = pd.concat([possibleCats, row1], ignore_index=True)
        row2 = tree.loc[tree.loc[:, 'secondWord'] == word]
        if not row2.empty and not row_Matches(possibleCats, row2.iloc[0]):  # Check if row2 is not empty before accessing its first row
            possibleCats = pd.concat([possibleCats, row2], ignore_index=True)
        row3 = tree.loc[tree.loc[:, 'thirdWord'] == word]
        if not row3.empty and not row_Matches(possibleCats, row3.iloc[0]):  # Check if row3 is not empty before accessing its first row
            possibleCats = pd.concat([possibleCats, row3], ignore_index=True)
    
    print(possibleCats)
    wordMatches = []
    
    for i in range(len(possibleCats)):
        matches = 0
        for word in words:
            if word in possibleCats.loc[i, 'topWord']:
                matches += 1
            if word in possibleCats.loc[i, 'secondWord']:
                matches += 1
            if word in possibleCats.loc[i, 'thirdWord']:
                matches += 1
        wordMatches.append(matches)
        
    maxValue = max(wordMatches)
    maxIDX = wordMatches.index(maxValue)
    
    possibleCats = possibleCats.iloc[maxIDX].reset_index(drop=True)
    return possibleCats


In [91]:
clasify(testDF.loc[935], df)

        category       topWord secondWord thirdWord  proportionOfAllWords
0  LARCENY/THEFT         theft      grand    locked              0.277320
1   EMBEZZLEMENT  embezzlement      theft     grand              0.277778
2  LARCENY/THEFT         theft      grand    locked              0.277320
3          FRAUD        credit       card     theft              0.210685


0    LARCENY/THEFT
1            theft
2            grand
3           locked
4          0.27732
Name: 0, dtype: object

In [92]:
print("descript: " + testDF.loc[935, 'Descript'] + ", category: " + testDF.loc[935, 'Category'])

descript: grand theft from locked auto, category: LARCENY/THEFT


In [93]:
df.loc[df.loc[:,'topWord'] == 'theft']

Unnamed: 0,category,topWord,secondWord,thirdWord,proportionOfAllWords
2,LARCENY/THEFT,theft,grand,locked,0.27732
