In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import radians, sin, cos, sqrt, atan2
import random
from datetime import datetime
import time
import re

from IPython.display import clear_output
%config NotebookApp.iopub_msg_rate_limit=100000000
%config NotebookApp.rate_limit_window=20.0

In [2]:
df_1 = pd.read_csv('crime.csv')

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

df_1['Descript'] = df_1['Descript'].apply(remove_punctuation)
df_1['Descript'] = df_1['Descript'].astype(str).str.lower()

In [3]:
categories = df_1['Category'].unique().tolist()
descripts = df_1['Descript'].unique().tolist()
daysOfWeek = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
districts = df_1['PdDistrict'].unique().tolist()
resolutions = df_1['Resolution'].unique().tolist()

In [4]:
random.seed(333)

def randIDX(df, category, num):
    filtered_df = df.loc[df.loc[:,'Category'] == category]
    indices = filtered_df.index.tolist()
    random_indices = random.sample(indices, min(num, len(indices)))
    return random_indices

In [5]:
random.seed(333)

trainNum = 50000
testNum = 20000

trainIDX = []
testIDX = []

staticIDX = []
for i in range(len(categories)):
    staticIDX.append(randIDX(df_1, categories[i], 5))

while len(trainIDX) < trainNum:
    for i in range(len(staticIDX)):
        for j in range(len(staticIDX[i])):
            trainIDX.append(staticIDX[i][j])
    num = random.randint(0, len(df_1)-1)
    if not num in trainIDX:
        trainIDX.append(num)
        
while len(testIDX) < testNum:
    num = random.randint(0, len(df_1)-1)
    if not num in testIDX and not num in trainIDX:
        testIDX.append(num)

testDF = df_1.iloc[testIDX].reset_index(drop=True)
trainDF = df_1.iloc[trainIDX].reset_index(drop=True)

In [6]:
#common words that do not help clasify category
wordsToExclude = ['of', 'or', 'a', 'by', 'the', 'on', 'from', 'to', 'an', 'for', 'in']

def findWords(df, category, current, total):
    locDF = df.loc[df.loc[:,'Category'] == category]
    
    currIteration = 1
    allWords = []
    for descript in locDF['Descript']:
        words = descript.split()
        for word in words:
            if not word in wordsToExclude:
                allWords.append(word)
        percentageDone = round((((currIteration/len(locDF))*33.33333)*(1/total))+(((current-1)/total)*100),2)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
        time.sleep(0.001)
        currIteration += 1

    dictionary = {'category':[], 'topWord':[], 'count':[], 'proportionOfAllWords':[]}
    word_counts = pd.DataFrame(dictionary)
    
    currIteration = 1

    for word in allWords:
        if word in word_counts['topWord'].tolist():
            index = word_counts[word_counts['topWord'] == word].index[0]
            word_counts.loc[index, 'count'] += 1
        else:
            new_word = pd.DataFrame({'category':[category], 'topWord':[word], 'count':[1]})
            word_counts = pd.concat([word_counts, new_word], ignore_index=True)
        percentageDone = round(((((currIteration/len(allWords))*33.33333)+33.33333)*(1/total))+(((current-1)/total)*100),2)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
        time.sleep(0.001)
        currIteration += 1
    
    for j in range(len(word_counts)):
        word_counts.loc[j, 'proportionOfAllWords'] = word_counts.loc[j, 'count']/len(allWords)
        percentageDone = round((((((j+1)/len(word_counts))*33.33333)+66.66667)*(1/total))+(((current-1)/total)*100),2)
        time.sleep(0.001)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
         
    word_counts = word_counts.sort_values(by='count', ascending=False).reset_index(drop=True)
    word_counts = word_counts.drop('count', axis=1).reset_index(drop=True)  
#     return word_counts.reset_index(drop=True).loc[[0]]
    if len(word_counts) > 3:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': [word_counts.loc[1, 'topWord']],
        'thirdWord': [word_counts.loc[2, 'topWord']],
        'fourthWord': [word_counts.loc[3, 'topWord']],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}
    elif len(word_counts) > 2:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': [word_counts.loc[1, 'topWord']],
        'thirdWord': [word_counts.loc[2, 'topWord']],
        'fourthWord': ['null'],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}
    elif len(word_counts) > 1:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': [word_counts.loc[1, 'topWord']],
        'thirdWord': ['null'],
        'fourthWord': ['null'],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}
    else:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': ['null'],
        'thirdWord': ['null'],
        'fourthWord': ['null'],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}

In [7]:
def all_segments_3(sentence):
    words = sentence.split()
    segments = []
    for i in range(len(words) - 2):  # Change the range to len(words) - 2 for 3-word segments
        segment = " ".join(words[i:i+3])  # Adjust segment creation for 3 words
        segments.append(segment)
    return segments

def all_segments_2(sentence):
    words = sentence.split()
    segments = []
    for i in range(len(words) - 1):  # Change the range to len(words) - 2 for 3-word segments
        segment = " ".join(words[i:i+2])  # Adjust segment creation for 3 words
        segments.append(segment)
    return segments

def findSegments(df, category, current, total):
    locDF = df.loc[df.loc[:,'Category'] == category]
    
    currIteration = 1
    allSegments = []
    for descript in locDF['Descript']:
        segments = []
        if len(descript.split()) > 2:
            segments = all_segments_2(descript)
        else:
            segments.append(descript)
        for seg in segments:
            if not seg in allSegments:
                allSegments.append(seg)
        percentageDone = round(((((currIteration/len(locDF))*100))*(1/total))+(((current-1)/total)*100),2)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
        time.sleep(0.001)
        currIteration += 1
        
    dictionary = {'category':[category], 'allSegs':[allSegments]}
    segments = pd.DataFrame(dictionary)
    
    return segments

In [8]:
words_df = {'category':[], 'topWord':[], 'secondWord':[], 'thirdWord':[], 'fourthWord':[], 'proportionOfAllWords':[]}
words_df = pd.DataFrame(words_df)

for i in range(len(categories)):
    row = pd.DataFrame(findWords(trainDF, categories[i], (i+1), len(categories)))
    words_df = pd.concat([words_df, row], ignore_index=True)

100.0% done, category: 39/39


In [9]:
# findSegments(trainDF, categories[3], 1, 1)

In [10]:
segments_df = pd.DataFrame()

for i in range(len(categories)):
    row = pd.DataFrame(findSegments(trainDF, categories[i], (i+1), len(categories)))
    segments_df = pd.concat([segments_df, row], ignore_index=True)

100.0% done, category: 39/39


In [11]:
segments_df

Unnamed: 0,category,allSegs
0,WARRANTS,"[warrant arrest, enroute to, to outside, outsi..."
1,OTHER OFFENSES,"[drivers license, license suspended, suspended..."
2,LARCENY/THEFT,"[grand theft, theft from, from locked, locked ..."
3,VEHICLE THEFT,"[recovered vehicle, vehicle stolen, stolen out..."
4,VANDALISM,"[malicious mischief, mischief vandalism, vanda..."
5,NON-CRIMINAL,"[found property, death report, report cause, c..."
6,ROBBERY,"[carjacking with, with a, a gun, robbery bodil..."
7,ASSAULT,"[threats against, against life, battery, firea..."
8,WEAPON LAWS,"[poss of, of firearm, firearm by, by convicted..."
9,BURGLARY,"[burglary of, of apartment, apartment house, h..."


In [12]:
def row_Matches(df, rowToCheck):
    numOfMatches = 0
    for i in range(len(df)):
        if rowToCheck['category'] == df.loc[i, 'category']:
            numOfMatches += 1
    if numOfMatches > 0:
        return True
    else:
        return False

def clasify(unknown, tree):
    words = unknown['Descript'].split()
    possibleCats = pd.DataFrame()
    for word in words:
        row1 = tree.loc[tree.loc[:, 'topWord'] == word]
        possibleCats = pd.concat([possibleCats, row1], ignore_index=True).reset_index(drop=True)
        row2 = tree.loc[tree.loc[:, 'secondWord'] == word]
        if not row2.empty and not row_Matches(possibleCats, row2.iloc[0]):  # Check if row2 is not empty before accessing its first row
            possibleCats = pd.concat([possibleCats, row2], ignore_index=True).reset_index(drop=True)
        row3 = tree.loc[tree.loc[:, 'thirdWord'] == word]
        if not row3.empty and not row_Matches(possibleCats, row3.iloc[0]):  # Check if row3 is not empty before accessing its first row
            possibleCats = pd.concat([possibleCats, row3], ignore_index=True).reset_index(drop=True)
        row4 = tree.loc[tree.loc[:, 'fourthWord'] == word]
        if not row4.empty and not row_Matches(possibleCats, row4.iloc[0]):  # Check if row3 is not empty before accessing its first row
            possibleCats = pd.concat([possibleCats, row4], ignore_index=True).reset_index(drop=True)
            
    wordMatches = []
    for i in range(len(possibleCats)):
        matches = 0
        if possibleCats.loc[i, 'topWord'] in words:
            matches += 1
        if possibleCats.loc[i, 'secondWord'] in words:
            matches += 1
        if possibleCats.loc[i, 'thirdWord'] in words:
            matches += 1
        if possibleCats.loc[i, 'fourthWord'] in words:
            matches += 1
        wordMatches.append(matches)
        
    if len(possibleCats) > 1:
        maxValue = max(wordMatches)
        maxIDX = wordMatches.index(maxValue)
        possibleCats = possibleCats.iloc[[maxIDX]].reset_index(drop=True)
        
        return possibleCats.loc[0, 'category']
    
    if len(possibleCats) > 0:
        possibleCats = possibleCats.reset_index(drop=True)
        return possibleCats.loc[0, 'category']
    else:
        return 'unsure'

In [13]:
answers = pd.DataFrame()
for i in range(len(testDF)):
    x = clasify(testDF.iloc[i], words_df)
    y = 0
    if x == testDF.loc[i, 'Category']:
        y = 1
    row = pd.DataFrame({'actual': [testDF.loc[i, 'Category']], 'predicted': [x], 'correct': [y]})
    answers = pd.concat([answers, row], ignore_index=True).reset_index(drop=True)
    
    percentageDone = round((i+1)/len(testDF)*100,2)
    time.sleep(0.001)
    clear_output(wait=True)
    print(str(percentageDone) + '% done')

100.0% done


In [14]:
percentCorrect = (answers['correct'].sum()/len(answers))*100
percentCorrect

74.57000000000001

In [15]:
def findCats(df, df_1, stringList, key):
    for i in range(len(df)):
        if df.loc[i, key] in stringList:
            row = df.iloc[[i]]
            if not row_Matches(df_1, row.iloc[0]):
                df_1 = pd.concat([df_1, row], ignore_index=True).reset_index(drop=True)
    df = df_1
    return df
        
def tree(unknownDescript, df):
    words = unknownDescript.split()
    words.append('null')
    
    possibleCats = pd.DataFrame()
    possibleCats_1 = pd.DataFrame()
    
    for i in range(len(df)):
        if df.loc[i, 'topWord'] in words:
            row = df.iloc[[i]]
            if not row_Matches(possibleCats, row.iloc[0]):
                possibleCats = pd.concat([possibleCats, row], ignore_index=True).reset_index(drop=True)
#         if df.loc[i, 'secondWord'] in words:
#             row = df.iloc[[i]]
#             if not row_Matches(possibleCats, row.iloc[0]):
#                 possibleCats = pd.concat([possibleCats, row], ignore_index=True).reset_index(drop=True)
#         if df.loc[i, 'thirdWord'] in words:
#             row = df.iloc[[i]]
#             if not row_Matches(possibleCats, row.iloc[0]):
#                 possibleCats = pd.concat([possibleCats, row], ignore_index=True).reset_index(drop=True)
            
#     print(possibleCats)
                
    if len(possibleCats) == 1:
        return possibleCats.loc[0, 'category']
    if len(possibleCats) > 0:
        possibleCats_1 = pd.DataFrame()
        possibleCats = findCats(possibleCats, possibleCats_1, words, 'secondWord')
        
#     print('-----------------------------')
#     print(possibleCats)
        
    if len(possibleCats) == 1:
        return possibleCats.loc[0, 'category']
    if len(possibleCats) > 0:
        possibleCats_1 = pd.DataFrame()
        possibleCats = findCats(possibleCats, possibleCats_1, words, 'thirdWord')
        
#     print('-----------------------------')
#     print(possibleCats)
        
    if len(possibleCats) == 1:
        return possibleCats.loc[0, 'category']
    if len(possibleCats) > 0:
        possibleCats_1 = pd.DataFrame()
        possibleCats = findCats(possibleCats, possibleCats_1, words, 'fourthWord')
        
#     print('-----------------------------')
#     print(possibleCats)
        
    if len(possibleCats) == 1:
        return possibleCats.loc[0, 'category']
    else:
        return 'unsure'

In [16]:
tree_answers = pd.DataFrame()    
for i in range(len(testDF)):
    x = tree(testDF.loc[i, 'Descript'], words_df)
    y = 0
    if x == testDF.loc[i, 'Category']:
        y = 1
    row = pd.DataFrame({'actual': [testDF.loc[i, 'Category']], 'predicted': [x], 'correct': [y]})
    tree_answers = pd.concat([tree_answers, row], ignore_index=True).reset_index(drop=True)
    
    percentageDone = round((i+1)/len(testDF)*100,2)
    time.sleep(0.001)
    clear_output(wait=True)
    print(str(percentageDone) + '% done')

100.0% done


In [17]:
percentCorrect_tree = (tree_answers['correct'].sum()/len(tree_answers))*100
percentCorrect_tree

63.995000000000005

In [18]:
def seg_tree(unknownDescript, df):
    segments = []
    if len(unknownDescript.split()) > 2:
        segments = all_segments_2(unknownDescript)
    else:
        segments.append(unknownDescript)
        
    possibleCats = pd.DataFrame()
    
    for i in range(len(df)):
        for seg in df.loc[i, 'allSegs']:
            if seg in segments:
                row = df.iloc[[i]]
                if not row_Matches(possibleCats, row.iloc[0]):
                    possibleCats = pd.concat([possibleCats, row], ignore_index=True).reset_index(drop=True)
            
    segMatches = []
    for i in range(len(possibleCats)):
        matches = 0 
        for seg in possibleCats.loc[i, 'allSegs']:
            if seg in segments:
                matches += 1
        segMatches.append(matches)
        
    if len(possibleCats) > 0:
        maxValue = max(segMatches)
        maxIDX = segMatches.index(maxValue)
        possibleCats = possibleCats.iloc[[maxIDX]].reset_index(drop=True)
        
        return possibleCats.loc[0, 'category']
    else:
        return 'unsure'

In [19]:
seg_answers = pd.DataFrame()    
for i in range(len(testDF)):
    x = seg_tree(testDF.loc[i, 'Descript'], segments_df)
    y = 0
    if x == testDF.loc[i, 'Category']:
        y = 1
    row = pd.DataFrame({'actual': [testDF.loc[i, 'Category']], 'predicted': [x], 'correct': [y]})
    seg_answers = pd.concat([seg_answers, row], ignore_index=True).reset_index(drop=True)
    
    percentageDone = round((i+1)/len(testDF)*100,2)
    time.sleep(0.001)
    clear_output(wait=True)
    print(str(percentageDone) + '% done')

100.0% done


In [20]:
percentCorrect_segments = (seg_answers['correct'].sum()/len(seg_answers))*100
percentCorrect_segments

88.03

In [39]:
idx = 1590
word2 = seg_tree(testDF.loc[idx, 'Descript'], segments_df)
word2

'LARCENY/THEFT'

In [40]:
word = testDF.loc[idx, 'Category']
word

'NON-CRIMINAL'

In [41]:
testDF.loc[idx, 'Descript']

'lost property'

In [42]:
num = segments_df.loc[segments_df['category'] == word].index[0]
segments_df.loc[num, 'allSegs']

['found property',
 'death report',
 'report cause',
 'cause unknown',
 'lost property',
 'aided case',
 'case mental',
 'mental disturbed',
 'case dog',
 'dog bite',
 'case closure',
 'stay away',
 'away or',
 'or court',
 'court order',
 'order nondv',
 'nondv related',
 'case property',
 'property for',
 'for destruction',
 'shelter']

In [43]:
num2 = segments_df.loc[segments_df['category'] == word2].index[0]
segments_df.loc[num2, 'allSegs']

['grand theft',
 'theft from',
 'from locked',
 'locked auto',
 'petty theft',
 'theft of',
 'of property',
 'lost property',
 'property petty',
 'theft shoplifting',
 'from a',
 'a building',
 'from person',
 'theft bicycle',
 'attempted theft',
 'locked vehicle',
 'theft pickpocket']

In [36]:
all_segments_2(testDF.loc[idx, 'Descript'])

['lost property']