In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import radians, sin, cos, sqrt, atan2
import random
from datetime import datetime
import time
import re

from IPython.display import clear_output
%config NotebookApp.iopub_msg_rate_limit=100000000
%config NotebookApp.rate_limit_window=20.0

In [2]:
df_1 = pd.read_csv('crime.csv')

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

df_1['Descript'] = df_1['Descript'].apply(remove_punctuation)
df_1['Descript'] = df_1['Descript'].astype(str).str.lower()
df_1.head(20)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,warrant arrest,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,traffic violation arrest,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,traffic violation arrest,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,grand theft from locked auto,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,grand theft from locked auto,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
5,2015-05-13 23:30:00,LARCENY/THEFT,grand theft from unlocked auto,Wednesday,INGLESIDE,NONE,0 Block of TEDDY AV,-122.403252,37.713431
6,2015-05-13 23:30:00,VEHICLE THEFT,stolen automobile,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138
7,2015-05-13 23:30:00,VEHICLE THEFT,stolen automobile,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564
8,2015-05-13 23:00:00,LARCENY/THEFT,grand theft from locked auto,Wednesday,RICHMOND,NONE,600 Block of 47TH AV,-122.508194,37.776601
9,2015-05-13 23:00:00,LARCENY/THEFT,grand theft from locked auto,Wednesday,CENTRAL,NONE,JEFFERSON ST / LEAVENWORTH ST,-122.419088,37.807802


In [3]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [4]:
categories = df_1['Category'].unique().tolist()
descripts = df_1['Descript'].unique().tolist()
daysOfWeek = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
districts = df_1['PdDistrict'].unique().tolist()
resolutions = df_1['Resolution'].unique().tolist()

In [5]:
random.seed(7)

def randIDX(df, category, num):
    filtered_df = df.loc[df.loc[:,'Category'] == category]
    indices = filtered_df.index.tolist()
    random_indices = random.sample(indices, min(num, len(indices)))
    return random_indices

In [6]:
random.seed(7)

trainNum = 7500
testNum = 5000

trainIDX = []
testIDX = []

staticIDX = []
for i in range(len(categories)):
    staticIDX.append(randIDX(df_1, categories[i], 5))

while len(trainIDX) < trainNum:
    for i in range(len(staticIDX)):
        for j in range(len(staticIDX[i])):
            trainIDX.append(staticIDX[i][j])
    num = random.randint(0, len(df_1)-1)
    if not num in trainIDX:
        trainIDX.append(num)
        
while len(testIDX) < testNum:
    num = random.randint(0, len(df_1)-1)
    if not num in testIDX and not num in trainIDX:
        testIDX.append(num)

testDF = df_1.iloc[testIDX].reset_index(drop=True)
trainDF = df_1.iloc[trainIDX].reset_index(drop=True)

In [124]:
#common words that do not help clasify category
wordsToExclude = ['of', 'or', 'a', 'by', 'the', 'on', 'from', 'to', 'an', 'for', 'in']

def findWords(df, category, current, total):
    locDF = df.loc[df.loc[:,'Category'] == category]
    
    currIteration = 1
    allWords = []
    for descript in locDF['Descript']:
        words = descript.split()
        for word in words:
            if not word in wordsToExclude:
                allWords.append(word)
        percentageDone = round((((currIteration/len(locDF))*33.33333)*(1/total))+(((current-1)/total)*100),2)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
        time.sleep(0.001)
        currIteration += 1

    dictionary = {'category':[], 'topWord':[], 'count':[], 'proportionOfAllWords':[]}
    word_counts = pd.DataFrame(dictionary)
    
    currIteration = 1

    for word in allWords:
        if word in word_counts['topWord'].tolist():
            index = word_counts[word_counts['topWord'] == word].index[0]
            word_counts.loc[index, 'count'] += 1
        else:
            new_word = pd.DataFrame({'category':[category], 'topWord':[word], 'count':[1]})
            word_counts = pd.concat([word_counts, new_word], ignore_index=True)
        percentageDone = round(((((currIteration/len(allWords))*33.33333)+33.33333)*(1/total))+(((current-1)/total)*100),2)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
        time.sleep(0.001)
        currIteration += 1
    
    for j in range(len(word_counts)):
        word_counts.loc[j, 'proportionOfAllWords'] = word_counts.loc[j, 'count']/len(allWords)
        percentageDone = round((((((j+1)/len(word_counts))*33.33333)+66.66667)*(1/total))+(((current-1)/total)*100),2)
        time.sleep(0.001)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
         
    word_counts = word_counts.sort_values(by='count', ascending=False).reset_index(drop=True)
    word_counts = word_counts.drop('count', axis=1).reset_index(drop=True)  
#     return word_counts.reset_index(drop=True).loc[[0]]
    if len(word_counts) > 3:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': [word_counts.loc[1, 'topWord']],
        'thirdWord': [word_counts.loc[2, 'topWord']],
        'fourthWord': [word_counts.loc[3, 'topWord']],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}
    elif len(word_counts) > 2:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': [word_counts.loc[1, 'topWord']],
        'thirdWord': [word_counts.loc[2, 'topWord']],
        'fourthWord': ['null'],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}
    elif len(word_counts) > 1:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': [word_counts.loc[1, 'topWord']],
        'thirdWord': ['null'],
        'fourthWord': ['null'],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}
    else:
        return {'category': [word_counts.loc[0, 'category']],
        'topWord': [word_counts.loc[0, 'topWord']], 
        'secondWord': ['null'],
        'thirdWord': ['null'],
        'fourthWord': ['null'],
        'proportionOfAllWords': [word_counts.loc[0, 'proportionOfAllWords']]}

In [142]:
def all_segments(sentence):
    words = sentence.split()
    segments = []
    for i in range(len(words) - 1):
        segment = " ".join(words[i:i+2])
        segments.append(segment)
    return segments

def findSegments(df, category, current, total):
    locDF = df.loc[df.loc[:,'Category'] == category]
    
    currIteration = 1
    allSegments = []
    for descript in locDF['Descript']:
        segments = []
        if len(descript.split()) > 1:
            segments = all_segments(descript)
        else:
            segments.append(descript)
        for seg in segments:
            allSegments.append(seg)
        percentageDone = round(((((currIteration/len(locDF))*50))*(1/total))+(((current-1)/total)*100),2)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
        time.sleep(0.001)
        currIteration += 1
        
    dictionary = {'category':[], 'topSeg':[], 'count':[]}
    seg_counts = pd.DataFrame(dictionary)
    
    currIteration = 1
    for seg in allSegments:
        if seg in seg_counts['topSeg'].tolist():
            index = seg_counts[seg_counts['topSeg'] == seg].index[0]
            seg_counts.loc[index, 'count'] += 1
        else:
            new_seg = pd.DataFrame({'category':[category], 'topSeg':[seg], 'count':[1]})
            seg_counts = pd.concat([seg_counts, new_seg], ignore_index=True)
        percentageDone = round(((((currIteration/len(allSegments))*50)+50)*(1/total))+(((current-1)/total)*100),2)
        clear_output(wait=True)
        print(str(percentageDone) + '% done, category: ' + str(current) + '/' + str(total))
        time.sleep(0.001)
        currIteration += 1
        
    if len(seg_counts) > 3:
        return {'category': [seg_counts.loc[0, 'category']],
        'topSeg': [seg_counts.loc[0, 'topSeg']], 
        'secondSeg': [seg_counts.loc[1, 'topSeg']],
        'thirdSeg': [seg_counts.loc[2, 'topSeg']],
        'fourthSeg': [seg_counts.loc[3, 'topSeg']]}
    elif len(seg_counts) > 2:
        return {'category': [seg_counts.loc[0, 'category']],
        'topSeg': [seg_counts.loc[0, 'topSeg']], 
        'secondSeg': [seg_counts.loc[1, 'topSeg']],
        'thirdSeg': [seg_counts.loc[2, 'topSeg']],
        'fourthSeg': ['null']}
    elif len(seg_counts) > 1:
        return {'category': [seg_counts.loc[0, 'category']],
        'topSeg': [seg_counts.loc[0, 'topSeg']], 
        'secondSeg': [seg_counts.loc[1, 'topSeg']],
        'thirdSeg': ['null'],
        'fourthSeg': ['null']}
    else:
        return {'category': [seg_counts.loc[0, 'category']],
        'topSeg': [seg_counts.loc[0, 'topSeg']], 
        'secondSeg': ['null'],
        'thirdSeg': ['null'],
        'fourthSeg': ['null']}

In [8]:
df = {'category':[], 'topWord':[], 'secondWord':[], 'thirdWord':[], 'fourthWord':[], 'proportionOfAllWords':[]}
df = pd.DataFrame(df)

for i in range(len(categories)):
    row = pd.DataFrame(findWords(trainDF, categories[i], (i+1), len(categories)))
    df = pd.concat([df, row], ignore_index=True)

100.0% done, category: 39/39


In [144]:
findSegments(trainDF, categories[3], 1, 1)

100.0% done, category: 1/1


{'category': ['VEHICLE THEFT'],
 'topSeg': ['vehicle recovered'],
 'secondSeg': ['recovered motorcycle'],
 'thirdSeg': ['stolen truck'],
 'fourthSeg': ['stolen and']}

In [138]:
segments_df = {'category':[], 'topSeg':[], 'secondSeg':[], 'thirdSeg':[]}
segments_df = pd.DataFrame()

for i in range(len(categories)):
    row = pd.DataFrame(findSegments(trainDF, categories[i], (i+1), len(categories)))
    df = pd.concat([df, row], ignore_index=True)

100.0% done, category: 39/39


In [139]:
segments_df

In [92]:
def row_Matches(df, rowToCheck):
    numOfMatches = 0
    for i in range(len(df)):
        if rowToCheck['category'] == df.loc[i, 'category']:
            numOfMatches += 1
    if numOfMatches > 0:
        return True
    else:
        return False

def clasify(unknown, tree):
    words = unknown['Descript'].split()
    possibleCats = pd.DataFrame()
    for word in words:
        row1 = tree.loc[tree.loc[:, 'topWord'] == word]
        possibleCats = pd.concat([possibleCats, row1], ignore_index=True).reset_index(drop=True)
        row2 = tree.loc[tree.loc[:, 'secondWord'] == word]
        if not row2.empty and not row_Matches(possibleCats, row2.iloc[0]):  # Check if row2 is not empty before accessing its first row
            possibleCats = pd.concat([possibleCats, row2], ignore_index=True).reset_index(drop=True)
        row3 = tree.loc[tree.loc[:, 'thirdWord'] == word]
        if not row3.empty and not row_Matches(possibleCats, row3.iloc[0]):  # Check if row3 is not empty before accessing its first row
            possibleCats = pd.concat([possibleCats, row3], ignore_index=True).reset_index(drop=True)
        row4 = tree.loc[tree.loc[:, 'fourthWord'] == word]
        if not row4.empty and not row_Matches(possibleCats, row4.iloc[0]):  # Check if row3 is not empty before accessing its first row
            possibleCats = pd.concat([possibleCats, row4], ignore_index=True).reset_index(drop=True)
            
    wordMatches = []
    for i in range(len(possibleCats)):
        matches = 0
        if possibleCats.loc[i, 'topWord'] in words:
            matches += 1
        if possibleCats.loc[i, 'secondWord'] in words:
            matches += 1
        if possibleCats.loc[i, 'thirdWord'] in words:
            matches += 1
        if possibleCats.loc[i, 'fourthWord'] in words:
            matches += 1
        wordMatches.append(matches)
        
    if len(possibleCats) > 1:
        maxValue = max(wordMatches)
        maxIDX = wordMatches.index(maxValue)
        possibleCats = possibleCats.iloc[[maxIDX]].reset_index(drop=True)
        
        return possibleCats.loc[0, 'category']
    
    if len(possibleCats) > 0:
        possibleCats = possibleCats.reset_index(drop=True)
        return possibleCats.loc[0, 'category']
    else:
        return 'unsure'

In [91]:
answers = pd.DataFrame()
for i in range(len(testDF)):
    x = clasify(testDF.iloc[i], df)
    y = 0
    if x == testDF.loc[i, 'Category']:
        y = 1
    row = pd.DataFrame({'actual': [testDF.loc[i, 'Category']], 'predicted': [x], 'correct': [y]})
    answers = pd.concat([answers, row], ignore_index=True).reset_index(drop=True)
    
    percentageDone = round((i+1)/len(testDF)*100,2)
    time.sleep(0.001)
    clear_output(wait=True)
    print(str(percentageDone) + '% done, set 1/2')
    
answers_excluded = pd.DataFrame()    
for i in range(len(testDF)):
    x = clasify(testDF.iloc[i], df_excluded)
    y = 0
    if x == testDF.loc[i, 'Category']:
        y = 1
    row = pd.DataFrame({'actual': [testDF.loc[i, 'Category']], 'predicted': [x], 'correct': [y]})
    answers_excluded = pd.concat([answers_excluded, row], ignore_index=True).reset_index(drop=True)
    
    percentageDone = round((i+1)/len(testDF)*100,2)
    time.sleep(0.001)
    clear_output(wait=True)
    print(str(percentageDone) + '% done, set 2/2')
    

3.44% done, set 1/2


KeyboardInterrupt: 

In [13]:
percentCorrect = (answers['correct'].sum()/len(answers))*100
percentCorrect

82.96

In [14]:
percentCorrect_excluded = (answers_excluded['correct'].sum()/len(answers_excluded))*100
percentCorrect_excluded

83.48

In [116]:
def findCats(df, df_1, words, wordToCheck):
    for i in range(len(df)):
        if df.loc[i, wordToCheck] in words:
            row = df.iloc[[i]]
            if not row_Matches(df_1, row.iloc[0]):
                df_1 = pd.concat([df_1, row], ignore_index=True).reset_index(drop=True)
    df = df_1
    return df
        
def tree(unknownDescript, df):
    words = unknownDescript.split()
    words.append('null')
    
    possibleCats = pd.DataFrame()
    possibleCats_1 = pd.DataFrame()
    
    for i in range(len(df)):
        if df.loc[i, 'topWord'] in words:
            row = df.iloc[[i]]
            if not row_Matches(possibleCats, row.iloc[0]):
                possibleCats = pd.concat([possibleCats, row], ignore_index=True).reset_index(drop=True)
#         if df.loc[i, 'secondWord'] in words:
#             row = df.iloc[[i]]
#             if not row_Matches(possibleCats, row.iloc[0]):
#                 possibleCats = pd.concat([possibleCats, row], ignore_index=True).reset_index(drop=True)
#         if df.loc[i, 'thirdWord'] in words:
#             row = df.iloc[[i]]
#             if not row_Matches(possibleCats, row.iloc[0]):
#                 possibleCats = pd.concat([possibleCats, row], ignore_index=True).reset_index(drop=True)
            
#     print(possibleCats)
                
    if len(possibleCats) == 1:
        return possibleCats.loc[0, 'category']
    if len(possibleCats) > 0:
        possibleCats_1 = pd.DataFrame()
        possibleCats = findCats(possibleCats, possibleCats_1, words, 'secondWord')
        
#     print('-----------------------------')
#     print(possibleCats)
        
    if len(possibleCats) == 1:
        return possibleCats.loc[0, 'category']
    if len(possibleCats) > 0:
        possibleCats_1 = pd.DataFrame()
        possibleCats = findCats(possibleCats, possibleCats_1, words, 'thirdWord')
        
#     print('-----------------------------')
#     print(possibleCats)
        
    if len(possibleCats) == 1:
        return possibleCats.loc[0, 'category']
    if len(possibleCats) > 0:
        possibleCats_1 = pd.DataFrame()
        possibleCats = findCats(possibleCats, possibleCats_1, words, 'fourthWord')
        
#     print('-----------------------------')
#     print(possibleCats)
        
    if len(possibleCats) == 1:
        return possibleCats.loc[0, 'category']
    else:
        return 'unsure'

In [122]:
tree_answers = pd.DataFrame()    
for i in range(len(testDF)):
    x = tree(testDF.loc[i, 'Descript'], df_excluded)
    y = 0
    if x == testDF.loc[i, 'Category']:
        y = 1
    row = pd.DataFrame({'actual': [testDF.loc[i, 'Category']], 'predicted': [x], 'correct': [y]})
    tree_answers = pd.concat([tree_answers, row], ignore_index=True).reset_index(drop=True)
    
    percentageDone = round((i+1)/len(testDF)*100,2)
    time.sleep(0.001)
    clear_output(wait=True)
    print(str(percentageDone) + '% done')

100.0% done


In [123]:
percentCorrect_tree = (tree_answers['correct'].sum()/len(tree_answers))*100
percentCorrect_tree

68.22

In [120]:
tree_answers.head(20)

Unnamed: 0,actual,predicted,correct
0,LARCENY/THEFT,LARCENY/THEFT,1
1,LARCENY/THEFT,LARCENY/THEFT,1
2,FRAUD,FRAUD,1
3,PROSTITUTION,PROSTITUTION,1
4,DRUG/NARCOTIC,DRUNKENNESS,0
5,VANDALISM,VANDALISM,1
6,MISSING PERSON,unsure,0
7,VANDALISM,VANDALISM,1
8,OTHER OFFENSES,unsure,0
9,LARCENY/THEFT,LARCENY/THEFT,1


In [108]:
tree(testDF.loc[6, 'Descript'], df_excluded)

Empty DataFrame
Columns: []
Index: []
-----------------------------
Empty DataFrame
Columns: []
Index: []
-----------------------------
Empty DataFrame
Columns: []
Index: []
-----------------------------
Empty DataFrame
Columns: []
Index: []


'unsure'

In [121]:
testDF.iloc[[4]]

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
4,2003-09-06 18:57:00,DRUG/NARCOTIC,under influence of drugs in a public place,Saturday,CENTRAL,"ARREST, BOOKED",200 Block of GEARY ST,-122.407435,37.787516
