# Topic Modeling: Cleaning/Pre-Processing

### References/Useful Links:
NLTK Documentation: https://www.nltk.org/api/nltk.html?highlight=nltk%20text%20text  
Tutorial on Tokenization: https://www.guru99.com/tokenize-words-sentences-nltk.html  
NLTK List of English Stopwords: https://gist.github.com/sebleier/554280  
Regular Expression Documentation: https://docs.python.org/3/library/re.html  
See also LDA_tests.ipynb for tutorials on prepping texts for LDA.  

## 1. Import Libraries

In [129]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint

In [130]:
# import NLP libraries for preprocessing
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

## 2. Import Data
1. Convert dataframe column with charge descriptions to list.
2. Pad with space (' ') on front and back of each description.

In [131]:
# Load the COMPAS two-year scores CSV straight from the ProPublica GitHub repository
# (needs network access). The commented-out path below is the local-file alternative.
# compas_path = 'compas-scores-two-years.csv'
url = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv'
compas_df = pd.read_csv(url)
# sample charge descriptions (last expression, so it is displayed as the cell output)
compas_df[['c_charge_desc']].head()

Unnamed: 0,c_charge_desc
0,Aggravated Assault w/Firearm
1,Felony Battery w/Prior Convict
2,Possession of Cocaine
3,Possession of Cannabis
4,arrest case no charge


In [184]:
# keep only the rows that actually have a charge description
compas_df_charge_filt = compas_df[compas_df['c_charge_desc'].notna()]
# pad every description with one space on each side so that the whole-word
# patterns of the replacement maps (e.g. ' agg ') also match the first and last word
charge_descs = [' ' + desc + ' ' for desc in compas_df_charge_filt['c_charge_desc']]
print(len(charge_descs))
charge_descs[:10]

7185


[' Aggravated Assault w/Firearm ',
 ' Felony Battery w/Prior Convict ',
 ' Possession of Cocaine ',
 ' Possession of Cannabis ',
 ' arrest case no charge ',
 ' Battery ',
 ' Possession Burglary Tools ',
 ' arrest case no charge ',
 ' Battery ',
 ' Insurance Fraud ']

## 3. Define Word Replacement Mappings

Build a dictionary that maps each replacement word to all of its misspellings and abbreviations as they appear in the charge descriptions.

In [185]:
# Mapping of replacement (str) --> misspellings/abbreviations (list of str).
# Keys and patterns carry their surrounding spaces so that only whole words match;
# entries are applied in insertion order, so the order of the keys matters.
# Fixes relative to the earlier version of this map:
#   * the duplicated ' control ' key was removed (a later duplicate silently overwrites the first)
#   * ' redilver ' was a typo for ' redeliver '
#   * ' possess,' had the comma inside the quotes, so it could never match once punctuation is stripped
#   * the duplicated ' scho ' pattern was removed
replacement_map = {
    ' accident ': [' acc ', ' accd '],
    ' aggravated ': [' agg ', ' aggr '],
    ' alter ': [' alt '],
    ' ammunition ': [' amm '],
    ' amphetamine ': [' amp '],
    ' attempt ': [' att ', ' attmp '],
    ' attend ': [' attnd '],
    ' battery ': [' batt ', ' bat '],
    ' beverage ': [' bev '],
    ' bodily ': [' bod '],
    # NOTE(review): 'burg ' has no leading space, so it also matches the end of any longer word -- confirm intended
    ' burglary ': ['burg ', ' burgl '],
    ' business ': [' busn '],
    ' cancel ': [' can '],
    # NOTE(review): ' countrfeit' has no trailing space, so it matches as a word prefix -- confirm intended
    ' counterfeit ': [' cntrft ', ' conterfeit ', ' contrft ', ' countrfeit'],
    ' commit ': [' com '],
    ' communication ': [' communic '],
    ' compensation ': [' compensatn '],
    ' conspiracy ': [' consp '],
    ' control ': [' cont ', ' contr '],
    ' conveyance ': [' conv ', ' conve '],
    ' convict ': [' convic '],
    ' credit ': [' cred '],
    ' criminal ': [' crim '],
    ' cruelty ': [' crlty '],
    ' custody ': [' cust '],
    ' dangerous ': [' dang '],
    ' degree ': [' deg '],
    ' delinquency ': [' delinq '],
    ' delivery ': [' deliv ', ' del '],
    ' device ': [' dev '],
    ' display ': [' disply '],
    ' disqualified ': [' disqul '],
    ' disorderly conduct ': [' doc '],
    ' dollars ': [' dols '],
    # NOTE(review): single-letter tokens are dropped by stem_tokens (len(token) > 2),
    # so 'dui' leaves no token behind -- confirm intended
    ' d u i ': [' dui '],
    ' domestic ': [' dom ', ' dome '],
    ' drive ': [' driv ', ' drivg '],
    ' dwelling ': [' dwell '],
    ' elderly ': [' elderlly '],
    ' employee ': [' emplyee '],
    ' enforcement ': [' enfor ', ' enforc '],
    ' exhibition ': [' exhib '],
    ' extinguisher ': [' extinquisher '],
    ' facilitate ': [' fac '],
    ' failure ': [' fail '],
    ' family ': [' faml '],
    ' felony ': [' fel '],
    ' felon ': [' felo '],
    ' firearm ': [' f arm '],
    ' fraud ': [' frd ', ' fraudul'],  # ' fraudul' is a prefix match (no trailing space)
    ' gambling ': [' gamb '],
    ' gram ': [' g '], # check
    ' great ': [' grt '],
    # NOTE(review): 'informnt ' has no leading space -- confirm intended
    ' informant ': ['informnt '],
    ' injunction ': [' inj ', ' injunc ', ' injunct ', ' injunctn '],
    ' instrument ': [' inst '],
    ' intent ': [' int '],
    ' interfere ': [' interf '],
    ' introduce ': [' intoduce '],
    ' lascivious ': [' lasc ', ' lasciv '],
    ' lease ': [' leas '],
    ' license ': [' lic ', ' licenc '],
    ' license tag ': [' lictag '],
    ' leave ': [' lve '],
    ' manufacture ': [' man ', ' mfr '],
    ' motor ': [' mot '],
    ' occupied ': [' occup ', ' occp '], # not to confuse with unoccupied -- need space
    ' offense ': [' offens ', ' offn '],
    ' operate ': [' oper ', ' opert '],
    ' permanent ': [' perm '],
    ' person ': [' pers ', ' persn ', ' persnl '],
    ' possession ': [' pos ', ' poss ', ' possess '],
    ' private ': [' priv '],
    ' promise ': [' promis '],
    ' property ': [' prop '],
    ' public ': [' pub '],
    ' purchase ': [' pur '],
    ' railroad ': [' rail '],
    ' redeliver ': [' redeliv '],
    ' revoke ': [' revk '],
    ' scene ': [' scen '],
    ' school ': [' ftsch ', ' scho '],
    ' sell ': [' sel '],
    ' sex ': [' sexual '], # for homogeneity
    ' solicit ': [' solic ', ' solict '],
    ' specialist ': [' speci '],
    ' strangulation ': [' strang '],
    ' structure ': [' struc ', ' struct '],
    ' substance ': [' sub ', ' subst ', ' substa '],
    ' sudden ': [' sudd '],
    ' suspended ': [' susp ', ' suspd '],
    ' traffick ': [' traf ', ' traff ', ' traffic '],
    ' transmit ': [' trans '],
    ' trespass ': [' tresspass '],
    ' trirail ': [' tri rail '], # make 1-wd (tri-rail specific to FLA)
    ' toward ': [' twrd '],
    ' unauthorized ': [' unauth '],
    ' uncovered ': [' uncov '],
    ' unlawful ': [' unl ', ' unlaw '],
    ' unoccupied ': [' unocc ', ' unoccup '],
    ' vehicle ': [' veh '],
    ' victim ': [' vict ', ' victm '],
    ' vehicle identification number ': [' vin '],
    ' violence ': [' viol ', ' vi '],
    ' weapon ': [' weap ', ' wep '],
}

# Expansions for common abbreviations (expansion --> list of abbreviations).
# NOTE(review): not referenced by the preprocessing cells shown in this notebook.
abbrev_map = {
    'law enforcement officer ': ['leo '],
    'driving under influence ': ['dui '],
    'driving while intoxicated ': ['dwi '],
    'driving while license suspended ': ['dwls '],
    'firearm ': ['f arm '],
    'vehicle identification number ': ['vin ']
}

In [186]:
# Expanded mapping of replacement (str) --> misspellings/abbreviations (list of str),
# extended with the abbreviations marked NEW / ADDED below.
# Keys and patterns carry their surrounding spaces so that only whole words match;
# entries are applied in insertion order, so the order of the keys matters.
# Fixes relative to the earlier version of this map:
#   * the duplicated ' control ' key was removed (a later duplicate silently overwrites the first)
#   * ' redilver ' was a typo for ' redeliver '
#   * the duplicated ' scho ' pattern was removed
r_replacement_map = {
    ' accident ': [' acc ', ' accd '],
    ' aggravated ': [' agg ', ' aggr '],
    ' alcohol ': [' alch '], # NEW
    ' alter ': [' alt '],
    ' ammunition ': [' amm '],
    ' amphetamine ': [' amp '],
    ' attempt ': [' att ', ' attmp '],
    ' attend ': [' attnd '],
    ' battery ': [' batt ', ' bat '],
    ' beverage ': [' bev '],
    ' bodily ': [' bod '],
    # NOTE(review): 'burg ' has no leading space, so it also matches the end of any longer word -- confirm intended
    ' burglary ': ['burg ', ' burgl '],
    ' business ': [' busn ', ' bus '], # ADDED bus
    ' cancel ': [' can '],
    ' church ': [' chur '], # NEW
    # NOTE(review): ' countrfeit' has no trailing space, so it matches as a word prefix -- confirm intended
    ' counterfeit ': [' cntrft ', ' conterfeit ', ' contrft ', ' countrfeit'],
    ' commit ': [' com '],
    ' communication ': [' communic '],
    ' compensation ': [' compensatn '],
    ' conspiracy ': [' consp '],
    ' control ': [' cont ', ' contr '],
    ' conveyance ': [' conv ', ' conve '],
    ' convict ': [' convic '],
    ' credit ': [' cred '],
    ' criminal ': [' crim ', ' crimin '], # ADDED crimin
    ' cruelty ': [' crlty '],
    ' custody ': [' cust '],
    ' damage ': [' damg '],
    ' dangerous ': [' dang '],
    ' defendant ': [' deft '], # NEW
    ' degree ': [' deg '],
    ' delinquency ': [' delinq '],
    ' delivery ': [' deliv ', ' del ', ' deliver '], # ADDED deliver
    ' depend ': [' depnd '], # NEW
    ' device ': [' dev '],
    ' display ': [' disply '],
    ' disqualified ': [' disqul '],
    ' disorderly conduct ': [' doc '],
    ' dollars ': [' dols '],
    # NOTE(review): single-letter tokens are dropped by stem_tokens (len(token) > 2),
    # so 'dui' leaves no token behind -- confirm intended
    ' d u i ': [' dui '],
    ' domestic ': [' dom ', ' dome '],
    ' drive ': [' driv ', ' drivg ', ' drv '], # ADDED drv
    ' dwelling ': [' dwell ', ' dwel '], # ADDED dwel
    ' electronic ': [' elec '], # NEW
    ' elderly ': [' elderlly '],
    ' employee ': [' emplyee '],
    ' enforcement ': [' enfor ', ' enforc '],
    ' engage ': [' eng '], # NEW
    ' establishment ': [' estab ', ' establishm '], # NEW
    ' exhibition ': [' exhib '],
    ' extinguisher ': [' extinquisher '],
    ' facilitate ': [' fac '],
    ' failure ': [' fail '],
    ' family ': [' faml '],
    ' felony ': [' fel '],
    ' felon ': [' felo '],
    ' firearm ': [' f arm '],
    # ' fraudul' is a prefix match; ' fraud ent ' mops up what it leaves of 'fraudulent'
    ' fraud ': [' frd ', ' fraudul', ' fraud ent '], # ADDED fraud ent
    ' gambling ': [' gamb '],
    ' gram ': [' g '], # check
    ' great ': [' grt '],
    ' hours ': [' hrs '], # NEW
    # NOTE(review): 'informnt ' has no leading space -- confirm intended
    ' informant ': ['informnt '],
    ' injunction ': [' inj ', ' injunc ', ' injunct ', ' injunctn '],
    ' instrument ': [' inst '],
    ' insurance ': [' insur '], # NEW
    ' intent ': [' int '],
    ' interfere ': [' interf ', ' intrf '], # ADDED intrf
    ' introduce ': [' intoduce '],
    ' lascivious ': [' lasc ', ' lasciv '],
    ' lease ': [' leas '],
    ' license ': [' lic ', ' licenc '],
    ' license tag ': [' lictag '],
    ' leave ': [' lve '],
    ' malicious ': [' malic '], # NEW
    ' manufacture ': [' man ', ' mfr ', ' mfg '], # ADDED mfg
    ' methadone ': [' methado '], # NEW
    ' minor ': [' min '], # NEW
    ' motor ': [' mot '],
    ' obtain ': [' obt '], # NEW
    ' occupied ': [' occup ', ' occp '],
    ' offense ': [' offens ', ' offn '],
    ' operate ': [' oper ', ' opert '],
    ' paraphernalia ': [' para '], # NEW
    ' pedestrian ': [' ped '], # NEW
    ' permanent ': [' perm '],
    ' person ': [' pers ', ' persn ', ' persnl ', ' prson '], # ADDED prson
    ' possession ': [' pos ', ' poss ', ' possess ', ' posses '],
    ' private ': [' priv '],
    ' promise ': [' promis '],
    ' property ': [' prop '],
    ' prostitute ': [' prostitut '], # NEW
    ' prostitution violation ': [' prostitutionviolation '], # NEW
    ' protect ': [' prot '], # NEW
    ' public ': [' pub '],
    ' purchase ': [' pur '],
    ' railroad ': [' rail '],
    ' receipt ': [' rcpt '],
    ' redeliver ': [' redeliv '],
    ' registration ': [' reg '], # NEW
    ' responsibility ': [' resp '], # NEW
    ' revoke ': [' revk '],
    ' scene ': [' scen '],
    ' school ': [' ftsch ', ' scho ', ' sch '], # ADDED sch
    ' sell ': [' sel '],
    ' sex ': [' sexual '], # for homogeneity
    ' shop ': [' shp '], # NEW
    ' solicit ': [' solic ', ' solict ', ' sol '], # ADDED sol
    ' specialist ': [' speci '],
    ' strangulation ': [' strang '],
    ' structure ': [' struc ', ' struct '],
    ' substance ': [' sub ', ' subst ', ' substa '],
    ' sudden ': [' sudd '],
    ' suspended ': [' susp ', ' suspd '],
    ' traffick ': [' traf ', ' traff ', ' traffic '],
    ' obstruct traffic ': [' obstruct traffick '], # NEW (acct for traffic/traffick diff)
    ' transmit ': [' trans '],
    ' trespass ': [' tresspass '],
    ' trirail ': [' tri rail '], # make 1-wd (tri-rail specific to FLA)
    ' toward ': [' twrd '],
    ' unauthorized ': [' unauth ', ' unauthorizd '], # ADDED unauthorizd
    ' uncovered ': [' uncov '],
    ' under ': [' und '], # NEW
    ' unlawful ': [' unl ', ' unlaw '],
    ' unoccupied ': [' unocc ', ' unoccup '],
    ' vehicle ': [' veh '],
    ' verification ': [' verif '], # NEW
    ' victim ': [' vict ', ' victm '],
    ' vehicle identification number ': [' vin '],
    ' violence ': [' viol ', ' vi '],
    ' weapon ': [' weap ', ' wep '],
    ' witness ': [' wit '], # NEW
    ' years ': [' yrs '] # NEW
}

# Expansions for common abbreviations (expansion --> list of abbreviations).
# NOTE(review): not referenced by the preprocessing cells shown in this notebook.
abbrev_map = {
    'law enforcement officer ': ['leo '],
    'driving under influence ': ['dui '],
    'driving while intoxicated ': ['dwi '],
    'driving while license suspended ': ['dwls '],
    'firearm ': ['f arm '],
    'vehicle identification number ': ['vin ']
}

## 4. Define Pre-processing Functions
1. **get_chars_to_rmv:** Build a regex character class of all non-alpha characters found in the descriptions (these are later replaced with spaces).
2. **check_desc:** Print the charge descriptions containing a certain token (for reference).
3. **clean_descs:** Replace chars_to_rmv with spaces, replace misspellings, and remove any excess whitespace.
4. **tokenize_descs:** Tokenize descriptions.
5. **stem_tokens:** Stem each token in each description (excluding English stop words and tokens < 3 chars in length).

In [187]:
# build a list of non-alpha chars to remove, which can be fed into a replacement function as a regex
def get_chars_to_rmv(charge_descs):
    """Build a regex character class of the non-alpha characters in the descriptions.

    Args:
        charge_descs: iterable of descriptions. Each description may be a plain
            string or a sequence of string tokens.

    Returns:
        A regex pattern string such as '[#$()+,\\-./0123456789<>]' that matches any
        single non-alphabetic, non-whitespace character found in the input.
        Characters that are special inside a character class (backslash, ']', '[',
        '^', '-') are escaped so they are matched literally. If no such character
        is found, the never-matching pattern '(?!)' is returned instead of the
        invalid pattern '[]'.
    """
    chars = set()
    for desc in charge_descs:
        # iterating a str yields single characters, iterating a token list yields
        # tokens; the inner loop handles both shapes
        for piece in desc:
            for c in piece:
                # whitespace is left out: it is already the replacement character
                if not c.isalpha() and not c.isspace():
                    chars.add(c)

    if not chars:
        # '[]' would not compile; '(?!)' is a valid pattern that never matches
        return '(?!)'

    special = set('\\]^-[')
    escaped = ''.join('\\' + c if c in special else c for c in sorted(chars))
    return '[' + escaped + ']'

In [188]:
# REMOVE WHEN DONE -- simply for checking full description of confusing tokens for context
def check_desc(keyword):
    """Print every description in the global ``charge_descs`` that contains ``keyword``.

    The match is a substring test against the lower-cased description, so
    ``keyword`` should be given in lower case. Debugging helper only.
    """
    matching = (desc for desc in charge_descs if keyword in desc.lower())
    for desc in matching:
        print(desc)

In [189]:
def clean_descs(chars_to_rmv, charge_descs, replacement_map):
    """Lower-case, strip non-alpha characters, and expand misspellings.

    Args:
        chars_to_rmv: regex pattern (e.g. from get_chars_to_rmv) matching the
            characters to replace with a space.
        charge_descs: iterable of description strings, expected to be padded with
            one space on each side so whole-word patterns match the first and
            last word.
        replacement_map: dict of replacement (str) --> list of misspellings (str).
            Entries are applied in the dict's iteration order.

    Returns:
        A list with one cleaned string per description, with runs of spaces
        collapsed to a single space.
    """
    # compile once instead of once per description
    non_alpha = re.compile(chars_to_rmv)
    extra_spaces = re.compile(' +')

    charge_descs_clean = []
    for desc in charge_descs:
        # remove non-alpha characters
        desc = non_alpha.sub(' ', desc.lower())
        # replace misspellings; they are literal strings, so plain str.replace is
        # used -- regex metacharacters in a misspelling must not be interpreted
        for repl, mispellings in replacement_map.items():
            for misp in mispellings:
                desc = desc.replace(misp, repl)
        # remove extra spaces
        charge_descs_clean.append(extra_spaces.sub(' ', desc))
    return charge_descs_clean

In [190]:
def tokenize_descs(charge_descs_clean):
    """Split each cleaned description into word tokens.

    Every description is lower-cased and passed to NLTK's ``word_tokenize``.
    Returns a list holding one token list per description, in input order.
    """
    tokenized_descs = []
    for desc in charge_descs_clean:
        tokens = word_tokenize(desc.lower())
        tokenized_descs.append(tokens)
    return tokenized_descs

In [191]:
def stem_tokens(tokenized_descs):
    """Stem the tokens of every description, dropping stop words and short tokens.

    Args:
        tokenized_descs: iterable of token lists (one list per description).

    Returns:
        A list with one list of stemmed tokens per description. Tokens of
        fewer than 3 characters and English stop words are excluded.
    """
    # make stemmer
    stemmer = SnowballStemmer(language='english')

    # english stop words as a set, for constant-time membership tests
    swds = set(stopwords.words('english'))

    # The previous version also built a (stem, token, description) triplet list
    # for debugging; it was never returned or read, so it has been removed.
    return [
        [stemmer.stem(token) for token in desc if len(token) > 2 and token not in swds]
        for desc in tokenized_descs
    ]

## 5. Preprocess First Charge Descriptions

In [192]:
# regex character class of the non-alpha characters present in the first charge descriptions
chars_to_rmv = get_chars_to_rmv(charge_descs)
# displayed as the cell output for inspection
chars_to_rmv

'[#$()+,-./0123456789<>]'

In [193]:
# clean descriptions (note: uses the expanded r_replacement_map, not replacement_map)
charge_descs_clean = clean_descs(chars_to_rmv, charge_descs, r_replacement_map)
# sample of cleaned charge descriptions
charge_descs_clean[:10]

[' aggravated assault w firearm ',
 ' felony battery w prior convict ',
 ' possession of cocaine ',
 ' possession of cannabis ',
 ' arrest case no charge ',
 ' battery ',
 ' possession burglary tools ',
 ' arrest case no charge ',
 ' battery ',
 ' insurance fraud ']

In [194]:
# tokenize each cleaned description into a list of word tokens
tokenized_descs = tokenize_descs(charge_descs_clean)
# sample of tokenized descriptions
tokenized_descs[:10]

[['aggravated', 'assault', 'w', 'firearm'],
 ['felony', 'battery', 'w', 'prior', 'convict'],
 ['possession', 'of', 'cocaine'],
 ['possession', 'of', 'cannabis'],
 ['arrest', 'case', 'no', 'charge'],
 ['battery'],
 ['possession', 'burglary', 'tools'],
 ['arrest', 'case', 'no', 'charge'],
 ['battery'],
 ['insurance', 'fraud']]

In [195]:
# stem and remove stopwords
tokenized_stemmed = stem_tokens(tokenized_descs)
# collapse each token list into a single comma-separated string
tokenized_stemmed = [', '.join(desc) for desc in tokenized_stemmed]
# sample of stemmed, tokenized descriptions
pprint(tokenized_stemmed[:10])

# make sorted list of unique tokens (vocabulary); the `if token` guard keeps the
# empty string out of the vocabulary when a description has no tokens left
flat_unq_tokens = sorted({token for desc in tokenized_stemmed for token in desc.split(', ') if token})
print('\nnumber of unique tokens (vocab length):', len(flat_unq_tokens))
# the descriptions are already strings here, so they can be de-duplicated directly
# (joining them again would only interleave spaces between their characters)
print('number of unique charge descriptions:', len(set(tokenized_stemmed)))

# sample of the vocabulary
flat_unq_tokens[:10]

['aggrav, assault, firearm',
 'feloni, batteri, prior, convict',
 'possess, cocain',
 'possess, cannabi',
 'arrest, case, charg',
 'batteri',
 'possess, burglari, tool',
 'arrest, case, charg',
 'batteri',
 'insur, fraud']

number of unique tokens (vocab length): 421
number of unique charge descriptions: 406


['abet',
 'abus',
 'accessori',
 'accid',
 'act',
 'actual',
 'adult',
 'aggrav',
 'aggress',
 'agre']

In [144]:
# Pair each row's id and name with its raw and its preprocessed charge description.
# NOTE(review): tokenized_stemmed is a plain list while the other columns are Series taken from
# compas_df_charge_filt; it must hold exactly one entry per filtered row, in the same order -- confirm
d = {'id': compas_df_charge_filt['id'], 'name': compas_df_charge_filt['name'], 'charge description': compas_df_charge_filt['c_charge_desc'], 
     'tokenized_stemmed description': tokenized_stemmed}
charge_mappings = pd.DataFrame(data=d)

In [145]:
# preview the first rows of the id / name / description mapping
charge_mappings.head()

Unnamed: 0,id,name,charge description,tokenized_stemmed description
0,1,miguel hernandez,Aggravated Assault w/Firearm,"aggrav, assault, firearm"
1,3,kevon dixon,Felony Battery w/Prior Convict,"feloni, batteri, prior, convict"
2,4,ed philo,Possession of Cocaine,"possess, cocain"
3,5,marcu brown,Possession of Cannabis,"possess, cannabi"
4,6,bouthy pierrelouis,arrest case no charge,"arrest, case, charg"


## 6. Preprocess Second Charge Descriptions

In [146]:
# keep only the rows that actually have an r_charge description
r_compas_df_charge_filt = compas_df[compas_df['r_charge_desc'].notna()]
# pad every description with one space on each side so that the whole-word
# patterns of the replacement maps also match the first and last word
r_charge_descs = [' ' + desc + ' ' for desc in r_compas_df_charge_filt['r_charge_desc']]
print(len(r_charge_descs))
r_charge_descs[:10]

3413


[' Felony Battery (Dom Strang) ',
 ' Driving Under The Influence ',
 ' Poss of Firearm by Convic Felo ',
 ' Battery ',
 ' Driving License Suspended ',
 ' Grand Theft (Motor Vehicle) ',
 ' Criminal Mischief>$200<$1000 ',
 ' Grand Theft in the 3rd Degree ',
 ' Possession of Cocaine ',
 ' Poss Cocaine/Intent To Del/Sel ']

In [147]:
# get chars to remove from r crimes
r_chars_to_rmv = get_chars_to_rmv(r_charge_descs)
r_chars_to_rmv

'["$()+,-./0123456789<>]'

In [148]:
# clean the r_charge descriptions with the expanded replacement map
r_charge_descs_clean = clean_descs(r_chars_to_rmv, r_charge_descs, r_replacement_map)
# sample of cleaned charge descriptions
r_charge_descs_clean[:20]

[' felony battery domestic strangulation ',
 ' driving under the influence ',
 ' possession of firearm by convict felon ',
 ' battery ',
 ' driving license suspended ',
 ' grand theft motor vehicle ',
 ' criminal mischief ',
 ' grand theft in the rd degree ',
 ' possession of cocaine ',
 ' possession cocaine intent to delivery sell ',
 ' prowling loitering ',
 ' operating w o valid license ',
 ' possession cannabis grams or less ',
 ' driving license suspended ',
 ' possession cannabis grams or less ',
 ' false imprisonment ',
 ' grand theft motor vehicle ',
 ' resist obstruct w o violence ',
 ' possession cannabis grams or less ',
 ' grand theft in the rd degree ']

In [149]:
# tokenize each cleaned r_charge description into a list of word tokens
r_tokenized_descs = tokenize_descs(r_charge_descs_clean)
# sample of tokenized descriptions
r_tokenized_descs[:10]

[['felony', 'battery', 'domestic', 'strangulation'],
 ['driving', 'under', 'the', 'influence'],
 ['possession', 'of', 'firearm', 'by', 'convict', 'felon'],
 ['battery'],
 ['driving', 'license', 'suspended'],
 ['grand', 'theft', 'motor', 'vehicle'],
 ['criminal', 'mischief'],
 ['grand', 'theft', 'in', 'the', 'rd', 'degree'],
 ['possession', 'of', 'cocaine'],
 ['possession', 'cocaine', 'intent', 'to', 'delivery', 'sell']]

In [150]:
# stem each token in each description & remove stop words
r_tokenized_stemmed = stem_tokens(r_tokenized_descs)
# collapse each token list into a single comma-separated string
r_tokenized_stemmed = [', '.join(desc) for desc in r_tokenized_stemmed]
# sample of stemmed, tokenized descriptions
print(r_tokenized_stemmed[:10])

['feloni, batteri, domest, strangul', 'drive, influenc', 'possess, firearm, convict, felon', 'batteri', 'drive, licens, suspend', 'grand, theft, motor, vehicl', 'crimin, mischief', 'grand, theft, degre', 'possess, cocain', 'possess, cocain, intent, deliveri, sell']


In [151]:
# Pair each row's id and name with its raw and its preprocessed r_charge description.
# NOTE(review): r_tokenized_stemmed is a plain list while the other columns are Series taken from
# r_compas_df_charge_filt; it must hold exactly one entry per filtered row, in the same order -- confirm
d = {'id': r_compas_df_charge_filt['id'], 'name': r_compas_df_charge_filt['name'], 
     'charge description': r_compas_df_charge_filt['r_charge_desc'], 
     'r_tokenized_stemmed description': r_tokenized_stemmed}
r_charge_mappings = pd.DataFrame(data=d)
# displayed as the cell output
r_charge_mappings

Unnamed: 0,id,name,charge description,r_tokenized_stemmed description
1,3,kevon dixon,Felony Battery (Dom Strang),"feloni, batteri, domest, strangul"
2,4,ed philo,Driving Under The Influence,"drive, influenc"
6,8,edward riddle,Poss of Firearm by Convic Felo,"possess, firearm, convict, felon"
9,13,bo bradac,Battery,batteri
11,15,ellyaher lanza,Driving License Suspended,"drive, licens, suspend"
...,...,...,...,...
7201,10985,kyle miller,Operating W/O Valid License,"oper, valid, licens"
7205,10990,christopher tun,Assault,assault
7206,10992,alexander vega,Possess Cannabis/20 Grams Or Less,"possess, cannabi, gram, less"
7207,10994,jarred payne,Possession of Cannabis,"possess, cannabi"


## 7. Prepare for Common Token Analysis / Visualization

In [152]:
# report the size of every dataframe slice before merging
print('# rows total:', len(compas_df))
print('# rows where c_charge is non-NaN:', len(charge_mappings))
print('# rows where r_charge is non-NaN:', len(r_charge_mappings))
# boolean mask of the rows where c_charge and r_charge are both present
cr_idx = compas_df['c_charge_desc'].notna() & compas_df['r_charge_desc'].notna()
# summing a boolean mask counts its True entries
print('# rows where c_charge and r_charge both non-NaN:', int(cr_idx.sum()))

# rows total: 7214
# rows where c_charge is non-NaN: 7185
# rows where r_charge is non-NaN: 3413
# rows where c_charge and r_charge both non-NaN: 3393


In [153]:
# attach the preprocessed c_charge descriptions to the full dataframe by id;
# how='outer' keeps rows without a charge description (their new column is NaN)
c_df_merged = compas_df.merge(charge_mappings[['id', 'tokenized_stemmed description']], how='outer', on='id')

In [154]:
# attach the preprocessed r_charge descriptions as well, again by id;
# how='outer' keeps rows without an r_charge description (their new column is NaN)
cr_df_merged = c_df_merged.merge(r_charge_mappings[['id', 'r_tokenized_stemmed description']], how='outer', on='id')

In [155]:
# keep only the identifying columns, the raw and preprocessed charge descriptions, race and the recidivism flag
merged_filt = cr_df_merged[['id', 'name', 'race', 'c_charge_desc', 'tokenized_stemmed description', 'two_year_recid', 'r_charge_desc', 'r_tokenized_stemmed description']]
# preview the first rows
merged_filt.head()

Unnamed: 0,id,name,race,c_charge_desc,tokenized_stemmed description,two_year_recid,r_charge_desc,r_tokenized_stemmed description
0,1,miguel hernandez,Other,Aggravated Assault w/Firearm,"aggrav, assault, firearm",0,,
1,3,kevon dixon,African-American,Felony Battery w/Prior Convict,"feloni, batteri, prior, convict",1,Felony Battery (Dom Strang),"feloni, batteri, domest, strangul"
2,4,ed philo,African-American,Possession of Cocaine,"possess, cocain",1,Driving Under The Influence,"drive, influenc"
3,5,marcu brown,African-American,Possession of Cannabis,"possess, cannabi",0,,
4,6,bouthy pierrelouis,Other,arrest case no charge,"arrest, case, charg",0,,


In [156]:
# write the merged table to the working directory.
# NOTE(review): the dataframe index is written as an unnamed first column (pandas default);
# pass index=False if downstream readers should not see it -- confirm
merged_filt.to_csv('merged_filt_NEW.csv')