**Install openclean.**

In [None]:
pip install openclean-core

**Cloning the data from github repo.**

In [None]:
import os
git_folder = 'NYC-Crime'
if not os.path.isdir(git_folder):
  !git clone https://github.com/duketran1996/NYC-Crime.git
else:
  %cd NYC-Crime/ 
  !git pull
  %cd ..

Cloning into 'NYC-Crime'...
remote: Enumerating objects: 152, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (117/117), done.[K
remote: Total 152 (delta 73), reused 82 (delta 26), pack-reused 0[K
Receiving objects: 100% (152/152), 61.62 MiB | 2.37 MiB/s, done.
Resolving deltas: 100% (73/73), done.
Checking out files: 100% (22/22), done.


**Important imports. Run this cell before executing the rest of the notebook.**

In [None]:
from openclean.cluster.knn import knn_clusters, knn_collision_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.token.ngram import NGrams
from openclean.function.value.threshold import GreaterThan
from openclean.operator.transform.update import update

**Data study: List number of columns.**

In [None]:
from openclean.pipeline import stream

# Open the 2016 arrests CSV as an openclean data stream.
datafile = './NYC-Crime/sub-dataset/nypd_arrests_data_2016.csv'
ds = stream(datafile)

# List the column names of the stream.
print('Schema\n------')
for col in ds.columns:
    print("  '{}'".format(col))

# Count the rows once and reuse the value in both messages
# (presumably every ds.count() call scans the stream again — TODO confirm).
row_count = ds.count()

print('\n{} rows.'.format(row_count))
print("There are {} rows and {} columns in the dataset.".format(row_count, len(ds.columns)))

Schema
------
  'ARREST_KEY'
  'ARREST_DATE'
  'PD_CD'
  'PD_DESC'
  'KY_CD'
  'OFNS_DESC'
  'LAW_CODE'
  'LAW_CAT_CD'
  'ARREST_BORO'
  'ARREST_PRECINCT'
  'JURISDICTION_CODE'
  'AGE_GROUP'
  'PERP_SEX'
  'PERP_RACE'
  'X_COORD_CD'
  'Y_COORD_CD'
  'Latitude'
  'Longitude'
  'Lon_Lat'

314866 rows.
There are 314866 rows and 19 columns in the dataset.


**Data study: Profile a sample of 10,000 rows to detect issues.**

In [None]:
from openclean.profiling.column import DefaultColumnProfiler

# Take a 10000-row sample with a fixed random_state so the sample is repeatable,
# then profile it with the default column profiler.
profile_sample = ds.sample(n=10000, random_state=42)
profiles = profile_sample.profile(default_profiler=DefaultColumnProfiler)

In [None]:
# Show the per-column statistics table computed from the profiled sample.
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
ARREST_KEY,10000,0,10000,1.0,13.287712
ARREST_DATE,10000,0,366,0.0366,8.446438
PD_CD,10000,0,179,0.0179,5.474215
PD_DESC,10000,32,182,0.018258,5.518198
KY_CD,10000,32,61,0.00612,4.625972
OFNS_DESC,10000,32,56,0.005618,4.393368
LAW_CODE,10000,0,439,0.0439,6.292323
LAW_CAT_CD,10000,47,4,0.000402,1.117516
ARREST_BORO,10000,0,5,0.0005,2.137182
ARREST_PRECINCT,10000,0,77,0.0077,6.081293


**Data study: Perform a scan to check age group. No issues found in the age groups themselves; note that the header name `AGE_GROUP` also shows up as a value.**

In [None]:
# Distinct AGE_GROUP values, printed one per line for a visual check.
# Stored under a descriptive name instead of the misleading `date`.
age_groups = ds.distinct('AGE_GROUP')
for age_group in age_groups:
    print(age_group)

25-44
18-24
45-64
<18
65+
AGE_GROUP


**Data study: Perform a scan to check the date format (MM/DD/YYYY). No issues found in the dates; the only value printed is the header name `ARREST_DATE`.**

In [None]:
import datetime


def validate(date_text):
    """Check that ``date_text`` is a date written as MM/DD/YYYY.

    A value that does not parse is printed so it can be inspected.

    Args:
        date_text: the value to check.

    Returns:
        True when the value parses with '%m/%d/%Y', False otherwise.
    """
    try:
        datetime.datetime.strptime(date_text, '%m/%d/%Y')
    except (ValueError, TypeError):
        # TypeError covers non-string values (e.g. None), which strptime rejects.
        print(date_text)
        return False
    return True


# Run the check over every distinct ARREST_DATE value in the stream.
arrest_dates = ds.distinct('ARREST_DATE')
for arrest_date in arrest_dates:
    validate(arrest_date)

ARREST_DATE


**Convert to data frame for fixing issues.**

In [None]:
# Convert the stream to a data frame; the fixes in the following cells are applied to `fix`.
fix = ds.to_df()

**Data issues: The OFNS_DESC column contains many duplicated and misspelled values that need to be merged and fixed. This matters because we later want to categorize offenses and compute statistics on them.**

In [None]:
# Distinct offense descriptions to be clustered.
offense = ds.select('OFNS_DESC').distinct()

# Similarity rule: the LevenshteinDistance score must satisfy GreaterThan(0.7).
offense_similarity = SimilarityConstraint(
    func=LevenshteinDistance(),
    pred=GreaterThan(0.7),
)

clusters = knn_clusters(
    values=offense,
    sim=offense_similarity,
    tokenizer=NGrams(n=4),
    minsize=2,
)

# Print each cluster of similar spellings.
for cluster in clusters:
    print(cluster)

Cluster({'CRIMINAL MISCHIEF & RELATED OFFENSES': 10935, 'CRIMINAL MISCHIEF & RELATED OF': 201})
Cluster({'INTOXICATED & IMPAIRED DRIVING': 6430, 'INTOXICATED/IMPAIRED DRIVING': 914})
Cluster({'POSSESSION OF STOLEN PROPERTY 5': 1221, 'POSSESSION OF STOLEN PROPERTY': 1013})
Cluster({'OTHER STATE LAWS (NON PENAL LAW)': 2786, 'OTHER STATE LAWS (NON PENAL LA': 622})
Cluster({'CHILD ABANDONMENT/NON SUPPORT 1': 29, 'CHILD ABANDONMENT/NON SUPPORT': 6})
Cluster({'ADMINISTRATIVE CODE': 3994, 'ADMINISTRATIVE CODES': 32})


**Data issues: Show RELATED OFFENSES spellings.**

In [None]:
offense = ds.select(['OFNS_DESC']).distinct()

# Keep every distinct offense description that mentions CRIMINAL MISCHIEF, then list them.
offense_val = [desc for desc in offense if 'CRIMINAL MISCHIEF' in desc]
for desc in offense_val:
    print(desc)

CRIMINAL MISCHIEF & RELATED OFFENSES
CRIMINAL MISCHIEF & RELATED OF


**Data fix: Change CRIMINAL MISCHIEF & RELATED OF to CRIMINAL MISCHIEF & RELATED OFFENSES**

In [None]:
# Replacement table: truncated label -> full offense description.
offense_dict = dict()
offense_dict['CRIMINAL MISCHIEF & RELATED OF'] = 'CRIMINAL MISCHIEF & RELATED OFFENSES'

# Apply the replacement to the OFNS_DESC column.
fix = update(fix, columns='OFNS_DESC', func=offense_dict)

**Data fixed test: Test RELATED OFFENSES spellings**

In [None]:
# Rows whose offense description is one of the CRIMINAL MISCHIEF variants collected earlier.
mischief_mask = fix['OFNS_DESC'].isin(offense_val)
check_fix = fix.loc[mischief_mask]

# Every selected row must now carry the full spelling.
mischief_labels = check_fix['OFNS_DESC']
assert all(mischief_labels == 'CRIMINAL MISCHIEF & RELATED OFFENSES'), "RELATED OFFENSES spelling is not fixed: " + mischief_labels.unique()
print("Successfully fixed: " + mischief_labels.unique())

['Successfully fixed: CRIMINAL MISCHIEF & RELATED OFFENSES']


**Data issues: Show INTOXICATED spellings.**

In [None]:
offense = ds.select('OFNS_DESC').distinct()

# Keep every distinct offense description that mentions INTOXICATED, then list them.
tox_val = [desc for desc in offense if 'INTOXICATED' in desc]
for desc in tox_val:
    print(desc)

INTOXICATED/IMPAIRED DRIVING
INTOXICATED & IMPAIRED DRIVING


**Data fix: Change INTOXICATED & IMPAIRED DRIVING, INTOXICATED/IMPAIRED DRIVING to INTOXICATED AND IMPAIRED DRIVING**

In [None]:
# Both existing spellings are replaced by the same label.
tox_dict = {
    variant: 'INTOXICATED AND IMPAIRED DRIVING'
    for variant in ('INTOXICATED & IMPAIRED DRIVING', 'INTOXICATED/IMPAIRED DRIVING')
}

# Apply the replacement to the OFNS_DESC column.
fix = update(fix, columns='OFNS_DESC', func=tox_dict)

**Data fixed test: Test INTOXICATED spellings**

In [None]:
# Select rows holding either the new label or one of the spellings collected earlier.
tox_labels = ['INTOXICATED AND IMPAIRED DRIVING'] + tox_val
check_fix = fix.loc[fix['OFNS_DESC'].isin(tox_labels)]

# Only the new label may remain among them.
tox_found = check_fix['OFNS_DESC']
assert all(tox_found == 'INTOXICATED AND IMPAIRED DRIVING'), "INTOXICATED spelling is not fixed: " + tox_found.unique()
print("Successfully fixed: " + tox_found.unique())

['Successfully fixed: INTOXICATED AND IMPAIRED DRIVING']


**Data issues: Show POSSESSION OF STOLEN PROPERTY spellings.**

In [None]:
offense = ds.select('OFNS_DESC').distinct()

# Keep every distinct offense description that mentions POSSESSION, then list them.
stolen_val = [desc for desc in offense if 'POSSESSION' in desc]
for desc in stolen_val:
    print(desc)

POSSESSION OF STOLEN PROPERTY 5
POSSESSION OF STOLEN PROPERTY


**Data fix: Change POSSESSION OF STOLEN PROPERTY 5 to POSSESSION OF STOLEN PROPERTY**

In [None]:
# Replacement table for the stolen-property label: drop the trailing " 5".
# It has its own name so it no longer overwrites `tox_dict`, the INTOXICATED mapping.
stolen_dict = {
    'POSSESSION OF STOLEN PROPERTY 5': 'POSSESSION OF STOLEN PROPERTY'
}

# Apply the replacement to the OFNS_DESC column.
fix = update(fix, columns='OFNS_DESC', func=stolen_dict)

**Data fixed test: Test POSSESSION OF STOLEN PROPERTY 5 spellings**

In [None]:
# Rows whose offense description is one of the POSSESSION variants collected earlier.
stolen_mask = fix['OFNS_DESC'].isin(stolen_val)
check_fix = fix.loc[stolen_mask]

# Every selected row must now carry the label without the trailing " 5".
stolen_found = check_fix['OFNS_DESC']
assert all(stolen_found == 'POSSESSION OF STOLEN PROPERTY'), "POSSESSION OF STOLEN PROPERTY spelling is not fixed: " + stolen_found.unique()
print("Successfully fixed: " + stolen_found.unique())

['Successfully fixed: POSSESSION OF STOLEN PROPERTY']


**Data issues: Show OTHER STATE LAWS (NON PENAL LAW) spellings.**

In [None]:
offense = ds.select('OFNS_DESC').distinct()

# Keep every distinct offense description that mentions NON PENAL, then list them.
penal_val = [desc for desc in offense if 'NON PENAL' in desc]
for desc in penal_val:
    print(desc)

OTHER STATE LAWS (NON PENAL LAW)
OTHER STATE LAWS (NON PENAL LA


**Data fix: Change OTHER STATE LAWS (NON PENAL LA to OTHER STATE LAWS (NON PENAL LAW)**

In [None]:
# Replacement table: truncated label -> full label with the closing "W)".
penal_dict = dict()
penal_dict['OTHER STATE LAWS (NON PENAL LA'] = 'OTHER STATE LAWS (NON PENAL LAW)'

# Apply the replacement to the OFNS_DESC column.
fix = update(fix, columns='OFNS_DESC', func=penal_dict)

**Data fixed test: Test OTHER STATE LAWS (NON PENAL LAW) spellings**

In [None]:
# Rows whose offense description is one of the NON PENAL variants collected earlier.
penal_mask = fix['OFNS_DESC'].isin(penal_val)
check_fix = fix.loc[penal_mask]

# Every selected row must now carry the full label.
penal_found = check_fix['OFNS_DESC']
assert all(penal_found == 'OTHER STATE LAWS (NON PENAL LAW)'), "OTHER STATE LAWS (NON PENAL LAW) spelling is not fixed: " + penal_found.unique()
print("Successfully fixed: " + penal_found.unique())

['Successfully fixed: OTHER STATE LAWS (NON PENAL LAW)']


**Data issues: Show CHILD ABANDONMENT/NON SUPPORT spellings.**

In [None]:
offense = ds.select(['OFNS_DESC']).distinct()

# Keep every distinct offense description that mentions CHILD ABANDONMENT, then list them.
child_val = [desc for desc in offense if 'CHILD ABANDONMENT' in desc]
for desc in child_val:
    print(desc)

CHILD ABANDONMENT/NON SUPPORT
CHILD ABANDONMENT/NON SUPPORT 1


**Data fix: Change CHILD ABANDONMENT/NON SUPPORT 1 to CHILD ABANDONMENT/NON SUPPORT**

In [None]:
# Replacement table: drop the trailing " 1" from the label.
child_dict = dict()
child_dict['CHILD ABANDONMENT/NON SUPPORT 1'] = 'CHILD ABANDONMENT/NON SUPPORT'

# Apply the replacement to the OFNS_DESC column.
fix = update(fix, columns='OFNS_DESC', func=child_dict)

**Data fixed test: Test CHILD ABANDONMENT/NON SUPPORT spellings**

In [None]:
# Rows whose offense description is one of the CHILD ABANDONMENT variants collected earlier.
child_mask = fix['OFNS_DESC'].isin(child_val)
check_fix = fix.loc[child_mask]

# Every selected row must now carry the label without the trailing " 1".
child_found = check_fix['OFNS_DESC']
assert all(child_found == 'CHILD ABANDONMENT/NON SUPPORT'), "CHILD ABANDONMENT/NON SUPPORT spelling is not fixed: " + child_found.unique()
print("Successfully fixed: " + child_found.unique())

['Successfully fixed: CHILD ABANDONMENT/NON SUPPORT']


**Data issues: Show ADMINISTRATIVE spellings.**

In [None]:
offense = ds.select('OFNS_DESC').distinct()

# Keep every distinct offense description that mentions ADMINISTRATIVE, then list them.
administrative_val = [desc for desc in offense if 'ADMINISTRATIVE' in desc]
for desc in administrative_val:
    print(desc)

ADMINISTRATIVE CODE
ADMINISTRATIVE CODES


**Data fix: Change ADMINISTRATIVE CODES to ADMINISTRATIVE CODE**

In [None]:
# Replacement table: plural label -> singular label.
ad_dict = dict()
ad_dict['ADMINISTRATIVE CODES'] = 'ADMINISTRATIVE CODE'

# Apply the replacement to the OFNS_DESC column.
fix = update(fix, columns='OFNS_DESC', func=ad_dict)

**Data fixed test: Test ADMINISTRATIVE spellings**

In [None]:
# Rows whose offense description is one of the ADMINISTRATIVE variants collected earlier.
administrative_mask = fix['OFNS_DESC'].isin(administrative_val)
check_fix = fix.loc[administrative_mask]

# Every selected row must now carry the singular label.
administrative_found = check_fix['OFNS_DESC']
assert all(administrative_found == 'ADMINISTRATIVE CODE'), "ADMINISTRATIVE spelling is not fixed: " + administrative_found.unique()
print("Successfully fixed: " + administrative_found.unique())

['Successfully fixed: ADMINISTRATIVE CODE']


**Data issues: ARREST_BORO holds the borough of NYC in which the arrest happened. The codes K, M, B, Q, S are unclear to us.**

In [None]:
# List the distinct borough codes currently stored in ARREST_BORO.
boro_codes = fix['ARREST_BORO'].unique()
print(boro_codes)

['K' 'B' 'M' 'Q' 'S' 'ARREST_BORO']


**Data fix: Change ambiguous abbreviation of column ARREST_BORO to full form.**

In [None]:
# Borough code -> full borough name. The ARREST_BORO entry maps that value to itself.
boro_dict = dict(
    B='Bronx',
    S='Staten Island',
    K='Brooklyn',
    M='Manhattan',
    Q='Queens',
    ARREST_BORO='ARREST_BORO',
)

# Apply the replacement to the ARREST_BORO column.
fix = update(fix, columns='ARREST_BORO', func=boro_dict)

**Data fixed test: Test ARREST_BORO fixed data**

In [None]:
# The distinct values in the column must be exactly the mapping's target values.
boro_found = sorted(fix['ARREST_BORO'].unique())
boro_expected = sorted(boro_dict.values())
assert boro_found == boro_expected, "ARREST_BORO is not fixed: " + fix['ARREST_BORO'].unique()
print("Successfully fixed: " + fix['ARREST_BORO'].unique())

['Successfully fixed: Brooklyn' 'Successfully fixed: Bronx'
 'Successfully fixed: Manhattan' 'Successfully fixed: Queens'
 'Successfully fixed: Staten Island' 'Successfully fixed: ARREST_BORO']


**Data issues: The columns PERP_SEX and LAW_CAT_CD also hold abbreviated values that are easier to read when written out in full.**

In [None]:
# Distinct PERP_SEX values found in the stream.
sex = ds.distinct('PERP_SEX')

sex_values = list(sex)
print(sex_values)

['M', 'F', 'PERP_SEX']


In [None]:
# Distinct LAW_CAT_CD values found in the stream.
law_cat_cd = ds.distinct('LAW_CAT_CD')

law_cat_values = list(law_cat_cd)
print(law_cat_values)

['F', 'M', 'V', 'I', 'LAW_CAT_CD', '']


**Data fix: Change abbreviation of LAW_CAT_CD to long form.**

In [None]:
# Category code -> long form. The empty string becomes 'Unknown';
# the LAW_CAT_CD entry maps that value to itself.
law_cat_cd_dict = dict([
    ('F', 'Felony'),
    ('M', 'Misdemeanor'),
    ('V', 'Violation'),
    ('I', 'Traffic Infraction'),
    ('', 'Unknown'),
    ('LAW_CAT_CD', 'LAW_CAT_CD'),
])

# Apply the replacement to the LAW_CAT_CD column.
fix = update(fix, columns='LAW_CAT_CD', func=law_cat_cd_dict)

**Data fixed test: Test LAW_CAT_CD fixed data**

In [None]:
# The distinct values in the column must be exactly the mapping's target values.
law_cat_found = sorted(fix['LAW_CAT_CD'].unique())
law_cat_expected = sorted(law_cat_cd_dict.values())
assert law_cat_found == law_cat_expected, "LAW_CAT_CD is not fixed: " + fix['LAW_CAT_CD'].unique()
print("Successfully fixed: " + fix['LAW_CAT_CD'].unique())

['Successfully fixed: Felony' 'Successfully fixed: Misdemeanor'
 'Successfully fixed: Violation' 'Successfully fixed: Traffic Infraction'
 'Successfully fixed: LAW_CAT_CD' 'Successfully fixed: Unknown']


**Data fix: Change abbreviation of PERP_SEX to long form.**

In [None]:
# Sex code -> long form. The PERP_SEX entry maps that value to itself.
perp_sex_dict = dict(
    F='Female',
    M='Male',
    PERP_SEX='PERP_SEX',
)

# Apply the replacement to the PERP_SEX column.
fix = update(fix, columns='PERP_SEX', func=perp_sex_dict)

**Data fixed test: Test PERP_SEX fixed data**

In [None]:
# The distinct values in the column must be exactly the mapping's target values.
perp_sex_found = sorted(fix['PERP_SEX'].unique())
perp_sex_expected = sorted(perp_sex_dict.values())
assert perp_sex_found == perp_sex_expected, "PERP_SEX is not fixed: " + fix['PERP_SEX'].unique()
print("Successfully fixed: " + fix['PERP_SEX'].unique())

['Successfully fixed: Male' 'Successfully fixed: Female'
 'Successfully fixed: PERP_SEX']


**Data issues: There are unnecessary columns in our dataset that we don't care about such as X_COORD_CD and Y_COORD_CD which list midblock X and Y-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104)**

In [None]:
# Preview the two coordinate columns that are about to be dropped.
# The frame is not called `display`, so IPython's display() helper is not shadowed in later cells.
coords_preview = ds.select(['X_COORD_CD','Y_COORD_CD']).to_df()

coords_preview.head()

Unnamed: 0,X_COORD_CD,Y_COORD_CD
0,998032.0,175598.0
1,1032047.0,242037.0
2,1008114.0,244866.0
3,999358.0,236472.0
4,987078.0,215157.0


**Data fix: Our solution is to drop the columns.**

In [None]:
# Drop the coordinate columns. errors='ignore' makes the cell safe to re-run:
# a second run no longer raises KeyError because the columns are already gone.
fix = fix.drop(columns=['X_COORD_CD', 'Y_COORD_CD'], errors='ignore')

**Data fixed test: Test that the X_COORD_CD and Y_COORD_CD columns are dropped**

In [None]:
# Both columns must be absent. The original any() also passed when only one had been dropped.
dropped_columns = ['X_COORD_CD', 'Y_COORD_CD']
remaining_columns = fix.columns.values.tolist()
assert all(col not in remaining_columns for col in dropped_columns), "X_COORD_CD and Y_COORD_CD are not dropped"
print("Successfully dropped: " + str(remaining_columns))

Successfully dropped: ['ARREST_KEY', 'ARREST_DATE', 'PD_CD', 'PD_DESC', 'KY_CD', 'OFNS_DESC', 'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO', 'ARREST_PRECINCT', 'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'Latitude', 'Longitude', 'Lon_Lat']


**Data issues: The value ASIAN / PACIFIC ISLANDER is better fixed by removing the spaces around the `/`, for easier comparison in later analysis.**

In [None]:
# Distinct PERP_RACE values, printed one per line.
race = ds.distinct('PERP_RACE')

for race_value in race:
    print(race_value)

BLACK
WHITE
UNKNOWN
WHITE HISPANIC
BLACK HISPANIC
ASIAN / PACIFIC ISLANDER
PERP_RACE
AMERICAN INDIAN/ALASKAN NATIVE


**Data fix: Remove space between ASIAN / PACIFIC ISLANDER.**

In [None]:
# Replacement table: remove the spaces around the slash.
race_dict = dict()
race_dict['ASIAN / PACIFIC ISLANDER'] = 'ASIAN/PACIFIC ISLANDER'

# Apply the replacement to the PERP_RACE column.
fix = update(fix, columns='PERP_RACE', func=race_dict)

**Data fixed test: Test PERP_RACE fixed data**

In [None]:
# The spaced spelling must no longer appear among the distinct PERP_RACE values.
race_found = fix['PERP_RACE'].unique()
assert ('ASIAN / PACIFIC ISLANDER' not in race_found), "ASIAN / PACIFIC ISLANDER is not fixed"
print("Successfully fixed: " + race_found)

['Successfully fixed: BLACK' 'Successfully fixed: WHITE'
 'Successfully fixed: UNKNOWN' 'Successfully fixed: WHITE HISPANIC'
 'Successfully fixed: BLACK HISPANIC'
 'Successfully fixed: ASIAN/PACIFIC ISLANDER'
 'Successfully fixed: PERP_RACE'
 'Successfully fixed: AMERICAN INDIAN/ALASKAN NATIVE']


**Data issues: Found new issues with PD_DESC. Some spellings are incorrect. This matters because we want to categorize the PD description and compare it with the offense description.**

In [None]:
# Distinct PD descriptions to be clustered. Named `pd_desc` (as in the later cells)
# rather than `pd`, which is the conventional alias for pandas.
pd_desc = ds.select('PD_DESC').distinct()

# Stricter threshold than for OFNS_DESC: the LevenshteinDistance score must satisfy GreaterThan(0.9).
clusters = knn_clusters(
  values=pd_desc,
  sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.9)),
  tokenizer=NGrams(n=4),
  minsize=2
)

# Print each cluster of similar spellings.
for cluster in clusters:
  print(cluster)

Cluster({'ROBBERY,UNCLASSIFIED,OPEN AREAS': 10170, 'ROBBERY,UNCLASSIFIED,OPEN AREA': 23})
Cluster({'TRAFFIC,UNCLASSIFIED MISDEMEAN': 15231, 'TRAFFIC,UNCLASSIFIED MISDEMEANOR': 3726})
Cluster({'ADM.CODE,UNCLASSIFIED VIOLATION': 2943, 'ADM.CODE,UNCLASSIFIED VIOLATIO': 555})
Cluster({'TRAFFIC,UNCLASSIFIED INFRACTION': 5120, 'TRAFFIC,UNCLASSIFIED INFRACTIO': 206})
Cluster({'NY STATE LAWS,UNCLASSIFIED FELONY': 1292, 'NY STATE LAWS,UNCLASSIFIED FEL': 239})
Cluster({'FRAUD,UNCLASSIFIED-MISDEMEANOR,PART 1': 98, 'FRAUD,UNCLASSIFIED-MISDEMEANOR-PART 2': 1})
Cluster({'NY STATE LAWS,UNCLASSIFIED VIO': 25, 'NY STATE LAWS,UNCLASSIFIED MIS': 2})
Cluster({'GENERAL BUSINESS LAW / UNCLASSIFIED': 17, 'GENERAL BUSINESS LAW,UNCLASSIFIED': 10})
Cluster({'UNAUTHORIZED USE VEHICLE 3': 902, 'UNAUTHORIZED USE VEHICLE 2': 313})
Cluster({'IMPERSONATION 2, PUBLIC SERVANT': 1317, 'IMPERSONATION 2, PUBLIC SERVAN': 3})
Cluster({'SOLICITATION 4, CRIMINAL': 4, 'SOLICITATION 5,CRIMINAL': 1})
Cluster({'CONTROLLED SUBSTAN

**Only misspelled values are considered for fixing, such as: ROBBERY,UNCLASSIFIED,OPEN AREAS, TRAFFIC,UNCLASSIFIED MISDEMEAN, ADM.CODE,UNCLASSIFIED VIOLATIO, TRAFFIC,UNCLASSIFIED INFRACTIO, NY STATE LAWS,UNCLASSIFIED FEL, IMPERSONATION 2, PUBLIC SERVAN, CRIMINAL DISPOSAL FIREARM 1 &**

In [None]:
pd_desc = ds.select('PD_DESC').distinct()

# Spellings to look for; a description is listed when it contains any of them.
errors_list = [
    'ROBBERY,UNCLASSIFIED,OPEN AREA',
    'TRAFFIC,UNCLASSIFIED MISDEMEAN',
    'ADM.CODE,UNCLASSIFIED VIOLATIO',
    'TRAFFIC,UNCLASSIFIED INFRACTIO',
    'NY STATE LAWS,UNCLASSIFIED FEL',
    'IMPERSONATION 2, PUBLIC SERVAN',
    'CRIMINAL DISPOSAL FIREARM 1 &',
]

for desc in pd_desc:
    if any(err in desc for err in errors_list):
        print(desc)

ROBBERY,UNCLASSIFIED,OPEN AREAS
IMPERSONATION 2, PUBLIC SERVANT
TRAFFIC,UNCLASSIFIED MISDEMEAN
ADM.CODE,UNCLASSIFIED VIOLATION
TRAFFIC,UNCLASSIFIED MISDEMEANOR
ADM.CODE,UNCLASSIFIED VIOLATIO
TRAFFIC,UNCLASSIFIED INFRACTION
ROBBERY,UNCLASSIFIED,OPEN AREA
NY STATE LAWS,UNCLASSIFIED FEL
NY STATE LAWS,UNCLASSIFIED FELONY
TRAFFIC,UNCLASSIFIED INFRACTIO
CRIMINAL DISPOSAL FIREARM 1 & 2
CRIMINAL DISPOSAL FIREARM 1 &
IMPERSONATION 2, PUBLIC SERVAN


**Data fix: Change spellings of ROBBERY,UNCLASSIFIED,OPEN AREAS, TRAFFIC,UNCLASSIFIED MISDEMEAN, ADM.CODE,UNCLASSIFIED VIOLATIO, TRAFFIC,UNCLASSIFIED INFRACTIO, NY STATE LAWS,UNCLASSIFIED FEL, IMPERSONATION 2, PUBLIC SERVAN, CRIMINAL DISPOSAL FIREARM 1 &**

In [None]:
# Replacement table for PD_DESC spellings (original value -> replacement).
mix_dict = {
    # NOTE(review): this entry maps the longer spelling onto the shorter one, the opposite
    # direction of the other entries — confirm that 'OPEN AREA' is the intended label.
    'ROBBERY,UNCLASSIFIED,OPEN AREAS': 'ROBBERY,UNCLASSIFIED,OPEN AREA',
    'TRAFFIC,UNCLASSIFIED MISDEMEAN': 'TRAFFIC,UNCLASSIFIED MISDEMEANOR',
    'ADM.CODE,UNCLASSIFIED VIOLATIO': 'ADM.CODE,UNCLASSIFIED VIOLATION',
    'TRAFFIC,UNCLASSIFIED INFRACTIO': 'TRAFFIC,UNCLASSIFIED INFRACTION',
    'NY STATE LAWS,UNCLASSIFIED FEL': 'NY STATE LAWS,UNCLASSIFIED FELONY',
    'IMPERSONATION 2, PUBLIC SERVAN': 'IMPERSONATION 2, PUBLIC SERVANT',
    # Merge the cut-off label into the full 'CRIMINAL DISPOSAL FIREARM 1 & 2' value that is
    # already present in PD_DESC, instead of creating a third label.
    'CRIMINAL DISPOSAL FIREARM 1 &': 'CRIMINAL DISPOSAL FIREARM 1 & 2',
}

# Apply the replacement to the PD_DESC column.
fix = update(fix, columns='PD_DESC', func=mix_dict)

**Data fixed test: Test PD_DESC fixed data**

In [None]:
import re

# Select rows whose PD_DESC contains any of the original spellings. Each key is escaped so it
# is matched literally (the '.' in 'ADM.CODE' would otherwise match any character).
key_pattern = '|'.join(re.escape(key) for key in mix_dict.keys())
check_fix = fix[fix['PD_DESC'].str.contains(key_pattern)]

# Every replaced spelling must be gone. The original any() passed as soon as a single key was absent.
remaining_desc = check_fix['PD_DESC'].unique()
assert all(key not in remaining_desc for key in mix_dict.keys()), "PD_DESC is not fixed: " + remaining_desc
print("Successfully fixed: " + remaining_desc)

['Successfully fixed: IMPERSONATION 2, PUBLIC SERVANT'
 'Successfully fixed: TRAFFIC,UNCLASSIFIED MISDEMEANOR'
 'Successfully fixed: ADM.CODE,UNCLASSIFIED VIOLATION'
 'Successfully fixed: TRAFFIC,UNCLASSIFIED INFRACTION'
 'Successfully fixed: NY STATE LAWS,UNCLASSIFIED FELONY'
 'Successfully fixed: CRIMINAL DISPOSAL FIREARM 1 & 2']


**Only misspelled values are considered for fixing, such as: CONTROLLED SUBSTANCE, POSSESSI, CONTROLLED SUBSTANCE, INTENT T, CONTROLLED SUBSTANCE,POSSESS., and the spacing in CONTROLLED SUBSTANCE, SALE**

In [None]:
pd_desc = ds.select('PD_DESC').distinct()

# List every distinct PD description that contains 'CONTROLLED SUBSTANCE,'.
controlled_desc = [desc for desc in pd_desc if 'CONTROLLED SUBSTANCE,' in desc]
for desc in controlled_desc:
    print(desc)

CONTROLLED SUBSTANCE,SALE 3
CONTROLLED SUBSTANCE, POSSESSION 7
CONTROLLED SUBSTANCE,POSSESS. 2
CONTROLLED SUBSTANCE,INTENT TO SELL 3
CONTROLLED SUBSTANCE, POSSESSION 5
CONTROLLED SUBSTANCE, INTENT TO SELL 5
CONTROLLED SUBSTANCE, SALE 4
CONTROLLED SUBSTANCE, POSSESSION 4
CONTROLLED SUBSTANCE,SALE 2
CONTROLLED SUBSTANCE,POSSESS. 1
CONTROLLED SUBSTANCE,SALE 1
CONTROLLED SUBSTANCE,POSSESS. 3
CONTROLLED SUBSTANCE, SALE 5
CONTROLLED SUBSTANCE,POSSESS. OF PROCURSERS
CONTROLLED SUBSTANCE, POSSESSI


**Data fix: Change spacing in SALE and spelling to POSSESSION and INTENT**



In [None]:
# Replacement table for the CONTROLLED SUBSTANCE descriptions (original value -> replacement).
control_dict = {
    # POSSESSION: complete the cut-off word, expand "POSSESS." and add a space after the comma.
    'CONTROLLED SUBSTANCE, POSSESSI': 'CONTROLLED SUBSTANCE, POSSESSION',
    'CONTROLLED SUBSTANCE,POSSESS. OF PROCURSERS': 'CONTROLLED SUBSTANCE, POSSESSION OF PROCURSERS',
    'CONTROLLED SUBSTANCE,POSSESS. 1': 'CONTROLLED SUBSTANCE, POSSESSION 1',
    'CONTROLLED SUBSTANCE,POSSESS. 2': 'CONTROLLED SUBSTANCE, POSSESSION 2',
    'CONTROLLED SUBSTANCE,POSSESS. 3': 'CONTROLLED SUBSTANCE, POSSESSION 3',

    # INTENT TO SELL: add a space after the comma.
    'CONTROLLED SUBSTANCE,INTENT TO SELL 3': 'CONTROLLED SUBSTANCE, INTENT TO SELL 3',
    
    # SALE: add a space after the comma.
    'CONTROLLED SUBSTANCE,SALE 1': 'CONTROLLED SUBSTANCE, SALE 1',
    'CONTROLLED SUBSTANCE,SALE 2': 'CONTROLLED SUBSTANCE, SALE 2',
    'CONTROLLED SUBSTANCE,SALE 3': 'CONTROLLED SUBSTANCE, SALE 3',
}

# Apply the replacement to the PD_DESC column.
fix = update(fix, columns='PD_DESC', func=control_dict)

**Data fixed test: Test PD_DESC fixed data**

In [None]:
# Rows whose PD_DESC contains 'CONTROLLED SUBSTANCE,'.
check_fix = fix[fix['PD_DESC'].str.contains('CONTROLLED SUBSTANCE,')]

# Every replaced spelling must be gone. The original any() passed as soon as a single key was absent.
controlled_found = check_fix['PD_DESC'].unique()
assert all(key not in controlled_found for key in control_dict.keys()), "PD_DESC is not fixed: " + controlled_found
print("Successfully fixed: " + controlled_found)

['Successfully fixed: CONTROLLED SUBSTANCE, SALE 3'
 'Successfully fixed: CONTROLLED SUBSTANCE, POSSESSION 7'
 'Successfully fixed: CONTROLLED SUBSTANCE, POSSESSION 2'
 'Successfully fixed: CONTROLLED SUBSTANCE, INTENT TO SELL 3'
 'Successfully fixed: CONTROLLED SUBSTANCE, POSSESSION 5'
 'Successfully fixed: CONTROLLED SUBSTANCE, INTENT TO SELL 5'
 'Successfully fixed: CONTROLLED SUBSTANCE, SALE 4'
 'Successfully fixed: CONTROLLED SUBSTANCE, POSSESSION 4'
 'Successfully fixed: CONTROLLED SUBSTANCE, SALE 2'
 'Successfully fixed: CONTROLLED SUBSTANCE, POSSESSION 1'
 'Successfully fixed: CONTROLLED SUBSTANCE, SALE 1'
 'Successfully fixed: CONTROLLED SUBSTANCE, POSSESSION 3'
 'Successfully fixed: CONTROLLED SUBSTANCE, SALE 5'
 'Successfully fixed: CONTROLLED SUBSTANCE, POSSESSION OF PROCURSERS'
 'Successfully fixed: CONTROLLED SUBSTANCE, POSSESSION']


**Only fix DRUG spelling**

In [None]:
pd_desc = ds.select('PD_DESC').distinct()

# List every distinct PD description that mentions IMPAIRED DRIVING.
impaired_desc = [desc for desc in pd_desc if 'IMPAIRED DRIVING' in desc]
for desc in impaired_desc:
    print(desc)

IMPAIRED DRIVING, DRUGS
IMPAIRED DRIVING,DRUG
IMPAIRED DRIVING,ALCOHOL


**Data fix: Fix DRUG spelling.**

In [None]:
# Both DRUG spellings share one replacement label; ALCOHOL gets the same " / " separator.
impair_dict = dict([
    ('IMPAIRED DRIVING, DRUGS', 'IMPAIRED DRIVING / DRUG'),
    ('IMPAIRED DRIVING,DRUG', 'IMPAIRED DRIVING / DRUG'),
    ('IMPAIRED DRIVING,ALCOHOL', 'IMPAIRED DRIVING / ALCOHOL'),
])

# Apply the replacement to the PD_DESC column.
fix = update(fix, columns='PD_DESC', func=impair_dict)

**Data fixed test: Test PD_DESC fixed data**

In [None]:
# Rows whose PD_DESC mentions IMPAIRED DRIVING.
impaired_mask = fix['PD_DESC'].str.contains('IMPAIRED DRIVING')
check_fix = fix[impaired_mask]

# None of the original spellings may remain among them.
assert (all(key not in check_fix['PD_DESC'].unique() for key in impair_dict.keys())), "PD_DESC is not fixed: " + check_fix['PD_DESC'].unique()
print("Successfully fixed: " + check_fix['PD_DESC'].unique())

['Successfully fixed: IMPAIRED DRIVING / DRUG'
 'Successfully fixed: IMPAIRED DRIVING / ALCOHOL']


**Finalize data set: Save data clean file to csv file for analysis.**

In [None]:
import os

# Remove a previously exported CSV before the next cell writes a new one.
# The original tested os.path.isdir(), which is False for a regular file, so nothing was ever removed.
existing_file = './NYC-Crime/clean-dataset/nypd_arrest_data_clean_2016.csv'
if os.path.isfile(existing_file):
    os.remove(existing_file)

In [None]:
# Write the cleaned frame to the clean-dataset folder of the checkout.
clean_csv_path = r'./NYC-Crime/clean-dataset/nypd_arrest_data_clean_2016.csv'
fix.to_csv(clean_csv_path)

**Update clean dataset 2016 to Github repo.**

In [None]:
# Move into the checkout so the git commands below run inside the repository.
%cd NYC-Crime/

# Set the commit identity.
# NOTE(review): "email" and "username" look like placeholders — confirm they are replaced before running.
!git config --global user.email "email"
!git config --global user.name "username"

# Stage all changes, commit them, then show the resulting repository state.
!git add .
!git commit -m 'fix: update clean dataset 2016'
!git status

/content/NYC-Crime
[main 9c9ad72] fix: update clean dataset 2016
 1 file changed, 33 insertions(+), 33 deletions(-)
On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean


**Assign github credentials**

In [None]:
# Add a remote named `colab` whose URL embeds the GitHub credentials.
# NOTE(review): `username:access-token` looks like a placeholder; avoid saving a real token
# in the notebook — confirm how the credentials are supplied.
!git remote add colab https://username:access-token@github.com/duketran1996/NYC-Crime.git

**Push file changes**

In [None]:
# Push the local main branch to the `colab` remote and set it as the upstream (-u).
!git push -u colab main

Counting objects: 4, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 2.91 KiB | 2.91 MiB/s, done.
Total 4 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/duketran1996/NYC-Crime.git
   b2fa0f9..9c9ad72  main -> main
Branch 'main' set up to track remote branch 'main' from 'colab'.


**Remove Github repo folder**

In [None]:
# Step out of the checkout before deleting it.
%cd ../

# Recursively delete the local NYC-Crime folder.
!rm -r NYC-Crime

/content
