In [1]:
import pandas as pd
import numpy as np

# Read data and assign NA to missing values 
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/00211/CommViolPredUnnormalizedData.txt',
                   sep='\s*,\s*',encoding='latin-1',engine='python', na_values=["?"])


data.columns = ['communityname','state','countyCode','communityCode','fold','population','householdsize','racepctblack',
           'racePctWhite','racePctAsian','racePctHisp','agePct12t21','agePct12t29','agePct16t24','agePct65up',
           'numbUrban','pctUrban','medIncome','pctWWage','pctWFarmSelf','pctWInvInc','pctWSocSec','pctWPubAsst',
           'pctWRetire','medFamInc','perCapInc','whitePerCap','blackPerCap','indianPerCap','AsianPerCap','OtherPerCap',
           'HispPerCap','NumUnderPov','PctPopUnderPov','PctLess9thGrade','PctNotHSGrad','PctBSorMore','PctUnemployed',
           'PctEmploy','PctEmplManu','PctEmplProfServ','PctOccupManu','PctOccupMgmtProf','MalePctDivorce',
           'MalePctNevMarr','FemalePctDiv','TotalPctDiv','PersPerFam','PctFam2Par','PctKids2Par','PctYoungKids2Par',
           'PctTeen2Par','PctWorkMomYoungKids','PctWorkMom','NumKidsBornNeverMar','PctKidsBornNeverMar','NumImmig',
           'PctImmigRecent','PctImmigRec5','PctImmigRec8','PctImmigRec10','PctRecentImmig','PctRecImmig5',
           'PctRecImmig8','PctRecImmig10','PctSpeakEnglOnly','PctNotSpeakEnglWell','PctLargHouseFam',
           'PctLargHouseOccup','PersPerOccupHous','PersPerOwnOccHous','PersPerRentOccHous','PctPersOwnOccup',
           'PctPersDenseHous','PctHousLess3BR','MedNumBR','HousVacant','PctHousOccup','PctHousOwnOcc','PctVacantBoarded',
           'PctVacMore6Mos','MedYrHousBuilt','PctHousNoPhone','PctWOFullPlumb','OwnOccLowQuart','OwnOccMedVal',
           'OwnOccHiQuart','OwnOccQrange','RentLowQ','RentMedian','RentHighQ','RentQrange','MedRent','MedRentPctHousInc',
           'MedOwnCostPctInc','MedOwnCostPctIncNoMtg','NumInShelters','NumStreet','PctForeignBorn','PctBornSameState',
           'PctSameHouse85','PctSameCity85','PctSameState85','LemasSwornFT','LemasSwFTPerPop','LemasSwFTFieldOps',
           'LemasSwFTFieldPerPop','LemasTotalReq','LemasTotReqPerPop','PolicReqPerOffic','PolicPerPop',
           'RacialMatchCommPol','PctPolicWhite','PctPolicBlack','PctPolicHisp','PctPolicAsian','PctPolicMinor',
           'OfficAssgnDrugUnits','NumKindsDrugsSeiz','PolicAveOTWorked','LandArea','PopDens','PctUsePubTrans',
           'PolicCars','PolicOperBudg','LemasPctPolicOnPatr','LemasGangUnitDeploy','LemasPctOfficDrugUn',
           'PolicBudgPerPop','murders','murdPerPop','rapes','rapesPerPop','robberies','robbbPerPop','assaults',
           'assaultPerPop','burglaries','burglPerPop','larcenies','larcPerPop','autoTheft','autoTheftPerPop','arsons',
           'arsonsPerPop','ViolentCrimesPerPop','nonViolPerPop']

# Select the relevant columns to use in the model 
cols_final = data[[
 'population',
 'racepctblack',
 'agePct12t29',
 'numbUrban',
 'medIncome',
 'pctWWage',
 'pctWInvInc',
 'medFamInc',
 'perCapInc',
 'whitePerCap',
 'PctEmploy',
 'MalePctDivorce',
 'MalePctNevMarr',
 'TotalPctDiv',
 'PctKids2Par',
 'PctWorkMom',
 'NumImmig',
 'PctRecImmig8',
 'PctRecImmig10',
 'PctLargHouseOccup',
 'PersPerOccupHous',
 'PersPerRentOccHous',
 'PctPersOwnOccup',
 'PctPersDenseHous',
 'HousVacant',
 'PctHousOwnOcc',
 'OwnOccLowQuart',
 'OwnOccMedVal',
 'RentLowQ',
 'RentMedian',
 'MedRent',
 'MedOwnCostPctIncNoMtg',
 'NumStreet',
 'ViolentCrimesPerPop']]

# Remove missing values
# First establish how much data is missing
cols_final.isnull().sum().sort_values(ascending=False).head()


# Impute the missing values using SimpleImputer in sklearn.impute
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(cols_final)
cols_final = pd.DataFrame(data=imp.transform(cols_final), columns=cols_final.columns)

# Add community name as a user
communityname = data[['communityname','population']]
cols_final = communityname.merge(cols_final)

# Check if there is still missing data
cols_final.isnull().sum().sort_values(ascending=False)

cols_final

Unnamed: 0,communityname,population,racepctblack,agePct12t29,numbUrban,medIncome,pctWWage,pctWInvInc,medFamInc,perCapInc,...,HousVacant,PctHousOwnOcc,OwnOccLowQuart,OwnOccMedVal,RentLowQ,RentMedian,MedRent,MedOwnCostPctIncNoMtg,NumStreet,ViolentCrimesPerPop
0,Marpletownship,23123,0.80,21.30,23123.0,47917.0,78.99,64.11,55323.0,20148.0,...,240.0,84.88,136300.0,164200.0,467.0,560.0,627.0,12.5,0.0,127.560000
1,Tigardcity,29344,0.74,25.88,29344.0,35669.0,82.00,55.73,42112.0,16946.0,...,544.0,57.79,74700.0,90400.0,370.0,428.0,484.0,11.6,0.0,218.590000
2,Gloversvillecity,16656,1.70,25.20,0.0,20580.0,68.15,38.95,26501.0,10810.0,...,669.0,54.89,36400.0,49600.0,195.0,250.0,333.0,14.5,0.0,306.640000
3,Bemidjicity,11245,0.53,40.53,0.0,17390.0,69.33,42.82,24018.0,8483.0,...,333.0,53.57,30600.0,43200.0,202.0,283.0,332.0,12.9,0.0,589.353914
4,Springfieldcity,140494,2.51,32.89,140494.0,21577.0,75.78,41.15,27705.0,11878.0,...,5119.0,55.50,37700.0,53900.0,215.0,280.0,340.0,11.7,4.0,442.950000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2331,Manvilleborough,10567,0.18,24.90,10567.0,37664.0,77.02,48.61,44587.0,16201.0,...,126.0,70.65,123900.0,144200.0,441.0,553.0,651.0,15.9,0.0,132.870000
2332,Mercedcity,56216,6.87,30.16,56216.0,24727.0,75.05,31.42,27388.0,10237.0,...,683.0,44.63,71200.0,91100.0,298.0,374.0,438.0,11.7,0.0,545.750000
2333,Pinevillecity,12251,21.18,31.23,12251.0,20321.0,75.06,33.25,25000.0,9995.0,...,523.0,54.24,33600.0,52000.0,176.0,248.0,330.0,14.4,0.0,124.100000
2334,Yucaipacity,32824,0.52,20.96,32824.0,27182.0,59.79,44.72,34973.0,14131.0,...,957.0,76.81,91700.0,123900.0,347.0,451.0,514.0,13.1,0.0,353.830000


In [2]:
cols_final['population'] = pd.qcut(cols_final['population'].values, 5, duplicates = 'drop').codes + 1
cols_final['racepctblack'] = pd.qcut(cols_final['racepctblack'].values, 5, duplicates = 'drop').codes + 1
cols_final['agePct12t29'] = pd.qcut(cols_final['agePct12t29'].values, 5, duplicates = 'drop').codes + 1
cols_final['numbUrban'] = pd.qcut(cols_final['numbUrban'].values, 5, duplicates = 'drop').codes + 1
cols_final['medIncome'] = pd.qcut(cols_final['medIncome'].values, 5, duplicates = 'drop').codes + 1
cols_final['pctWWage'] = pd.qcut(cols_final['pctWWage'].values, 5, duplicates = 'drop').codes + 1
cols_final['pctWInvInc'] = pd.qcut(cols_final['pctWInvInc'].values, 5, duplicates = 'drop').codes + 1
cols_final['medFamInc'] = pd.qcut(cols_final['medFamInc'].values, 5, duplicates = 'drop').codes + 1
cols_final['perCapInc'] = pd.qcut(cols_final['perCapInc'].values, 5, duplicates = 'drop').codes + 1
cols_final['whitePerCap'] = pd.qcut(cols_final['whitePerCap'].values, 5, duplicates = 'drop').codes + 1
cols_final['PctEmploy'] = pd.qcut(cols_final['PctEmploy'].values, 5, duplicates = 'drop').codes + 1
cols_final['MalePctDivorce'] = pd.qcut(cols_final['MalePctDivorce'].values, 5, duplicates = 'drop').codes + 1
cols_final['MalePctNevMarr'] = pd.qcut(cols_final['MalePctNevMarr'].values, 5, duplicates = 'drop').codes + 1
cols_final['TotalPctDiv'] = pd.qcut(cols_final['TotalPctDiv'].values, 5, duplicates = 'drop').codes + 1
cols_final['PctKids2Par'] = pd.qcut(cols_final['PctKids2Par'].values, 5, duplicates = 'drop').codes + 1
cols_final['PctWorkMom'] = pd.qcut(cols_final['PctWorkMom'].values, 5, duplicates = 'drop').codes + 1
cols_final['NumImmig'] = pd.qcut(cols_final['NumImmig'].values, 5, duplicates = 'drop').codes + 1
cols_final['PctRecImmig8'] = pd.qcut(cols_final['PctRecImmig8'].values, 5, duplicates = 'drop').codes + 1
cols_final['PctRecImmig10'] = pd.qcut(cols_final['PctRecImmig10'].values, 5, duplicates = 'drop').codes + 1
cols_final['PctLargHouseOccup'] = pd.qcut(cols_final['PctLargHouseOccup'].values, 5, duplicates = 'drop').codes + 1
cols_final['PersPerOccupHous'] = pd.qcut(cols_final['PersPerOccupHous'].values, 5, duplicates = 'drop').codes + 1
cols_final['PctPersOwnOccup'] = pd.qcut(cols_final['PctPersOwnOccup'].values, 5, duplicates = 'drop').codes + 1
cols_final['PctPersDenseHous'] = pd.qcut(cols_final['PctPersDenseHous'].values, 5, duplicates = 'drop').codes + 1
cols_final['HousVacant'] = pd.qcut(cols_final['HousVacant'].values, 5, duplicates = 'drop').codes + 1
cols_final['PctHousOwnOcc'] = pd.qcut(cols_final['PctHousOwnOcc'].values, 5, duplicates = 'drop').codes + 1
cols_final['OwnOccLowQuart'] = pd.qcut(cols_final['OwnOccLowQuart'].values, 5, duplicates = 'drop').codes + 1
cols_final['OwnOccMedVal'] = pd.qcut(cols_final['OwnOccMedVal'].values, 5, duplicates = 'drop').codes + 1
cols_final['RentLowQ'] = pd.qcut(cols_final['RentLowQ'].values, 5, duplicates = 'drop').codes + 1
cols_final['RentMedian'] = pd.qcut(cols_final['RentMedian'].values, 5, duplicates = 'drop').codes + 1
cols_final['MedRent'] = pd.qcut(cols_final['MedRent'].values, 5, duplicates = 'drop').codes + 1
cols_final['MedOwnCostPctIncNoMtg'] = pd.qcut(cols_final['MedOwnCostPctIncNoMtg'].values, 5, duplicates = 'drop').codes + 1
cols_final['NumStreet'] = pd.qcut(cols_final['NumStreet'].values, 5, duplicates = 'drop').codes + 1
cols_final['ViolentCrimesPerPop'] = pd.qcut(cols_final['ViolentCrimesPerPop'].values, 5, duplicates = 'drop').codes + 1
cols_final

Unnamed: 0,communityname,population,racepctblack,agePct12t29,numbUrban,medIncome,pctWWage,pctWInvInc,medFamInc,perCapInc,...,HousVacant,PctHousOwnOcc,OwnOccLowQuart,OwnOccMedVal,RentLowQ,RentMedian,MedRent,MedOwnCostPctIncNoMtg,NumStreet,ViolentCrimesPerPop
0,Marpletownship,3,2,1,2,5,3,5,5,5,...,1,5,5,4,5,4,4,3,1,1
1,Tigardcity,4,2,3,3,4,4,5,4,4,...,3,2,3,3,4,3,3,1,1,2
2,Gloversvillecity,2,3,2,1,1,1,2,1,1,...,3,2,1,1,1,1,1,5,1,2
3,Bemidjicity,1,1,5,1,1,1,3,1,1,...,2,2,1,1,2,2,1,3,1,3
4,Springfieldcity,5,3,5,4,1,2,3,1,2,...,5,2,1,2,2,2,1,1,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2331,Manvilleborough,1,1,2,1,4,3,4,4,4,...,1,4,4,4,4,4,4,5,1,1
2332,Mercedcity,5,4,5,4,2,2,1,1,1,...,3,1,3,3,3,3,3,1,1,3
2333,Pinevillecity,1,5,5,1,1,2,2,1,1,...,3,2,1,1,1,1,1,5,1,1
2334,Yucaipacity,4,1,1,3,2,1,3,3,3,...,4,5,4,4,3,3,3,3,1,3


In [3]:
# change to unpivoted table
data = cols_final.melt(id_vars=['communityname'], var_name='features', value_name='values')
data

Unnamed: 0,communityname,features,values
0,Marpletownship,population,3.0
1,Tigardcity,population,4.0
2,Gloversvillecity,population,2.0
3,Bemidjicity,population,1.0
4,Springfieldcity,population,5.0
...,...,...,...
79419,Manvilleborough,ViolentCrimesPerPop,1.0
79420,Mercedcity,ViolentCrimesPerPop,3.0
79421,Pinevillecity,ViolentCrimesPerPop,1.0
79422,Yucaipacity,ViolentCrimesPerPop,3.0


In [4]:
from surprise import NMF, SVD, SVDpp, KNNBasic, KNNWithMeans, KNNWithZScore, CoClustering
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data, reader)

cv = []
# Iterate over all recommender system algorithms
for recsys in [NMF(), SVD(), SVDpp(), KNNWithZScore(), CoClustering()]:
    # Perform cross validation
    tmp = cross_validate(recsys, data, measures=['RMSE'], cv=3, verbose=False)
    cv.append((str(recsys).split(' ')[0].split('.')[-1], tmp['test_rmse'].mean()))
pd.DataFrame(cv, columns=['RecSys', 'RMSE'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,RecSys,RMSE
0,NMF,0.957224
1,SVD,0.915878
2,SVDpp,0.911663
3,KNNWithZScore,0.924927
4,CoClustering,1.14736


In [5]:
# change to unpivoted table
data = cols_final.melt(id_vars=['communityname'], var_name='features', value_name='values')

#Split train and test data
split_value = int(len(cols_final) * 0.80)
data = data[:-468]

from surprise import SVDpp
from surprise import Reader, Dataset

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data, reader)

algo = SVDpp()
algo.fit(data.build_full_trainset())
crime_prediction = []
for communityname in cols_final['communityname'][split_value:]:
    crime_prediction.append((communityname, algo.predict(communityname,'ViolentCrimesPerPop').est))
    
pd.DataFrame(crime_prediction, columns=['communityname', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,communityname,predictions
78,PompanoBeachcity,4.948261
416,Pasadenacity,4.873198
426,Vicksburgcity,4.832372
101,Stocktoncity,4.728463
163,Plainfieldcity,4.726854
312,Shreveportcity,4.704287
11,SouthMiamicity,4.691577
297,Vallejocity,4.601689
240,Riversidecity,4.600496
385,FortMyerscity,4.595421
