In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read data and assign NA to missing values 
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/00211/CommViolPredUnnormalizedData.txt',
                   sep='\s*,\s*',encoding='latin-1',engine='python', na_values=["?"])


data.columns = ['communityname','state','countyCode','communityCode','fold','population','householdsize','racepctblack',
           'racePctWhite','racePctAsian','racePctHisp','agePct12t21','agePct12t29','agePct16t24','agePct65up',
           'numbUrban','pctUrban','medIncome','pctWWage','pctWFarmSelf','pctWInvInc','pctWSocSec','pctWPubAsst',
           'pctWRetire','medFamInc','perCapInc','whitePerCap','blackPerCap','indianPerCap','AsianPerCap','OtherPerCap',
           'HispPerCap','NumUnderPov','PctPopUnderPov','PctLess9thGrade','PctNotHSGrad','PctBSorMore','PctUnemployed',
           'PctEmploy','PctEmplManu','PctEmplProfServ','PctOccupManu','PctOccupMgmtProf','MalePctDivorce',
           'MalePctNevMarr','FemalePctDiv','TotalPctDiv','PersPerFam','PctFam2Par','PctKids2Par','PctYoungKids2Par',
           'PctTeen2Par','PctWorkMomYoungKids','PctWorkMom','NumKidsBornNeverMar','PctKidsBornNeverMar','NumImmig',
           'PctImmigRecent','PctImmigRec5','PctImmigRec8','PctImmigRec10','PctRecentImmig','PctRecImmig5',
           'PctRecImmig8','PctRecImmig10','PctSpeakEnglOnly','PctNotSpeakEnglWell','PctLargHouseFam',
           'PctLargHouseOccup','PersPerOccupHous','PersPerOwnOccHous','PersPerRentOccHous','PctPersOwnOccup',
           'PctPersDenseHous','PctHousLess3BR','MedNumBR','HousVacant','PctHousOccup','PctHousOwnOcc','PctVacantBoarded',
           'PctVacMore6Mos','MedYrHousBuilt','PctHousNoPhone','PctWOFullPlumb','OwnOccLowQuart','OwnOccMedVal',
           'OwnOccHiQuart','OwnOccQrange','RentLowQ','RentMedian','RentHighQ','RentQrange','MedRent','MedRentPctHousInc',
           'MedOwnCostPctInc','MedOwnCostPctIncNoMtg','NumInShelters','NumStreet','PctForeignBorn','PctBornSameState',
           'PctSameHouse85','PctSameCity85','PctSameState85','LemasSwornFT','LemasSwFTPerPop','LemasSwFTFieldOps',
           'LemasSwFTFieldPerPop','LemasTotalReq','LemasTotReqPerPop','PolicReqPerOffic','PolicPerPop',
           'RacialMatchCommPol','PctPolicWhite','PctPolicBlack','PctPolicHisp','PctPolicAsian','PctPolicMinor',
           'OfficAssgnDrugUnits','NumKindsDrugsSeiz','PolicAveOTWorked','LandArea','PopDens','PctUsePubTrans',
           'PolicCars','PolicOperBudg','LemasPctPolicOnPatr','LemasGangUnitDeploy','LemasPctOfficDrugUn',
           'PolicBudgPerPop','murders','murdPerPop','rapes','rapesPerPop','robberies','robbbPerPop','assaults',
           'assaultPerPop','burglaries','burglPerPop','larcenies','larcPerPop','autoTheft','autoTheftPerPop','arsons',
           'arsonsPerPop','ViolentCrimesPerPop','nonViolPerPop']

print(data)

           communityname state  countyCode  communityCode  fold  population  \
0         Marpletownship    PA        45.0        47616.0     1       23123   
1             Tigardcity    OR         NaN            NaN     1       29344   
2       Gloversvillecity    NY        35.0        29443.0     1       16656   
3            Bemidjicity    MN         7.0         5068.0     1       11245   
4        Springfieldcity    MO         NaN            NaN     1      140494   
...                  ...   ...         ...            ...   ...         ...   
2209          Mercedcity    CA         NaN            NaN    10       56216   
2210       Pinevillecity    LA         NaN            NaN    10       12251   
2211         Yucaipacity    CA         NaN            NaN    10       32824   
2212        Beevillecity    TX         NaN            NaN    10       13547   
2213  WestSacramentocity    CA         NaN            NaN    10       28898   

      householdsize  racepctblack  racePctWhite  ra

In [3]:
# Select the relevant columns to use in the model 
cols_final = data[[
 'population',
 'racepctblack',
 'agePct12t29',
 'numbUrban',
 'medIncome',
 'pctWWage',
 'pctWInvInc',
 'medFamInc',
 'perCapInc',
 'whitePerCap',
 'PctEmploy',
 'MalePctDivorce',
 'MalePctNevMarr',
 'TotalPctDiv',
 'PctKids2Par',
 'PctWorkMom',
 'NumImmig',
 'PctRecImmig8',
 'PctRecImmig10',
 'PctLargHouseOccup',
 'PersPerOccupHous',
 'PersPerRentOccHous',
 'PctPersOwnOccup',
 'PctPersDenseHous',
 'HousVacant',
 'PctHousOwnOcc',
 'OwnOccLowQuart',
 'OwnOccMedVal',
 'RentLowQ',
 'RentMedian',
 'MedRent',
 'MedOwnCostPctIncNoMtg',
 'NumStreet',
 'ViolentCrimesPerPop']]

In [4]:
# Remove missing values
# First establish how much data is missing
cols_final.isnull().sum().sort_values(ascending=False).head()


# Impute the missing values using SimpleImputer in sklearn.impute
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(cols_final)
cols_final = pd.DataFrame(data=imp.transform(cols_final), columns=cols_final.columns)

# Add community name as a user
communityname = data[['communityname','population']]
cols_final = communityname.merge(cols_final)

# Check if there is still missing data
cols_final.isnull().sum().sort_values(ascending=False)

cols_final

Unnamed: 0,communityname,population,racepctblack,agePct12t29,numbUrban,medIncome,pctWWage,pctWInvInc,medFamInc,perCapInc,...,HousVacant,PctHousOwnOcc,OwnOccLowQuart,OwnOccMedVal,RentLowQ,RentMedian,MedRent,MedOwnCostPctIncNoMtg,NumStreet,ViolentCrimesPerPop
0,Marpletownship,23123,0.80,21.30,23123.0,47917.0,78.99,64.11,55323.0,20148.0,...,240.0,84.88,136300.0,164200.0,467.0,560.0,627.0,12.5,0.0,127.560000
1,Tigardcity,29344,0.74,25.88,29344.0,35669.0,82.00,55.73,42112.0,16946.0,...,544.0,57.79,74700.0,90400.0,370.0,428.0,484.0,11.6,0.0,218.590000
2,Gloversvillecity,16656,1.70,25.20,0.0,20580.0,68.15,38.95,26501.0,10810.0,...,669.0,54.89,36400.0,49600.0,195.0,250.0,333.0,14.5,0.0,306.640000
3,Bemidjicity,11245,0.53,40.53,0.0,17390.0,69.33,42.82,24018.0,8483.0,...,333.0,53.57,30600.0,43200.0,202.0,283.0,332.0,12.9,0.0,589.353914
4,Springfieldcity,140494,2.51,32.89,140494.0,21577.0,75.78,41.15,27705.0,11878.0,...,5119.0,55.50,37700.0,53900.0,215.0,280.0,340.0,11.7,4.0,442.950000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2331,Manvilleborough,10567,0.18,24.90,10567.0,37664.0,77.02,48.61,44587.0,16201.0,...,126.0,70.65,123900.0,144200.0,441.0,553.0,651.0,15.9,0.0,132.870000
2332,Mercedcity,56216,6.87,30.16,56216.0,24727.0,75.05,31.42,27388.0,10237.0,...,683.0,44.63,71200.0,91100.0,298.0,374.0,438.0,11.7,0.0,545.750000
2333,Pinevillecity,12251,21.18,31.23,12251.0,20321.0,75.06,33.25,25000.0,9995.0,...,523.0,54.24,33600.0,52000.0,176.0,248.0,330.0,14.4,0.0,124.100000
2334,Yucaipacity,32824,0.52,20.96,32824.0,27182.0,59.79,44.72,34973.0,14131.0,...,957.0,76.81,91700.0,123900.0,347.0,451.0,514.0,13.1,0.0,353.830000


Dimensionality Reduction - Principal Component Analysis (PCA)

In [5]:
X = data.iloc[:, 0:100].values
y = data.iloc[:, 100].values

from sklearn.model_selection import train_test_split

seed = 0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = seed)

print(X.shape)
print(y.shape)

(2214, 100)
(2214,)


In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize features by removing the mean and scaling to unit variance

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

ValueError: could not convert string to float: 'Ballwincity'

In [None]:
from sklearn.decomposition import PCA

c = 14
pca = PCA(n_components = c)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

print("Amount of variance: %s" % pca.explained_variance_)
print("Sum of the variance: %s" % sum(pca.explained_variance_).round(2))

print("Percentage of variance: %s" % pca.explained_variance_ratio_)
print("Sum of the percentage of variance: %s" % sum(pca.explained_variance_ratio_).round(2))


plt.scatter(np.arange(1,(c+1)),pca.explained_variance_, c = 'red')
plt.plot((0,15), (1,1), color = 'black', linestyle = 'dashed')
plt.xlabel('PC')
plt.ylabel('Amount of variance explained')
plt.show()
print(X_train.shape)