In [85]:
# see data/crime/communities.names for information on this data (from UCI repository)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [86]:
crime_data = np.genfromtxt("data/crime/communities.data",delimiter=",")
# delete columns for state, county, community and community name due to missing data and low predictive quality; fold removed
# remove stats relating to police - around 85% missing data
arr1 = np.array([0,1,2,3,4])
arr2 = np.arange(101,118,1)
arr3 = np.arange(121,125,1)
arr = np.concatenate([arr1,arr2,arr3,[126]])
crime_data_r = np.delete(crime_data, arr,axis=1)

In [87]:
crime_data_r[0,:]

array([0.19, 0.33, 0.02, 0.9 , 0.12, 0.17, 0.34, 0.47, 0.29, 0.32, 0.2 ,
       1.  , 0.37, 0.72, 0.34, 0.6 , 0.29, 0.15, 0.43, 0.39, 0.4 , 0.39,
       0.32, 0.27, 0.27, 0.36, 0.41, 0.08, 0.19, 0.1 , 0.18, 0.48, 0.27,
       0.68, 0.23, 0.41, 0.25, 0.52, 0.68, 0.4 , 0.75, 0.75, 0.35, 0.55,
       0.59, 0.61, 0.56, 0.74, 0.76, 0.04, 0.14, 0.03, 0.24, 0.27, 0.37,
       0.39, 0.07, 0.07, 0.08, 0.08, 0.89, 0.06, 0.14, 0.13, 0.33, 0.39,
       0.28, 0.55, 0.09, 0.51, 0.5 , 0.21, 0.71, 0.52, 0.05, 0.26, 0.65,
       0.14, 0.06, 0.22, 0.19, 0.18, 0.36, 0.35, 0.38, 0.34, 0.38, 0.46,
       0.25, 0.04, 0.  , 0.12, 0.42, 0.5 , 0.51, 0.64, 0.12, 0.26, 0.2 ,
       0.32, 0.2 ])

In [88]:
#create test set (borrowed from Hands on Machine Learning with Scikit-Learn...)
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data)*test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return test_indices, train_indices

In [89]:
np.random.seed(8211981) # set seed, data will never change
test_slice, train_slice = split_train_test(crime_data_r, 0.2)
test_set = crime_data_r[test_slice,:]  # TEST SET preserved
train_set = crime_data_r[train_slice,:]

In [90]:
train_set_c = train_set.copy() # duplicate training set

In [91]:
x_train = train_set_c[:,:-1] #features
y_train = train_set_c[:,-1] # predictions
#train_pred_m = np.tile(train_pred[np.newaxis].transpose(), (1,train_var.shape[1])) # prediction matrix
#train_coeff = np.corrcoef(train_var, train_pred_m)

In [92]:
x_train.shape, y_train.shape

((1596, 100), (1596,))

In [93]:
# normalize data
from sklearn.preprocessing import normalize

x_train_norm = normalize(x_train, axis=0)

In [111]:
correlation_matrix = []
for col in x_train_norm.T:
    correlation_matrix.append(np.correlate(col, y_train))

correlation_matrix_array = np.asarray(correlation_matrix)
np.max(correlation_matrix_array), correlation_matrix[0]

(11.54749346115739, array([7.06486455]))

In [107]:
correlation_dictionary = {}
#type(correlation_matrix[0][0])
for index, corrval in enumerate(correlation_matrix):
    #print("{} {}".format(index, corrval))
    correlation_dictionary[index] = correlation_matrix[index][0].item()

In [117]:
# instead of using pandas or some other way to get max correlation, using simple dictionary store
import operator
max_corr = max(correlation_dictionary.items(), key=operator.itemgetter(1))[0]
correlation_dictionary[max_corr]

11.54749346115739

In [118]:
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3)
sgd_reg.fit(x_train_norm, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=1000,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [42]:
# -- population: population for community: (numeric - decimal)
# -- householdsize: mean people per household (numeric - decimal)
# -- racepctblack: percentage of population that is african american (numeric - decimal)
# -- racePctWhite: percentage of population that is caucasian (numeric - decimal)
# -- racePctAsian: percentage of population that is of asian heritage (numeric - decimal)
# -- racePctHisp: percentage of population that is of hispanic heritage (numeric - decimal)
# -- agePct12t21: percentage of population that is 12-21 in age (numeric - decimal)
# -- agePct12t29: percentage of population that is 12-29 in age (numeric - decimal)
# -- agePct16t24: percentage of population that is 16-24 in age (numeric - decimal)
# -- agePct65up: percentage of population that is 65 and over in age (numeric - decimal)
# -- numbUrban: number of people living in areas classified as urban (numeric - decimal)
# -- pctUrban: percentage of people living in areas classified as urban (numeric - decimal)
# -- medIncome: median household income (numeric - decimal)
# -- pctWWage: percentage of households with wage or salary income in 1989 (numeric - decimal)
# -- pctWFarmSelf: percentage of households with farm or self employment income in 1989 (numeric - decimal)
# -- pctWInvInc: percentage of households with investment / rent income in 1989 (numeric - decimal)
# -- pctWSocSec: percentage of households with social security income in 1989 (numeric - decimal)
# -- pctWPubAsst: percentage of households with public assistance income in 1989 (numeric - decimal)
# -- pctWRetire: percentage of households with retirement income in 1989 (numeric - decimal)
# -- medFamInc: median family income (differs from household income for non-family households) (numeric - decimal)
# -- perCapInc: per capita income (numeric - decimal)
# -- whitePerCap: per capita income for caucasians (numeric - decimal)
# -- blackPerCap: per capita income for african americans (numeric - decimal)
# -- indianPerCap: per capita income for native americans (numeric - decimal)
# -- AsianPerCap: per capita income for people with asian heritage (numeric - decimal)
# -- OtherPerCap: per capita income for people with 'other' heritage (numeric - decimal)
# -- HispPerCap: per capita income for people with hispanic heritage (numeric - decimal)
# -- NumUnderPov: number of people under the poverty level (numeric - decimal)
# -- PctPopUnderPov: percentage of people under the poverty level (numeric - decimal)
# -- PctLess9thGrade: percentage of people 25 and over with less than a 9th grade education (numeric - decimal)
# -- PctNotHSGrad: percentage of people 25 and over that are not high school graduates (numeric - decimal)
# -- PctBSorMore: percentage of people 25 and over with a bachelors degree or higher education (numeric - decimal)
# -- PctUnemployed: percentage of people 16 and over, in the labor force, and unemployed (numeric - decimal)
# -- PctEmploy: percentage of people 16 and over who are employed (numeric - decimal)
# -- PctEmplManu: percentage of people 16 and over who are employed in manufacturing (numeric - decimal)
# -- PctEmplProfServ: percentage of people 16 and over who are employed in professional services (numeric - decimal)
# -- PctOccupManu: percentage of people 16 and over who are employed in manufacturing (numeric - decimal) ########
# -- PctOccupMgmtProf: percentage of people 16 and over who are employed in management or professional occupations (numeric - decimal)
# -- MalePctDivorce: percentage of males who are divorced (numeric - decimal)
# -- MalePctNevMarr: percentage of males who have never married (numeric - decimal)
# -- FemalePctDiv: percentage of females who are divorced (numeric - decimal)
# -- TotalPctDiv: percentage of population who are divorced (numeric - decimal)
# -- PersPerFam: mean number of people per family (numeric - decimal)
# -- PctFam2Par: percentage of families (with kids) that are headed by two parents (numeric - decimal)
# -- PctKids2Par: percentage of kids in family housing with two parents (numeric - decimal)
# -- PctYoungKids2Par: percent of kids 4 and under in two parent households (numeric - decimal)
# -- PctTeen2Par: percent of kids age 12-17 in two parent households (numeric - decimal)
# -- PctWorkMomYoungKids: percentage of moms of kids 6 and under in labor force (numeric - decimal)
# -- PctWorkMom: percentage of moms of kids under 18 in labor force (numeric - decimal)
# -- NumIlleg: number of kids born to never married (numeric - decimal)
# -- PctIlleg: percentage of kids born to never married (numeric - decimal)
# -- NumImmig: total number of people known to be foreign born (numeric - decimal)
# -- PctImmigRecent: percentage of _immigrants_ who immigated within last 3 years (numeric - decimal)
# -- PctImmigRec5: percentage of _immigrants_ who immigated within last 5 years (numeric - decimal)
# -- PctImmigRec8: percentage of _immigrants_ who immigated within last 8 years (numeric - decimal)
# -- PctImmigRec10: percentage of _immigrants_ who immigated within last 10 years (numeric - decimal)
# -- PctRecentImmig: percent of _population_ who have immigrated within the last 3 years (numeric - decimal)
# -- PctRecImmig5: percent of _population_ who have immigrated within the last 5 years (numeric - decimal)
# -- PctRecImmig8: percent of _population_ who have immigrated within the last 8 years (numeric - decimal)
# -- PctRecImmig10: percent of _population_ who have immigrated within the last 10 years (numeric - decimal)
# -- PctSpeakEnglOnly: percent of people who speak only English (numeric - decimal)
# -- PctNotSpeakEnglWell: percent of people who do not speak English well (numeric - decimal)
# -- PctLargHouseFam: percent of family households that are large (6 or more) (numeric - decimal)
# -- PctLargHouseOccup: percent of all occupied households that are large (6 or more people) (numeric - decimal)
# -- PersPerOccupHous: mean persons per household (numeric - decimal)
# -- PersPerOwnOccHous: mean persons per owner occupied household (numeric - decimal)
# -- PersPerRentOccHous: mean persons per rental household (numeric - decimal)
# -- PctPersOwnOccup: percent of people in owner occupied households (numeric - decimal)
# -- PctPersDenseHous: percent of persons in dense housing (more than 1 person per room) (numeric - decimal)
# -- PctHousLess3BR: percent of housing units with less than 3 bedrooms (numeric - decimal)
# -- MedNumBR: median number of bedrooms (numeric - decimal)
# -- HousVacant: number of vacant households (numeric - decimal)
# -- PctHousOccup: percent of housing occupied (numeric - decimal)
# -- PctHousOwnOcc: percent of households owner occupied (numeric - decimal)
# -- PctVacantBoarded: percent of vacant housing that is boarded up (numeric - decimal)
# -- PctVacMore6Mos: percent of vacant housing that has been vacant more than 6 months (numeric - decimal)
# -- MedYrHousBuilt: median year housing units built (numeric - decimal)
# -- PctHousNoPhone: percent of occupied housing units without phone (in 1990, this was rare!) (numeric - decimal)
# -- PctWOFullPlumb: percent of housing without complete plumbing facilities (numeric - decimal)
# -- OwnOccLowQuart: owner occupied housing - lower quartile value (numeric - decimal)
# -- OwnOccMedVal: owner occupied housing - median value (numeric - decimal)
# -- OwnOccHiQuart: owner occupied housing - upper quartile value (numeric - decimal)
# -- RentLowQ: rental housing - lower quartile rent (numeric - decimal)
# -- RentMedian: rental housing - median rent (Census variable H32B from file STF1A) (numeric - decimal)
# -- RentHighQ: rental housing - upper quartile rent (numeric - decimal)
# -- MedRent: median gross rent (Census variable H43A from file STF3A - includes utilities) (numeric - decimal)
# -- MedRentPctHousInc: median gross rent as a percentage of household income (numeric - decimal)
# -- MedOwnCostPctInc: median owners cost as a percentage of household income - for owners with a mortgage (numeric - decimal)
# -- MedOwnCostPctIncNoMtg: median owners cost as a percentage of household income - for owners without a mortgage (numeric - decimal)
# -- NumInShelters: number of people in homeless shelters (numeric - decimal)
# -- NumStreet: number of homeless people counted in the street (numeric - decimal)
# -- PctForeignBorn: percent of people foreign born (numeric - decimal)
# -- PctBornSameState: percent of people born in the same state as currently living (numeric - decimal)
# -- PctSameHouse85: percent of people living in the same house as in 1985 (5 years before) (numeric - decimal)
# -- PctSameCity85: percent of people living in the same city as in 1985 (5 years before) (numeric - decimal)
# -- PctSameState85: percent of people living in the same state as in 1985 (5 years before) (numeric - decimal)
# -- LandArea: land area in square miles (numeric - decimal)
# -- PopDens: population density in persons per square mile (numeric - decimal)
# -- PctUsePubTrans: percent of people using public transit for commuting (numeric - decimal)

# TARGET
# -- ViolentCrimesPerPop: total number of violent crimes per 100K popuation (numeric - decimal) GOAL attribute (to be predicted)

population = 
householdsize =
racepctblack = 
racePctWhite = 
racePctAsian = 
racePctHisp = 
agePct12t21 = 
agePct12t29 = 
agePct16t24 = 
agePct65up = 
numbUrban = 
pctUrban = 
medIncome = 
pctWWage = 
pctWFarmSelf = 
pctWInvInc = 
pctWSocSec = 
pctWPubAsst = 
pctWRetire = 
medFamInc = 
perCapInc = 
whitePerCap = 
blackPerCap = 
indianPerCap = 
AsianPerCap = 
OtherPerCap = 
HispPerCap = 
NumUnderPov = 
PctPopUnderPov = 
PctLess9thGrade = 
PctNotHSGrad = 
PctBSorMore = 
PctUnemployed = 
PctEmploy = 
PctEmplManu = 
PctEmplProfServ = 
PctOccupManu = 
PctOccupMgmtProf = 
MalePctDivorce = 
MalePctNevMarr = 
FemalePctDiv = 
TotalPctDiv = 
PersPerFam = 
PctFam2Par = 
PctKids2Par = 
PctYoungKids2Par = 
PctTeen2Par = 
PctWorkMomYoungKids = 
PctWorkMom = 
NumIlleg = 
PctIlleg = 
NumImmig = 
PctImmigRecent = 
PctImmigRec5 = 
PctImmigRec8 = 
PctImmigRec10 = 
PctRecentImmig = 
PctRecImmig5 = 
PctRecImmig8 = 
PctRecImmig10 = 
PctSpeakEnglOnly = 
PctNotSpeakEnglWell = 
PctLargHouseFam = 
PctLargHouseOccup = 
PersPerOccupHous = 
PersPerOwnOccHous = 
PersPerRentOccHous = 
PctPersOwnOccup = 
PctPersDenseHous = 
PctHousLess3BR = 
MedNumBR = 
HousVacant = 
PctHousOccup = 
PctHousOwnOcc = 
PctVacantBoarded = 
PctVacMore6Mos = 
MedYrHousBuilt = 
PctHousNoPhone = 
PctWOFullPlumb = 
OwnOccLowQuart = 
OwnOccMedVal = 
OwnOccHiQuart = 
RentLowQ = 
RentMedian = 
RentHighQ = 
MedRent = 
MedRentPctHousInc = 
MedOwnCostPctInc = 
MedOwnCostPctIncNoMtg = 
NumInShelters = 
NumStreet = 
PctForeignBorn = 
PctBornSameState = 
PctSameHouse85 = 
PctSameCity85 = 
PctSameState85 = 
LandArea = 
PopDens = 
PctUsePubTrans = 

ViolentCrimesPerPop = sgd_reg.predict(...)

array([2.5e+01, 8.0e+00, 3.0e-02, 3.1e-01, 3.0e-02, 9.4e-01, 7.0e-02,
       5.0e-02, 3.1e-01, 4.4e-01, 2.7e-01, 5.0e-01, 4.0e-02, 1.0e+00,
       3.4e-01, 5.3e-01, 1.7e-01, 5.0e-01, 5.2e-01, 3.2e-01, 6.6e-01,
       3.8e-01, 3.7e-01, 3.6e-01, 3.6e-01, 2.1e-01, 3.1e-01, 2.9e-01,
       3.5e-01, 2.0e-02, 2.1e-01, 2.2e-01, 3.6e-01, 3.0e-01, 3.0e-01,
       5.4e-01, 4.3e-01, 4.0e-01, 3.9e-01, 3.9e-01, 5.8e-01, 4.7e-01,
       5.0e-01, 5.6e-01, 3.9e-01, 5.7e-01, 6.2e-01, 5.8e-01, 6.2e-01,
       4.5e-01, 5.2e-01, 1.0e-02, 1.9e-01, 1.0e-02, 3.0e-01, 2.9e-01,
       3.8e-01, 3.5e-01, 1.1e-01, 1.0e-01, 1.2e-01, 1.0e-01, 8.4e-01,
       8.0e-02, 1.3e-01, 1.1e-01, 3.2e-01, 4.6e-01, 1.8e-01, 5.6e-01,
       7.0e-02, 5.8e-01, 0.0e+00, 3.0e-02, 8.1e-01, 4.9e-01, 1.1e-01,
       3.8e-01, 3.3e-01, 1.3e-01, 9.0e-02, 3.2e-01, 3.1e-01, 3.1e-01,
       4.0e-01, 4.1e-01, 4.4e-01, 3.7e-01, 4.2e-01, 4.1e-01, 4.2e-01,
       0.0e+00, 0.0e+00, 1.6e-01, 7.9e-01, 6.5e-01, 8.8e-01, 8.1e-01,
       5.0e-02, 1.4e