# Pre-processing data
https://towardsdatascience.com/bayesian-linear-regression-in-python-using-machine-learning-to-predict-student-grades-part-1-7d0ad817fca5

In [81]:
# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import percentileofscore
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
# Visuals
# import seaborn as sns
# from scipy import stats
# %matplotlib inline

**Read in Data**
- NOTE: Dropping spatial variables for the scope of project
- Also, there was little correlation with spatial components in preliminary dummy coding correlation check

In [82]:
# Read data and assign NA to missing values
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/00211/CommViolPredUnnormalizedData.txt',
                   sep='\s*,\s*',encoding='latin-1',engine='python', na_values=["?"])


data.columns = ['communityname','state','countyCode','communityCode','fold','population','householdsize','racepctblack',
           'racePctWhite','racePctAsian','racePctHisp','agePct12t21','agePct12t29','agePct16t24','agePct65up',
           'numbUrban','pctUrban','medIncome','pctWWage','pctWFarmSelf','pctWInvInc','pctWSocSec','pctWPubAsst',
           'pctWRetire','medFamInc','perCapInc','whitePerCap','blackPerCap','indianPerCap','AsianPerCap','OtherPerCap',
           'HispPerCap','NumUnderPov','PctPopUnderPov','PctLess9thGrade','PctNotHSGrad','PctBSorMore','PctUnemployed',
           'PctEmploy','PctEmplManu','PctEmplProfServ','PctOccupManu','PctOccupMgmtProf','MalePctDivorce',
           'MalePctNevMarr','FemalePctDiv','TotalPctDiv','PersPerFam','PctFam2Par','PctKids2Par','PctYoungKids2Par',
           'PctTeen2Par','PctWorkMomYoungKids','PctWorkMom','NumKidsBornNeverMar','PctKidsBornNeverMar','NumImmig',
           'PctImmigRecent','PctImmigRec5','PctImmigRec8','PctImmigRec10','PctRecentImmig','PctRecImmig5',
           'PctRecImmig8','PctRecImmig10','PctSpeakEnglOnly','PctNotSpeakEnglWell','PctLargHouseFam',
           'PctLargHouseOccup','PersPerOccupHous','PersPerOwnOccHous','PersPerRentOccHous','PctPersOwnOccup',
           'PctPersDenseHous','PctHousLess3BR','MedNumBR','HousVacant','PctHousOccup','PctHousOwnOcc','PctVacantBoarded',
           'PctVacMore6Mos','MedYrHousBuilt','PctHousNoPhone','PctWOFullPlumb','OwnOccLowQuart','OwnOccMedVal',
           'OwnOccHiQuart','OwnOccQrange','RentLowQ','RentMedian','RentHighQ','RentQrange','MedRent','MedRentPctHousInc',
           'MedOwnCostPctInc','MedOwnCostPctIncNoMtg','NumInShelters','NumStreet','PctForeignBorn','PctBornSameState',
           'PctSameHouse85','PctSameCity85','PctSameState85','LemasSwornFT','LemasSwFTPerPop','LemasSwFTFieldOps',
           'LemasSwFTFieldPerPop','LemasTotalReq','LemasTotReqPerPop','PolicReqPerOffic','PolicPerPop',
           'RacialMatchCommPol','PctPolicWhite','PctPolicBlack','PctPolicHisp','PctPolicAsian','PctPolicMinor',
           'OfficAssgnDrugUnits','NumKindsDrugsSeiz','PolicAveOTWorked','LandArea','PopDens','PctUsePubTrans',
           'PolicCars','PolicOperBudg','LemasPctPolicOnPatr','LemasGangUnitDeploy','LemasPctOfficDrugUn',
           'PolicBudgPerPop','murders','murdPerPop','rapes','rapesPerPop','robberies','robbbPerPop','assaults',
           'assaultPerPop','burglaries','burglPerPop','larcenies','larcPerPop','autoTheft','autoTheftPerPop','arsons',
           'arsonsPerPop','ViolentCrimesPerPop','nonViolPerPop']

In [83]:
# Drop spatial variables
data = data.drop(columns=['countyCode', 'communityCode', 'communityname', 'state'])

# Drop other predictor variables from features
data = data.drop(columns=['murders','murdPerPop','rapes','rapesPerPop','robberies','robbbPerPop','assaults',
    'assaultPerPop','burglaries','burglPerPop','larcenies','larcPerPop','autoTheft','autoTheftPerPop','arsons',
    'arsonsPerPop','nonViolPerPop'])

**Check data**
- NOTE: We have no categorical or string data.

In [84]:
data.head(5)

Unnamed: 0,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,1,23123,2.82,0.8,95.57,3.44,0.85,11.01,21.3,10.48,...,10.6,2186.7,3.84,,,,,0.0,,127.56
1,1,29344,2.43,0.74,94.33,3.43,2.35,11.36,25.88,11.01,...,10.6,2780.9,4.37,,,,,0.0,,218.59
2,1,16656,2.4,1.7,97.35,0.5,0.7,12.55,25.2,12.19,...,5.2,3217.7,3.31,,,,,0.0,,306.64
3,1,11245,2.76,0.53,89.16,1.17,0.52,24.46,40.53,28.69,...,11.5,974.2,0.38,,,,,0.0,,
4,1,140494,2.45,2.51,95.65,0.9,0.95,18.09,32.89,20.04,...,70.4,1995.7,0.97,,,,,0.0,,442.95


In [85]:
data.shape

(2214, 126)

In [86]:
data.describe()

Unnamed: 0,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
count,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,...,2214.0,2214.0,2214.0,343.0,343.0,343.0,343.0,2214.0,343.0,1993.0
mean,5.496387,53136.57,2.70715,9.338699,83.976296,2.668473,7.952918,14.44673,27.647642,13.976518,...,27.429404,2784.258672,3.038148,185.478134,32176020.0,87.130933,4.285714,0.980605,153577.9,589.353914
std,2.871984,204664.6,0.334091,14.249369,16.421952,4.474113,14.592557,4.519448,6.181506,5.971745,...,109.84651,2829.562169,4.912029,318.542834,110456600.0,10.349612,4.064538,2.877702,203040.9,614.816135
min,1.0,10005.0,1.6,0.0,2.68,0.03,0.12,4.58,9.38,4.64,...,0.9,10.0,0.0,20.0,2380215.0,10.85,0.0,0.0,15260.4,0.0
25%,3.0,14371.25,2.5,0.86,76.31,0.62,0.93,12.25,24.42,11.32,...,7.3,1181.85,0.36,54.0,7275060.0,84.295,0.0,0.0,88094.35,161.73
50%,5.0,22803.0,2.66,2.87,90.35,1.23,2.18,13.62,26.78,12.545,...,13.7,2029.6,1.22,86.0,11164110.0,89.58,5.0,0.0,114582.0,374.07
75%,8.0,43043.5,2.85,11.1575,96.2275,2.67,7.81,15.36,29.2075,14.3475,...,26.1,3322.25,3.3575,189.5,20147540.0,93.2,10.0,0.0,155655.7,794.41
max,10.0,7322564.0,5.28,96.67,99.63,57.46,95.29,54.4,70.51,63.62,...,3569.8,44229.9,54.33,3187.0,1617293000.0,99.94,10.0,48.44,2422367.0,4877.06


In [87]:
# Print the value counts for categorical columns
for col in data.columns:
    if data[col].dtype == 'object':
        print('\nColumn Name:', col,)
        print(data[col].value_counts())

# There are none

**Addressing missing values**

 - If 25% of a columns data is NA, that column is removed
 - For all other missing values, the column mean is used

In [88]:
# Establish how much data is missing
data.isnull().sum().sort_values(ascending=False).head()

LemasSwFTFieldPerPop    1871
LemasTotalReq           1871
PolicReqPerOffic        1871
PolicPerPop             1871
RacialMatchCommPol      1871
dtype: int64

In [89]:
variables_na = []
for i in data:
    if data[i].isnull().sum() > 554: #find the columns with large amounts of na values
        str(i)
        variables_na.append(i)
        
data = data.drop(columns=variables_na) #delete those

In [90]:
data.shape

(2214, 104)

In [91]:
# Impute the missing values using SimpleImputer in sklearn.impute
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(data)
data = pd.DataFrame(data=imp.transform(data), columns=data.columns)

# Check if there is still missing data
data.isnull().sum().sort_values(ascending=False).head()

ViolentCrimesPerPop    0
LemasPctOfficDrugUn    0
HispPerCap             0
NumUnderPov            0
PctPopUnderPov         0
dtype: int64

In [92]:
data.head(5)

Unnamed: 0,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,...,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,ViolentCrimesPerPop
0,1.0,23123.0,2.82,0.8,95.57,3.44,0.85,11.01,21.3,10.48,...,8.3,77.17,71.27,90.22,96.12,10.6,2186.7,3.84,0.0,127.56
1,1.0,29344.0,2.43,0.74,94.33,3.43,2.35,11.36,25.88,11.01,...,5.0,44.77,36.6,61.26,82.85,10.6,2780.9,4.37,0.0,218.59
2,1.0,16656.0,2.4,1.7,97.35,0.5,0.7,12.55,25.2,12.19,...,2.04,88.71,56.7,90.17,96.24,5.2,3217.7,3.31,0.0,306.64
3,1.0,11245.0,2.76,0.53,89.16,1.17,0.52,24.46,40.53,28.69,...,1.74,73.75,42.22,60.34,89.02,11.5,974.2,0.38,0.0,589.353914
4,1.0,140494.0,2.45,2.51,95.65,0.9,0.95,18.09,32.89,20.04,...,1.49,64.35,42.29,70.61,85.66,70.4,1995.7,0.97,0.0,442.95


**Normalize the data set**
- Scales each column to values between 0-1 to be better interpreted by the models
- CHECK: how normalisation impacts model accuracy


In [93]:
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)
data.head()
data.describe()

Unnamed: 0,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,...,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,ViolentCrimesPerPop
count,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,...,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0,2214.0
mean,0.499599,0.005898,0.300856,0.096604,0.838538,0.045942,0.082304,0.198048,0.298833,0.1583,...,0.118878,0.634254,0.594971,0.720582,0.824234,0.007433,0.062738,0.05592,0.020244,0.120842
std,0.319109,0.027988,0.090786,0.147402,0.169386,0.077906,0.153331,0.090716,0.101121,0.10125,...,0.139822,0.193923,0.157594,0.158517,0.108684,0.030779,0.063988,0.090411,0.059408,0.119603
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.222222,0.000597,0.244565,0.008896,0.759464,0.010273,0.008511,0.153954,0.246033,0.113259,...,0.031219,0.50191,0.496965,0.642628,0.780826,0.001793,0.026501,0.006626,0.0,0.038042
50%,0.444444,0.00175,0.288043,0.029689,0.904281,0.020895,0.021645,0.181453,0.284639,0.134028,...,0.068582,0.66848,0.604526,0.750947,0.85284,0.003587,0.045672,0.022455,0.0,0.091446
75%,0.777778,0.004518,0.339674,0.115418,0.964905,0.045969,0.080803,0.216379,0.32435,0.16459,...,0.150573,0.78843,0.702795,0.833151,0.897272,0.007061,0.074904,0.061798,0.0,0.148042
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


**Check target variable**
- Distribution
- Percentiles

In [94]:
# Check violent crimes
data['ViolentCrimesPerPop'].value_counts()
data['ViolentCrimesPerPop'].describe()

count    2214.000000
mean        0.120842
std         0.119603
min         0.000000
25%         0.038042
50%         0.091446
75%         0.148042
max         1.000000
Name: ViolentCrimesPerPop, dtype: float64

In [95]:
# # Plot
# def histo_plot(x):
#     plt.hist(x, color=(0.2, 0.4, 0.6, 0.6), alpha=0.5)
#     plt.title("'{var_name}' Histogram".format(var_name=x.name))
#     plt.ylabel("Freq")
#     plt.xlabel("Value")
#     plt.show()
# histo_plot(data['ViolentCrimesPerPop'])


**Add a Crime Classification column**
- Using column quartiles as a suitable threshold base
- CHECK: how classification complexity impacts models
- NOTE: Using +3 classes would be more informative in our model explorations

In [96]:
# 3 fold classification based on quartiles
data['HighCrime'] = pd.qcut(data['ViolentCrimesPerPop'], 4, [0, 1, 2, 3]) # so we take the top 25%
# Binary Classification
data['HighCrime'] = [0 if x < 3 else 1 for x in data['HighCrime']]

# Reformat column
data['HighCrime'] = data['HighCrime'].astype(float)

**Check Variable Correlations**
- No dummy encoding needed because we have no categorical variables

In [97]:
# Correlations of numerical values
data.corr()['HighCrime'].sort_values()

PctKids2Par           -0.587835
racePctWhite          -0.572887
PctFam2Par            -0.546821
PctYoungKids2Par      -0.529816
PctTeen2Par           -0.518076
                         ...   
pctWPubAsst            0.458665
racepctblack           0.486424
PctKidsBornNeverMar    0.570723
ViolentCrimesPerPop    0.784053
HighCrime              1.000000
Name: HighCrime, Length: 105, dtype: float64

**Select Top 100 correlated features for analysis**
- Split data into Training, Testing and Validation sets
- CHECK: how feature complexity impacts model accuracy
- NOTE: When variables <=75 or >=125, the models seems to do worse. 100 seems like a good point to avoid curse of dimensionality.

In [98]:
def format_data(df):
    '''
    Takes in a dataframe, finds the most correlated variables with crime and
    returns training and testing datasets
    '''
    df = df.drop(columns=['ViolentCrimesPerPop'])

    # Find correlations with the Crime
    most_correlated = df.corr().abs()['HighCrime'].sort_values(ascending=False)

    # Maintain the top 100 most correlation features with Crime
    most_correlated = most_correlated[:101]
    df = df.loc[:, most_correlated.index]

    # split data into x & y
    y = df['HighCrime']
    x = df.drop(columns=['HighCrime'])

    return df, x, y

In [99]:
# Format data
df, x, y = format_data(data)

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [100]:
print(x_train.shape)
print(x_test.shape)

(1549, 100)
(665, 100)
