In [15]:
import math

from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
import scipy
import statsmodels.formula.api as smf
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action='ignore', module='scipy', message='^internal gelsd')

In [16]:
df = pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls')

In [17]:
# Delete first three rows
df = df.drop([0,1,2], axis=0)

# Make first row the column headers
df = df.reset_index(drop=True)
df.columns = df.iloc[0]
df = df.drop([0], axis=0)
df = df.reset_index(drop=True)

# Rename all column headers
df.columns = ['City', 'Population', 'Violent Crime', 'Murder and Nonnegligent Manslaughter', 'Rape (revised definition)', 'Rape (legacy definition)', 'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary', 'Larceny-Theft', 'Motor Vehicle Theft', 'Arson']

# Remove entire Unnamed: 4 or 'Rape revised defintion'
df = df.drop('Rape (revised definition)', axis=1)

# Remove null objects
df = df.drop([348, 349, 350], axis=0)
df = df.drop('Arson', axis=1)
df = df.dropna(how='all')

# Convert all columns from object to integer values
df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']] = df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']].astype(int)

# Remove New York and buffalo from data because it skews the data. Although the data is correct
df[df['City'] == 'New York']
df = df[df['City']!='New York']

# Change name of Property Crime so that it's one word
df['Propertycrime'] = df['Property Crime']
df = df.drop(columns=['Property Crime'])

In [18]:
# Change name of variables to one word
df['Larcenytheft'] = df['Larceny-Theft']
df = df.drop(columns=['Larceny-Theft'])

df['Motorvehicletheft'] = df['Motor Vehicle Theft']
df = df.drop(columns=['Motor Vehicle Theft'])

In [19]:
#Make variables binary
df['Robbery_binary'] = np.where(df['Robbery']>0, '1', '0')

In [20]:
# Propertycrime = Robbery_binary + Burglary + Larcenytheft + Motorvehicletheft
regr = linear_model.LinearRegression()
X = df['Propertycrime'].values.reshape(-1, 1)
Y = df[['Robbery_binary', 'Burglary', 'Larcenytheft', 'Motorvehicletheft']]
regr.fit(a,b)

# Inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:\n', regr.score(X,Y))


Coefficients: 
 [[1.26560102e-04]
 [2.50146661e-01]
 [6.87755598e-01]
 [6.20977407e-02]]

Intercept: 
 [  0.5506028  -24.32170443  33.6910277   -9.36932327]

R-squared:
 0.9843468966715793


In [21]:
# Test for significance in parameters
linear_formula = 'Propertycrime ~ Robbery_binary+Burglary+Larcenytheft+Motorvehicletheft'

# Fit the model to our data using formula
lm = smf.ols(formula=linear_formula, data=df).fit()

In [22]:
lm.params

Intercept              0.000
Robbery_binary[T.1]   -0.000
Burglary               1.000
Larcenytheft           1.000
Motorvehicletheft      1.000
dtype: float64

In [23]:
lm.pvalues

Intercept             0.000
Robbery_binary[T.1]   0.000
Burglary              0.000
Larcenytheft          0.000
Motorvehicletheft     0.000
dtype: float64