In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import scipy
import sklearn
from sklearn import linear_model
# Render floats with three decimal places in every DataFrame/Series display below.
pd.options.display.float_format = '{:.3f}'.format
%matplotlib inline

import warnings
# Suppress warnings issued from scipy modules whose message starts with "internal gelsd".
# NOTE(review): presumably the LAPACK-driver warning triggered by the least-squares fit
# further down — confirm it still occurs with the installed scipy/sklearn versions.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

In [2]:
# Load the crime table from the Excel workbook.
# skiprows=4 drops the first four sheet rows so that the fifth row supplies the column names
# (presumably title/notes rows above the real header — confirm against the workbook).
# No `encoding` argument: it is not a read_excel parameter (Excel files are not decoded as
# text), and current pandas raises TypeError on the unknown keyword.
crime = pd.read_excel("Validate Regression.xls", skiprows=4)

In [3]:
# Preview the first rows to check that the header row and the columns were read as expected.
crime.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Abbeville,2888.0,3.0,0.0,,0.0,2.0,1.0,22.0,3.0,16.0,3.0,0.0
1,Adairsville,4686.0,13.0,0.0,,0.0,1.0,12.0,52.0,15.0,31.0,6.0,0.0
2,Adel,5240.0,18.0,0.0,,5.0,5.0,8.0,189.0,64.0,121.0,4.0,0.0
3,Adrian,656.0,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0
4,Alapaha,646.0,5.0,0.0,,0.0,1.0,4.0,6.0,0.0,6.0,0.0,0.0


In [4]:
# Swap the embedded newline in these two headers for an underscore so the names are valid
# identifiers in a model formula (Property_crime is the response variable of the models below).
# index=str additionally converts the row labels to strings.
crime_change = crime.rename(
    index=str,
    columns={
        "Property\ncrime": "Property_crime",
        "Violent\ncrime": "Violent_crime",
    },
)

In [5]:
# Preview the renamed frame to confirm the two new column labels took effect.
crime_change.head()

Unnamed: 0,City,Population,Violent_crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property_crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Abbeville,2888.0,3.0,0.0,,0.0,2.0,1.0,22.0,3.0,16.0,3.0,0.0
1,Adairsville,4686.0,13.0,0.0,,0.0,1.0,12.0,52.0,15.0,31.0,6.0,0.0
2,Adel,5240.0,18.0,0.0,,5.0,5.0,8.0,189.0,64.0,121.0,4.0,0.0
3,Adrian,656.0,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0
4,Alapaha,646.0,5.0,0.0,,0.0,1.0,4.0,6.0,0.0,6.0,0.0,0.0


In [6]:
# List the exact column labels; several still contain embedded newlines from the sheet header.
crime_change.columns

Index(['City', 'Population', 'Violent_crime',
       'Murder and\nnonnegligent\nmanslaughter',
       'Rape\n(revised\ndefinition)1', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property_crime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson3'],
      dtype='object')

In [7]:
# Replace missing values in each numeric column with that column's mean.
# numeric_only=True keeps the text column (City) out of the mean computation; without it
# pandas >= 2.0 raises TypeError instead of silently skipping non-numeric columns.
# A column that is entirely missing has a NaN mean and therefore stays NaN.
# Reassigning (rather than inplace=True) keeps the cell free of hidden in-place mutation.
crime_change = crime_change.fillna(crime_change.mean(numeric_only=True))

In [8]:
# Engineered regression features, added as new columns on crime_change:
#   PopulationSquared - quadratic population term
#   Did<Crime>Exist   - 1 if at least one such offence was recorded for the city, else 0
#
# The murder and rape columns are addressed by label instead of by position
# (previously iloc[:, 3] and iloc[:, 5]) so that a change in column order cannot
# silently point an indicator at the wrong crime category. The labels contain the
# embedded newlines of the original spreadsheet header.
MURDER_COL = 'Murder and\nnonnegligent\nmanslaughter'
RAPE_LEGACY_COL = 'Rape\n(legacy\ndefinition)2'

crime_change['PopulationSquared'] = crime_change['Population'] * crime_change['Population']
crime_change['DidMurderExist'] = np.where(crime_change[MURDER_COL] > 0, 1, 0)
crime_change['DidRobberyExist'] = np.where(crime_change['Robbery'] > 0, 1, 0)
crime_change['DidArsonExist'] = np.where(crime_change['Arson3'] > 0, 1, 0)
crime_change['DidRapeExist'] = np.where(crime_change[RAPE_LEGACY_COL] > 0, 1, 0)

In [19]:
# Preview the frame with the engineered feature columns appended on the right.
crime_change.head()

Unnamed: 0,City,Population,Violent_crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property_crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3,PopulationSquared,DidMurderExist,DidRobberyExist,DidArsonExist,DidRapeExist
0,Abbeville,2888.0,3.0,0.0,,0.0,2.0,1.0,22.0,3.0,16.0,3.0,0.0,8340544.0,0,1,0,0
1,Adairsville,4686.0,13.0,0.0,,0.0,1.0,12.0,52.0,15.0,31.0,6.0,0.0,21958596.0,0,1,0,0
2,Adel,5240.0,18.0,0.0,,5.0,5.0,8.0,189.0,64.0,121.0,4.0,0.0,27457600.0,0,1,0,1
3,Adrian,656.0,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,430336.0,0,0,0,0
4,Alapaha,646.0,5.0,0.0,,0.0,1.0,4.0,6.0,0.0,6.0,0.0,0.0,417316.0,0,1,0,0


In [24]:
# Ordinary least-squares fit of property crime on the engineered features (scikit-learn).
# Predictors: the four crime indicators plus population and its square.
X = crime_change[[
    'DidArsonExist',
    'DidRapeExist',
    'DidRobberyExist',
    'DidMurderExist',
    'Population',
    'PopulationSquared',
]]
# Target as a single-column 2-D array, which is why coef_ prints as a nested array.
Y = crime_change['Property_crime'].values.reshape(-1, 1)

# fit() returns the estimator itself, so construction and fitting can be chained.
regr = linear_model.LinearRegression().fit(X, Y)

print('Here are the coefficients:', regr.coef_)
print('Here is the intercept:', regr.intercept_)
print('The r-squared:', regr.score(X, Y))

Here are the coefficients: [[7.58531099e+01 5.26358189e+01 6.89700121e+01 2.33228422e+02
  2.94664879e-02 6.75344580e-08]]
Here is the intercept: [-52.38251244]
The r-squared: 0.9412243447630947


Original Linear Model

In [11]:
# Full model: property crime regressed on all four crime indicators, population and
# population squared, fitted through the statsmodels formula interface.
linear_formula = 'Property_crime ~ DidArsonExist+DidRobberyExist+DidRapeExist+DidMurderExist+Population+PopulationSquared'
lm = smf.ols(linear_formula, crime_change).fit()

In [12]:
# Estimated coefficients of the full model.
lm.params

Intercept           -52.383
DidArsonExist        75.853
DidRobberyExist      68.970
DidRapeExist         52.636
DidMurderExist      233.229
Population            0.029
PopulationSquared     0.000
dtype: float64

In [20]:
# Every "Did...Exist" indicator except DidMurderExist has a p-value above 0.05, so at the
# 5% level we cannot conclude that those indicators contribute significantly to the model.
lm.pvalues

Intercept           0.381
DidArsonExist       0.275
DidRobberyExist     0.379
DidRapeExist        0.508
DidMurderExist      0.013
Population          0.000
PopulationSquared   0.000
dtype: float64

In [21]:
print(lm.rsquared)  # R-squared of the original (full) model

0.9412243447630896


Revised Model after Predictors with P-Values over 0.05 Were Removed

In [None]:
# Reduced model: keeps only the predictors whose p-values were below 0.05 in the full model.
# Note that this overwrites the linear_formula variable used for the full model.
linear_formula = 'Property_crime ~ DidMurderExist+Population+PopulationSquared'
ma = smf.ols(linear_formula, crime_change).fit()

In [15]:
# Estimated coefficients of the reduced model.
ma.params

Intercept            25.555
DidMurderExist      294.138
Population            0.031
PopulationSquared     0.000
dtype: float64

In [16]:
# p-values of the reduced model's coefficients.
ma.pvalues

Intercept           0.507
DidMurderExist      0.001
Population          0.000
PopulationSquared   0.000
dtype: float64

In [22]:
# R-squared of the revised model, i.e. after the predictors with p-values above 0.05 were removed.
print(ma.rsquared)

0.9404902722056525
