In [49]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

## Multivariable Regression

The 'population' variable is already set for you, but you will need to create the last three features. 

Robbery and Murder are currently continuous variables. For this model, please use these variables to create categorical features where values greater than 0 are coded 1, and values equal to 0 are coded 0.

You'll use this data and model in a later assignment; for now, just write the code you need to get the data ready.

Don't forget basic data cleaning procedures, either! Do some graphing to see if there are any anomalous cases, and decide how you want to deal with them. (link to cleaning data here https://courses.thinkful.com/data-201v1/assignment/1.3.4)

$$ \text{Property crime} = \alpha + \text{Population} + \text{Population}^2 + \text{Murder} + \text{Robbery} $$

In [50]:
# Load the 2013 New York by-city offenses table from its Excel workbook.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the workbook (ideally relative to a project data directory).
CRIME_DATA_PATH = "/Users/bradleyrhyins/Desktop/Thinkful-Projects/table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls"

# read_excel has no delimiter option: the former `sep=` keyword is a CSV
# concept and is rejected by pandas versions whose read_excel does not accept
# arbitrary extra keywords, so it is dropped here.
df = pd.read_excel(CRIME_DATA_PATH)

In [51]:
# Replace every missing cell with 0 so later comparisons and the regression
# see no NaN values; rebinding keeps the cell free of in-place mutation.
df = df.fillna(0)

In [None]:
# Exploration starts here: list the column labels of the loaded table.
df.columns

In [52]:
# Preview the first rows to inspect the crime counts before any recoding.
display(df.head())
# Robbery and Murder are continuous count variables here. The assignment asks
# for categorical features instead: values greater than 0 are coded 1, and
# values equal to 0 are coded 0.

Unnamed: 0,City,Population,Violent crime,Murder,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Adams Village,1861.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,2.0,10.0,0.0,0.0
1,Addison Town and Village,2577.0,3.0,0.0,0.0,0.0,0.0,3.0,24.0,3.0,20.0,1.0,0.0
2,Akron Village,2846.0,3.0,0.0,0.0,0.0,0.0,3.0,16.0,1.0,15.0,0.0,0.0
3,Albany,97956.0,791.0,8.0,0.0,30.0,227.0,526.0,4090.0,705.0,3243.0,142.0,0.0
4,Albion Village,6388.0,23.0,0.0,0.0,3.0,4.0,16.0,223.0,53.0,165.0,5.0,0.0


In [70]:
# Recode crime counts as binary indicators: 1 if at least one incident was
# reported, 0 otherwise. The comparison is vectorized and astype(int) turns
# the boolean mask into 0/1 integers. Re-running the cell is harmless because
# a 0/1 column maps to itself.
df["Robbery"] = (df["Robbery"] > 0).astype(int)
# The Murder indicator must overwrite "Murder" itself; it was previously
# written to a misspelled "Properyci" column, leaving Murder as a raw count.
df["Murder"] = (df["Murder"] > 0).astype(int)
df["Burglary"] = (df["Burglary"] > 0).astype(int)

In [71]:
# Plain-text preview of the recoded frame. The call form of print works on
# both Python 2 and Python 3 and matches the other print(...) calls in this
# notebook; the bare `print x` statement is a SyntaxError on Python 3.
print(df.head())

                       City  Population  Violent\ncrime  Murder  \
0             Adams Village    1861.000           0.000       0   
1  Addison Town and Village    2577.000           3.000       0   
2             Akron Village    2846.000           3.000       0   
3                    Albany   97956.000         791.000       1   
4            Albion Village    6388.000          23.000       0   

   Rape\n(revised\ndefinition)1  Rape\n(legacy\ndefinition)2  Robbery  \
0                         0.000                        0.000        0   
1                         0.000                        0.000        0   
2                         0.000                        0.000        0   
3                         0.000                       30.000        1   
4                         0.000                        3.000        1   

   Aggravated\nassault  Property\ncrime  Burglary  Larceny-\ntheft  \
0                0.000           12.000         1           10.000   
1                3

In [72]:
# Interaction feature: the element-wise product of the Robbery and Burglary
# columns, computed as a vectorized Series multiplication.
# NOTE(review): the column is named "MurderBurglary" but is built from Robbery
# and Burglary -- confirm which pairing is intended before renaming, since the
# model cell selects this column by name.
df["MurderBurglary"] = df["Robbery"] * df["Burglary"]

In [73]:
# Build the design matrix and target, then fit a linear regression.
# NOTE(review): the markdown above specifies Property crime as the outcome with
# Population^2 as a feature; this cell instead regresses Murder on the four
# columns below -- confirm that is intentional.
feature_cols = ['Population', 'Robbery', 'Burglary', 'MurderBurglary']
X = df[feature_cols]

# The target is reshaped into a single-column 2-D array (one row per record).
Y = df['Murder'].values.reshape(-1, 1)

regr = linear_model.LinearRegression()
regr.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [82]:
# Report the fitted coefficients, intercept, and in-sample R-squared.
# Each print receives one pre-formatted string: with several comma-separated
# arguments the Python 2 print statement treats the parentheses as a tuple and
# prints its repr (escaped "\n" and all) instead of the formatted text.
print('\nCoefficients: \n{}'.format(regr.coef_))
print('\nIntercept: \n{}'.format(regr.intercept_))
print('\nR-squared:')
# score() is evaluated on the same X and Y the model was fitted on.
print(regr.score(X, Y))

('\nCoefficients: \n', array([[ 1.10269027e-07, -3.79325454e-05,  7.99392596e-03,
         2.15913874e-01]]))
('\nIntercept: \n', array([-0.00014975]))

R-squared:
0.12086997252171693


In [83]:
# Predict the target for the same feature matrix X the model was fitted on
# (in-sample predictions), keeping the result in y1.
y1 = regr.predict(X)

In [78]:
# List the column labels of the working frame, including any engineered ones.
df.columns

Index([                        u'City',                   u'Population',
                     u'Violent\ncrime',                       u'Murder',
       u'Rape\n(revised\ndefinition)1',  u'Rape\n(legacy\ndefinition)2',
                            u'Robbery',          u'Aggravated\nassault',
                    u'Property\ncrime',                     u'Burglary',
                    u'Larceny-\ntheft',        u'Motor\nvehicle\ntheft',
                             u'Arson3',               u'MurderBurglary'],
      dtype='object')

In [79]:
# (rows, columns) of the working frame.
df.shape

(351, 14)

In [80]:
# Per-column dtypes and non-null counts for the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 14 columns):
City                          351 non-null object
Population                    351 non-null float64
Violent
crime                 351 non-null float64
Murder                        351 non-null int64
Rape
(revised
definition)1    351 non-null float64
Rape
(legacy
definition)2     351 non-null float64
Robbery                       351 non-null int64
Aggravated
assault            351 non-null float64
Property
crime                351 non-null float64
Burglary                      351 non-null int64
Larceny-
theft                351 non-null float64
Motor
vehicle
theft           351 non-null float64
Arson3                        351 non-null float64
MurderBurglary                351 non-null int64
dtypes: float64(9), int64(4), object(1)
memory usage: 38.5+ KB


In [81]:
# Report the fitted coefficients, intercept, and in-sample R-squared.
# Each print receives one pre-formatted string: with several comma-separated
# arguments the Python 2 print statement treats the parentheses as a tuple and
# prints its repr (escaped "\n" and all) instead of the formatted text.
print('\nCoefficients: \n{}'.format(regr.coef_))
print('\nIntercept: \n{}'.format(regr.intercept_))
print('\nR-squared:')
# score() is evaluated on the same X and Y the model was fitted on.
print(regr.score(X, Y))

('\nCoefficients: \n', array([[ 1.10269027e-07, -3.79325454e-05,  7.99392596e-03,
         2.15913874e-01]]))
('\nIntercept: \n', array([-0.00014975]))

R-squared:
0.12086997252171693
