In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

- Importing pandas, matplotlib, seaborn, and numpy

# Data Cleaning

In [2]:
# Load the crime-grade table, drop exact duplicate rows, and cast the
# zipcode and crime columns to integers in a single pass.
cg = (
    pd.read_csv("CrimeGradeV3.csv")
    .drop_duplicates()
    .astype({col: int for col in ('zipcode', 'overall', 'property', 'violent', 'other')})
)
cg.info()

- Changing data types to integers

In [3]:
# Load the house-price table, drop exact duplicate rows (before the address
# column is removed), then cast price and zipcode to integers.
hp = (
    pd.read_csv("HousePricesV3.csv")
    .drop_duplicates()
    .drop(columns='address')
    .astype({'price': int, 'zipcode': int})
)
hp.info()
hp.head()

- Changing data types to integers
- Dropping the Address column as it is unnecessary for analysis

In [4]:
# Load the zip-code database, drop the columns listed below, rename
# 'zip' to 'zipcode', and cast it to an integer.
zcd = (
    pd.read_csv("zip_code_database - Sheet2.csv")
    .drop(columns=[
        'type', 'decommissioned', 'primary_city', 'acceptable_cities',
        'unacceptable_cities', 'state', 'county', 'timezone', 'area_codes',
        'world_region', 'country', 'latitude', 'longitude',
    ])
    .rename(columns={'zip': 'zipcode'})
    .astype({'zipcode': int})
)
zcd.info()

- Renaming columns zip to zipcode
- Changing zip column to type int

In [5]:
# Mean house price per zipcode, with zipcode kept as a regular column.
grouped = hp.groupby('zipcode')['price'].mean().reset_index()

# Show whole-number prices for this table only. Assigning
# pd.options.display.float_format globally would also strip the decimals from
# every later float display (z-scores, correlation matrix), so the format is
# scoped to this one output.
with pd.option_context('display.float_format', '{:.0f}'.format):
    display(grouped.head())

- Grouping by zipcode and taking the mean price to create a table of zipcode and average price.

In [6]:
# Inner-join the per-zipcode mean prices with the crime grades on the shared
# 'zipcode' key.
mergedData = pd.merge(grouped, cg, on='zipcode')
mergedData.head()

- Merging grouped table and crime grade table on column zipcode

# Checking for Outliers

In [7]:
# Box plot of the 'overall' column, used to eyeball outliers.
sns.boxplot(data=cg.loc[:, "overall"])

### Observations:
- There is an outlier here, but it is within the correct range and is a result of the small sample size.

In [8]:
# Box plot of the 'violent' column, used to eyeball outliers.
sns.boxplot(data=cg.loc[:, "violent"])

### Observations:
- There is an outlier here, but it is within the correct range and is a result of the small sample size.

In [9]:
# Box plot of the 'property' column, used to eyeball outliers.
sns.boxplot(data=cg.loc[:, "property"])

### Observations:
- No outliers

In [10]:
# Box plot of the 'other' column, used to eyeball outliers.
sns.boxplot(data=cg.loc[:, "other"])

### Observations:
- No outliers

In [11]:
# Box plot of the individual house prices (before averaging per zipcode).
sns.boxplot(data=hp.loc[:, "price"])

### Observations:
- The price data seems to contain outliers, but they are kept because the data set is too small to afford removing them

In [15]:
from scipy.stats import zscore

# Standardize the per-zipcode mean prices and keep the result as a column.
mergedData['price_zscore'] = zscore(mergedData['price'])

# head() only shows the first five rows, which cannot confirm the absence of
# outliers. List every row whose |z-score| reaches 3 instead; an empty table
# means no price is flagged as an outlier by this rule.
mergedData[mergedData['price_zscore'].abs() >= 3]

### Observations:
- No z-scores are at or beyond ±3, so no prices are flagged as outliers by the three-standard-deviation rule

# Exploratory Data Analysis

In [16]:
# Pairwise scatter plots (histograms on the diagonal) for every numeric
# column of the merged table.
sns.pairplot(data=mergedData)

### Observations:
- The data is widely scattered but seems to lean towards an upward trend.

In [17]:
# Mean price plotted against the 'overall' column.
sns.lineplot(x="overall", y="price", data=mergedData)

### Observations:
- Data up and down but ends on an upward trend

In [18]:
# Mean price plotted against the 'violent' column.
sns.lineplot(x="violent", y="price", data=mergedData)

### Observations:
- The data dips slightly but is mostly on a positive trend

In [19]:
# Mean price plotted against the 'property' column.
sns.lineplot(x="property", y="price", data=mergedData)

### Observations:
- Data is up and down but ends on an upward trend especially towards the end

In [20]:
# Mean price plotted against the 'other' column.
sns.lineplot(x="other", y="price", data=mergedData)

### Observations:
- The data is more sporadic but ends on an upward trend

In [21]:
from scipy.stats import pearsonr 
from scipy.stats import spearmanr 

In [23]:
# Spearman rank correlation (statistic and p-value) between price and the
# 'overall' column.
spearmanr(a=mergedData["price"], b=mergedData["overall"])

In [None]:
# Pearson correlation matrix (the pandas default) for the analysis variables
# only. 'zipcode' is an identifier rather than a quantity, and 'price_zscore'
# is just a rescaling of 'price', so neither is included.
mergedData[['price', 'overall', 'violent', 'property', 'other']].corr()

In [None]:
# Spearman rank correlation (statistic and p-value) between price and the
# 'overall' column.
spearmanr(a=mergedData["price"], b=mergedData["overall"])

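- The p-value is well above 0.05, so we fail to reject the null hypothesis: there is no statistically significant correlation between price and overall crime, and any apparent relationship could be due to chance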

In [None]:
# Spearman rank correlation (statistic and p-value) between price and the
# 'violent' column.
spearmanr(a=mergedData["price"], b=mergedData["violent"])

- The p-value is below 0.05, so we reject the null hypothesis of no correlation: the relationship between price and violent crime is statistically significant

In [None]:
# Spearman rank correlation (statistic and p-value) between price and the
# 'property' column.
spearmanr(a=mergedData["price"], b=mergedData["property"])

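- The p-value is well above 0.05, so we fail to reject the null hypothesis: there is no statistically significant correlation between price and property crime, and any apparent relationship could be due to chance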

In [None]:
# Spearman rank correlation (statistic and p-value) between price and the
# 'other' column.
spearmanr(a=mergedData["price"], b=mergedData["other"])

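- The p-value is well above 0.05, so we fail to reject the null hypothesis: there is no statistically significant correlation between price and other crime, and any apparent relationship could be due to chance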

# Linear Regression

In [None]:
import statsmodels.api as sm

# Regression inputs: x is the predictor (the 'violent' column) and
# y is the response (the 'price' column).
x, y = mergedData['violent'], mergedData['price']

In [None]:
# Scatter of the response against the predictor. The trailing semicolon
# suppresses the text repr of the last call in the notebook output.
plt.scatter(x=x, y=y)
plt.xlabel(xlabel='Violent Crime')
plt.ylabel(ylabel='Price');

In [None]:
# Add an intercept column to the predictor, fit an ordinary least squares
# model of y on it, and display the fit summary.
x = sm.add_constant(x)
results = sm.OLS(endog=y, exog=x).fit()
results.summary()

Interpret the following from your model:

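- R-squared = 0.246: violent crime explains about 25 percent of the variance in price
- Coefficient of independent variable = 4.57e+04: the predicted price increases by about 45,700 for every one-unit increase in violent crime
- P-value of t-statistic = 0.00: this is lower than 0.05, so we reject the null hypothesis that the coefficient is zero
- P-value of F-statistic = 0.00: this is lower than 0.05, so the model as a whole is statistically significant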

In [None]:
# Inputs to score with the fitted model: the 'violent' values to predict for,
# with a column of ones inserted in front as the intercept term.
new_df = pd.DataFrame({'violent': [1, 3, 5, 7, 11, 15]})
new_df.insert(0, 'constant', 1)
new_df

In [None]:
# Predict prices for the model inputs only. A later cell appends a
# 'price_predictions' column to new_df; dropping it here (if present) keeps
# this cell runnable when it is executed again after that cell.
predictions = results.predict(new_df.drop(columns='price_predictions', errors='ignore'))
predictions

In [None]:
# Attach the predicted prices to the input table. Any existing
# 'price_predictions' column is excluded from the model input, so re-running
# this cell does not feed an extra column to the two-parameter model.
new_df['price_predictions'] = results.predict(
    new_df.drop(columns='price_predictions', errors='ignore')
)
new_df