In [2]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import statsmodels.api as sm
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

  from pandas.core import datetools


## Dataset: Diagnostic Wisconsin Breast Cancer Dataset

__Source:__ https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29

In [3]:
# Load, clean and prepare the Wisconsin breast cancer data.
# Names defined for later cells: df, df_temp, variables, df2, trainsize,
# df2_train/df2_test and the X_/Y_ train/test splits.
df = pd.read_csv('breast_cancer_data.csv')

# Remove the trailing 'Unnamed: 32' column (all NaNs per the original note).
# `axis` is passed by keyword: the positional form drop(label, 1) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop('Unnamed: 32', axis=1)

# Encode diagnosis as 1 for 'M' (malignant) and 0 for anything else (benign).
df['diagnosis'] = df['diagnosis'].map(lambda x: 1 if x == 'M' else 0)

# Select the columns whose correlation with diagnosis is at least .7, ordered
# from strongest to weakest. diagnosis correlates 1.0 with itself, so it is
# always part of the selection and therefore also present in df2.
df_temp = pd.DataFrame(df.corr()['diagnosis'])
df_temp = df_temp.loc[df_temp['diagnosis'] >= .7]
df_temp = df_temp.sort_values(['diagnosis'], ascending=False)
df_temp = df_temp.reset_index()
variables = df_temp['index'].tolist()

# Frame restricted to the selected columns (target included).
df2 = df[variables]

# Split in file order (no shuffling): first half trains, second half tests.
trainsize = int(df2.shape[0] / 2)
df2_train = df2.iloc[:trainsize, :].copy()
df2_test = df2.iloc[trainsize:, :].copy()

# Target is diagnosis; features are every other selected column.
Y_train = df2_train['diagnosis']
X_train = df2_train.drop('diagnosis', axis=1)
Y_test = df2_test['diagnosis']
X_test = df2_test.drop('diagnosis', axis=1)

In [4]:
# Ordinary least squares on the breast cancer features: fit on the training
# half, then print the model's score for the training half followed by the
# held-out half (same label for both, in that order).
regr = linear_model.LinearRegression()
regr.fit(X_train, Y_train)

for features, target in ((X_train, Y_train), (X_test, Y_test)):
    print('\nR-squared regr')
    print(regr.score(features, target))


R-squared regr
0.7118303532542507

R-squared regr
0.63665110789687


In [5]:
# Ridge regression (alpha=.01, no intercept term) on the breast cancer
# features: fit on the training half, then print the score for the training
# half followed by the held-out half.
# NOTE(review): the printed label says 'regr' although this is the ridge model.
ridgeregr = linear_model.Ridge(alpha=.01, fit_intercept=False)
ridgeregr.fit(X_train, Y_train)

for features, target in ((X_train, Y_train), (X_test, Y_test)):
    print('\nR-squared regr')
    print(ridgeregr.score(features, target))


R-squared regr
0.6867575100055208

R-squared regr
0.6454724979591079


In [6]:
# Lasso regression (alpha=.02) on the breast cancer features, fit on the
# training half and scored on that same half.
lass = linear_model.Lasso(alpha=.02)
lass.fit(X_train, Y_train)
print('\nR-squared train:')
print(lass.score(X_train, Y_train))

# Score the fitted model on the held-out half. The label says "test": it
# previously repeated "train", which mislabelled this number in the output.
print('\nR-squared test:')
print(lass.score(X_test, Y_test))


R-squared train:
0.6506812508302292

R-squared train:
0.515935196268377


### Results

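Ridge performs slightly better than plain linear regression on the test set and overfits less (smaller gap between train and test R-squared); lasso performs worse than both.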

## NY Crime Data

In [7]:
# Load the New York 2013 "Offenses Known to Law Enforcement" table. The first
# four lines of the file are skipped and the columns are renamed to short
# snake_case names.
raw_data = pd.read_csv('https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/New_York_offenses/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv',
                       skiprows=[0,1,2,3])
raw_data.columns = [
    'city',
    'population',
    'violent_crime',
    'murder',
    'rape_def_1',
    'rape_def_2',
    'robbery',
    'aggravated_assault',
    'property_crime',
    'burglary',
    'larceny_theft',
    'motor_vehicle_theft',
    'arson',
]

# Working frame: the city name plus the columns used for modelling, restricted
# to rows that have a population value. The trailing .copy() makes df
# independent of raw_data so the column assignments below are safe.
df = (
    raw_data[['city', 'population', 'murder', 'robbery', 'property_crime']]
    .dropna(axis=0, subset=['population'])
    .copy()
)

# These columns are read as strings with thousands separators ("97,956");
# strip the commas and cast to int.
for col in ('population', 'robbery', 'property_crime'):
    df[col] = df[col].str.replace(',', '').astype(int)

# murder is cast directly, as in the original cell, which never stripped commas
# from it (presumably it contains no thousands separators -- TODO confirm).
df['murder'] = df['murder'].astype(int)

# NOTE: earlier commented-out experiments (population squared, 0/1 indicator
# versions of murder/robbery, and burglary / motor_vehicle_theft / rape_def_2
# features) were removed from this cell as dead code.

df.head()

Unnamed: 0,city,population,murder,robbery,property_crime
0,Adams Village,1861,0,0,12
1,Addison Town and Village,2577,0,0,24
2,Akron Village,2846,0,0,16
3,Albany,97956,8,227,4090
4,Albion Village,6388,0,4,223


In [8]:
# Split the crime data in row order (no shuffling): first half trains, second
# half tests.
# NOTE(review): the displayed rows are ordered alphabetically by city, so the
# two halves may differ systematically; the negative test R-squared recorded
# for the models below is consistent with that -- consider a shuffled split.
trainsize = int(df.shape[0] / 2)
df_train_crime = df.iloc[:trainsize, :].copy()
df_test_crime = df.iloc[trainsize:, :].copy()

# Target is property_crime. Features are the remaining columns except the
# city name. `axis` is passed by keyword: the positional form drop(label, 1)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
Y_train_crime = df_train_crime['property_crime']
X_train_crime = df_train_crime.drop(['property_crime', 'city'], axis=1)
Y_test_crime = df_test_crime['property_crime']
X_test_crime = df_test_crime.drop(['property_crime', 'city'], axis=1)

In [9]:
# Refit the existing LinearRegression estimator (regr) on the crime data, then
# print its score for the training half followed by the held-out half.
regr.fit(X_train_crime, Y_train_crime)

for features, target in (
    (X_train_crime, Y_train_crime),
    (X_test_crime, Y_test_crime),
):
    print('\nR-squared regr')
    print(regr.score(features, target))


R-squared regr
0.9684363395770053

R-squared regr
-0.8761937791325166


In [10]:
# Refit the existing Ridge estimator (ridgeregr) on the crime data, then print
# its score for the training half followed by the held-out half.
ridgeregr.fit(X_train_crime, Y_train_crime)

for label, features, target in (
    ('\nR-squared ridge train', X_train_crime, Y_train_crime),
    ('\nR-squared ridge test', X_test_crime, Y_test_crime),
):
    print(label)
    print(ridgeregr.score(features, target))


R-squared ridge train
0.9683497990655007

R-squared ridge test
-0.8525795342954146


In [11]:
# Refit the existing Lasso estimator (lass) on the crime data, then print its
# score for the training half followed by the held-out half.
lass.fit(X_train_crime, Y_train_crime)

for label, features, target in (
    ('\nR-squared lasso train:', X_train_crime, Y_train_crime),
    ('\nR-squared lasso test:', X_test_crime, Y_test_crime),
):
    print(label)
    print(lass.score(features, target))


R-squared lasso train:
0.9684363374533491

R-squared lasso test:
-0.8755187247566971




In [19]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

# Random forest classifier evaluated with 3-fold cross-validation on X_train /
# Y_train (in the cells visible here these are the breast cancer training
# features and diagnosis labels).
# random_state is pinned because the forest is built with randomness; without
# a seed the printed scores change on every run and cannot be reproduced.
rfc = ensemble.RandomForestClassifier(random_state=42)

print(cross_val_score(rfc, X_train, Y_train, cv=3))

[0.9375     0.89361702 0.94680851]
