In [20]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import sklearn
import warnings
import statsmodels.api as sm
import matplotlib.patches as mpatches
from matplotlib import pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)
%matplotlib inline
sns.set_style('white')

In [21]:
cancer = pd.read_csv("cancer.csv")

In [22]:
cancer.columns

Index(['id', 'Clump_Thickness', 'Size_Uniformity', 'Shape_Uniformity',
       'Marginal_Adhesion', 'Epithelial_Size', 'Bland_Chromatin',
       'Normal_Nucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [37]:
cancer['Class?'] = np.where(cancer['Class']>3, 1, 0)
cancer.head(10)

Unnamed: 0,id,Clump_Thickness,Size_Uniformity,Shape_Uniformity,Marginal_Adhesion,Epithelial_Size,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class,ClassCat,Class?
0,1000025,5,1,1,1,2,3,1,1,2,0,0
1,1002945,5,4,4,5,7,3,2,1,2,0,0
2,1015425,3,1,1,1,2,3,1,1,2,0,0
3,1016277,6,8,8,1,3,3,7,1,2,0,0
4,1017023,4,1,1,3,2,3,1,1,2,0,0
5,1017122,8,10,10,8,7,9,7,1,4,1,1
6,1018099,1,1,1,1,2,3,1,1,2,0,0
7,1018561,2,1,2,1,2,3,1,1,2,0,0
8,1033078,2,1,1,1,2,1,1,5,2,0,0
9,1033078,4,2,1,1,2,2,1,1,2,0,0


In [38]:
cancer.isnull().any()

id                   False
Clump_Thickness      False
Size_Uniformity      False
Shape_Uniformity     False
Marginal_Adhesion    False
Epithelial_Size      False
Bland_Chromatin      False
Normal_Nucleoli      False
Mitoses              False
Class                False
ClassCat             False
Class?               False
dtype: bool

In [39]:
stats.chisqprob = lambda chisq, cancer: stats.chi2.sf(chisq, cancer)

lr = LogisticRegression(C=1e9)
Y = cancer['Class?']
X = cancer[['Clump_Thickness', 'Size_Uniformity', 'Shape_Uniformity',
       'Marginal_Adhesion']]

X_statsmod = cancer[['Clump_Thickness', 'Size_Uniformity', 'Shape_Uniformity',
       'Marginal_Adhesion']]

X_statsmod['intercept'] = 1 

logit_model=sm.Logit(Y,X)
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.566013
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 Class?   No. Observations:                  699
Model:                          Logit   Df Residuals:                      695
Method:                           MLE   Df Model:                            3
Date:                Mon, 21 Jan 2019   Pseudo R-squ.:                  0.1213
Time:                        14:00:24   Log-Likelihood:                -395.64
converged:                       True   LL-Null:                       -450.26
                                        LLR p-value:                 1.598e-23
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Clump_Thickness      -0.3574      0.040     -8.831      0.000      -0.437      -0.278
Size_Uni

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
lreg = LogisticRegression()
lreg.fit(X_train, Y_train)

y_pred = lreg.predict(X_test)
print('The accuracy of logistic regression classifier for this data set {:.3f}.'.format(lreg.score(X_test, Y_test)))

The accuracy of logistic regression classifier for this data set 0.937.




In [41]:
KFold = model_selection.KFold(n_splits=5, random_state=5)
model = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(model, X_train, Y_train, cv=KFold, scoring=scoring)
print("The accuracy rate for the 10-fold cross is %.3f." % (results.mean()))

The accuracy rate for the 10-fold cross is 0.948.




Vanilla Logistic Regression

In [42]:
trainsize = int(cancer.shape[0]*0.75)  
cancer_test = cancer.iloc[trainsize:, :].copy() 
cancer_train = cancer.iloc[:trainsize, :].copy()

regr = linear_model.LinearRegression()
YTrain = cancer_train['Class?'].values.reshape(-1, 1)
XTrain = cancer_train[['Clump_Thickness','Size_Uniformity','Shape_Uniformity','Marginal_Adhesion']]
regr.fit(XTrain, YTrain)
print('\nThe R-squared score for the simple model is:')
print(regr.score(XTrain, YTrain))

origparams = np.append(regr.coef_, regr.intercept_) 


cancer_train['Clump_ThicknessShape_Uniformity'] = cancer_train['Clump_Thickness'] * cancer_train['Shape_Uniformity']
cancer_train['Clump_ThicknessSize_Uniformity'] = cancer_train['Clump_Thickness'] * cancer_train['Size_Uniformity']
cancer_train['Clump_ThicknessMarginal_Adhesion'] = cancer_train['Clump_Thickness'] * cancer_train['Marginal_Adhesion']
cancer_train['Clump_ThicknessRoot'] = cancer_train['Clump_Thickness'] ** 0.5
cancer_train['Clump_ThicknessSquared'] = cancer_train['Clump_Thickness'] ** 2
cancer_train['Clump_ThicknessCubed'] = cancer_train['Clump_Thickness'] ** 3
cancer_train['Size_UniformityShape_Uniformity'] = cancer_train['Size_Uniformity'] * cancer_train['Shape_Uniformity']
cancer_train['Size_UniformityMarginal_Adhesion'] = cancer_train['Size_Uniformity'] * cancer_train['Marginal_Adhesion']
cancer_train['Size_UniformityRoot'] = cancer_train['Size_Uniformity'] ** 0.5
cancer_train['Size_UniformitySquared'] = cancer_train['Size_Uniformity'] ** 2
cancer_train['Size_UniformityCubed'] = cancer_train['Size_Uniformity'] ** 3
cancer_train['Shape_UniformityMarginal_Adhesion'] = cancer_train['Shape_Uniformity'] * cancer_train['Marginal_Adhesion']
cancer_train['Shape_UniformityRoot'] = cancer_train['Shape_Uniformity'] ** 0.5
cancer_train['Shape_UniformitySquared'] = cancer_train['Shape_Uniformity'] ** 2
cancer_train['Shape_UniformityCubed'] = cancer_train['Shape_Uniformity'] ** 3
cancer_train['Marginal_AdhesionRoot'] = cancer_train['Marginal_Adhesion'] ** 0.5
cancer_train['Marginal_AdhesionSquared'] = cancer_train['Marginal_Adhesion'] ** 2
cancer_train['Marginal_AdhesionCubed'] = cancer_train['Marginal_Adhesion'] ** 3
                             
regrBig = linear_model.LinearRegression()
Xtrain2 = cancer_train[['Clump_Thickness','Size_Uniformity','Shape_Uniformity',
                     'Clump_ThicknessShape_Uniformity','Clump_ThicknessMarginal_Adhesion',
                     'Clump_ThicknessSize_Uniformity','Size_UniformityShape_Uniformity', 'Size_UniformityMarginal_Adhesion',
                     'Shape_UniformityRoot','Shape_UniformitySquared','Shape_UniformityCubed','Shape_UniformityMarginal_Adhesion',
                     'Size_UniformityRoot','Size_UniformitySquared','Size_UniformityCubed',
                     'Clump_ThicknessRoot','Clump_ThicknessSquared','Clump_ThicknessCubed',
                     'Marginal_AdhesionRoot', 'Marginal_AdhesionSquared', 'Marginal_AdhesionCubed'
                    ]]
regrBig.fit(Xtrain2, YTrain)
print('\nThe R-squared complex model is:')
print(regrBig.score(Xtrain2, YTrain))

newparams = np.append(
    regrBig.coef_[0,0:(len(origparams)-1)],
    regrBig.intercept_)

print('\nSmall vs. Large Model:')
compare = np.column_stack((origparams, newparams))
comparisons = np.array2string(
    compare,
    formatter={'float_kind':'{0:.4f}'.format})
print(comparisons)


The R-squared score for the simple model is:
0.7467761993645357

The R-squared complex model is:
0.8326194450335352

Small vs. Large Model:
[[0.0529 -1.6529]
 [0.0259 0.0860]
 [0.0497 2.6382]
 [0.0343 -0.0038]
 [-0.2103 0.7095]]


In [43]:
YTest = cancer_test['Class?'].values.reshape(-1, 1)
XTest = cancer_test[['Clump_Thickness','Size_Uniformity','Shape_Uniformity','Marginal_Adhesion']]
print('\nThe R-squared simple model is:')
print(regr.score(XTest, YTest))

cancer_test['Clump_ThicknessShape_Uniformity'] = cancer_test['Clump_Thickness'] * cancer_test['Shape_Uniformity']
cancer_test['Clump_ThicknessSize_Uniformity'] = cancer_test['Clump_Thickness'] * cancer_test['Size_Uniformity']
cancer_test['Clump_ThicknessMarginal_Adhesion'] = cancer_test['Clump_Thickness'] * cancer_test['Marginal_Adhesion']
cancer_test['Clump_ThicknessRoot'] = cancer_test['Clump_Thickness'] ** 0.5
cancer_test['Clump_ThicknessSquared'] = cancer_test['Clump_Thickness'] ** 2
cancer_test['Clump_ThicknessCubed'] = cancer_test['Clump_Thickness'] ** 3
cancer_test['Size_UniformityShape_Uniformity'] = cancer_test['Size_Uniformity'] * cancer_test['Shape_Uniformity']
cancer_test['Size_UniformityMarginal_Adhesion'] = cancer_test['Size_Uniformity'] * cancer_test['Marginal_Adhesion']
cancer_test['Size_UniformityRoot'] = cancer_test['Size_Uniformity'] ** 0.5
cancer_test['Size_UniformitySquared'] = cancer_test['Size_Uniformity'] ** 2
cancer_test['Size_UniformityCubed'] = cancer_test['Size_Uniformity'] ** 3
cancer_test['Shape_UniformityMarginal_Adhesion'] = cancer_test['Shape_Uniformity'] * cancer_test['Marginal_Adhesion']
cancer_test['Shape_UniformityRoot'] = cancer_test['Shape_Uniformity'] ** 0.5
cancer_test['Shape_UniformitySquared'] = cancer_test['Shape_Uniformity'] ** 2
cancer_test['Shape_UniformityCubed'] = cancer_test['Shape_Uniformity'] ** 3
cancer_test['Marginal_AdhesionRoot'] = cancer_test['Marginal_Adhesion'] ** 0.5
cancer_test['Marginal_AdhesionSquared'] = cancer_test['Marginal_Adhesion'] ** 2
cancer_test['Marginal_AdhesionCubed'] = cancer_test['Marginal_Adhesion'] ** 3
                             
Xtest2 = cancer_test[['Clump_Thickness','Size_Uniformity','Shape_Uniformity',
                     'Clump_ThicknessShape_Uniformity','Clump_ThicknessMarginal_Adhesion',
                     'Clump_ThicknessSize_Uniformity','Size_UniformityShape_Uniformity', 'Size_UniformityMarginal_Adhesion',
                     'Shape_UniformityRoot','Shape_UniformitySquared','Shape_UniformityCubed','Shape_UniformityMarginal_Adhesion',
                     'Size_UniformityRoot','Size_UniformitySquared','Size_UniformityCubed',
                     'Clump_ThicknessRoot','Clump_ThicknessSquared','Clump_ThicknessCubed',
                     'Marginal_AdhesionRoot', 'Marginal_AdhesionSquared', 'Marginal_AdhesionCubed'
                    ]]
print('\nThe R-squared complex model is:')
print(regrBig.score(Xtest2, YTest))



The R-squared simple model is:
0.8051017485158325

The R-squared complex model is:
0.8990214288410103


Ridge Logistic Regression

In [44]:
ridgeregr = linear_model.Ridge(alpha=10, fit_intercept=False)
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))
parameters = ridgeregr.coef_[0]
print(parameters)

ridgeregrBig = linear_model.Ridge(alpha=10, fit_intercept=False)
ridgeregrBig.fit(Xtrain2, Y_train)
print(ridgeregrBig.score(Xtrain2, Y_train))
print('\nSmall vs Large Model Estimates')
comparison = np.array2string(compare,
                               formatter = {'float_kind':'{0:.3f}'.format})
print(comparison)

0.7109144388537617
0.012240813472288292
0.02341640918492205

Small vs Large Model Estimates
[[0.053 -1.653]
 [0.026 0.086]
 [0.050 2.638]
 [0.034 -0.004]
 [-0.210 0.709]]


In [45]:
print(ridgeregr.score(XTest, YTest))
print(ridgeregrBig.score(Xtest2, YTest))

0.7540380640657215
-0.0913801035338424


Lasso Logistic Regression

In [46]:
lass = linear_model.Lasso(alpha=3)
lassfit = lass.fit(X_train, Y_train)
print('R² for the model with a small amount of features:')
print(lass.score(X_train, Y_train))
parameters = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with a small amount of features:')
print(parameters)

lassBig = linear_model.Lasso(alpha=3)
lassBig.fit(Xtrain2, Y_train)
print('\nR² for the model with a large amount of features:')
print(lassBig.score(Xtrain2, Y_train))
parameters = np.append(lassBig.coef_, lassBig.intercept_)
print('\nParameter estimates for the model with a large amount of features:')
print(parameters)

R² for the model with a small amount of features:
0.0

Parameter estimates for the model with a small amount of features:
[0.         0.         0.         0.         0.33969466]

R² for the model with a large amount of features:
0.0060633228643106616

Parameter estimates for the model with a large amount of features:
[-0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  8.58284804e-05  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -9.91221417e-05 -0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  3.41979869e-01]


In [47]:
print(lass.score(XTest, YTest))
print(lassBig.score(Xtest2, YTest))

-0.08835096914863572
-0.11854943117797512


Summary

Vanilla logistic Regression worked the best for this model because it generated the highest R-squared score. Based on the results, Ridge regression did not fit well with this data set. As for the lasso, the cancer model did not fit well due to low amount of numbers in the variables. Regression is useful when it comes to forecasting. However, outliers and regression do not blend well together.