In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import scipy.stats
# import math

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.linear_model import Lasso
# from sklearn.linear_model import Ridge
from sklearn.preprocessing import Imputer
from sklearn.svm import LinearSVR
# from sklearn.svm import SVR
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import r2_score

# %matplotlib inline

From the dataset documentation, columns are labeled like so:

1. Sample code number: id number 
2. Clump Thickness: 1 - 10 
3. Uniformity of Cell Size: 1 - 10 
4. Uniformity of Cell Shape: 1 - 10 
5. Marginal Adhesion: 1 - 10 
6. Single Epithelial Cell Size: 1 - 10 
7. Bare Nuclei: 1 - 10 
8. Bland Chromatin: 1 - 10 
9. Normal Nucleoli: 1 - 10 
10. Mitoses: 1 - 10 
11. Class: (2 for benign, 4 for malignant)

In [42]:
url_path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
column_names = ['id', 'clump_thickness', 'size_uniformity', 'shape_uniformity', 'marginal_adhesion',
                'epithel_size', 'bare_nuclei', 'bland_chroma', 'normal_nucl', 'mitoses', 'malignant']
df_raw = pd.read_csv(url_path, header=None, names=column_names)
df_raw.head()

Unnamed: 0,id,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithel_size,bare_nuclei,bland_chroma,normal_nucl,mitoses,malignant
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [43]:
df_raw.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,699.0,1071704.0,617095.729819,61634.0,870688.5,1171710.0,1238298.0,13454352.0
clump_thickness,699.0,4.41774,2.815741,1.0,2.0,4.0,6.0,10.0
size_uniformity,699.0,3.134478,3.051459,1.0,1.0,1.0,5.0,10.0
shape_uniformity,699.0,3.207439,2.971913,1.0,1.0,1.0,5.0,10.0
marginal_adhesion,699.0,2.806867,2.855379,1.0,1.0,1.0,4.0,10.0
epithel_size,699.0,3.216023,2.2143,1.0,2.0,2.0,4.0,10.0
bland_chroma,699.0,3.437768,2.438364,1.0,2.0,3.0,5.0,10.0
normal_nucl,699.0,2.866953,3.053634,1.0,1.0,1.0,4.0,10.0
mitoses,699.0,1.589413,1.715078,1.0,1.0,1.0,1.0,10.0
malignant,699.0,2.689557,0.951273,2.0,2.0,2.0,4.0,4.0


In [50]:
# Check for imbalanced classes. We are good here.
df_raw.malignant.value_counts()

2    458
4    241
Name: malignant, dtype: int64

In [35]:
# Check for missing data. We are good here, too.
df_raw.isnull().sum()

id                   0
clump_thickness      0
size_uniformity      0
shape_uniformity     0
marginal_adhesion    0
Epithel_size         0
bare_nuclei          0
bland_chroma         0
normal_nucl          0
mitoses              0
malignant            0
dtype: int64

In [55]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                   699 non-null int64
clump_thickness      699 non-null int64
size_uniformity      699 non-null int64
shape_uniformity     699 non-null int64
marginal_adhesion    699 non-null int64
epithel_size         699 non-null int64
bare_nuclei          699 non-null object
bland_chroma         699 non-null int64
normal_nucl          699 non-null int64
mitoses              699 non-null int64
malignant            699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [56]:
df_raw.bare_nuclei.value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: bare_nuclei, dtype: int64

The dataset documentation said there were some question marks in the data! Since we don't have a ton of data, let's impute them.

In [63]:
# Make new dataframe for feature creation and modification
df_features = df_raw.copy()

# Impute missing data using mean
imp = Imputer(missing_values='?', strategy='median', axis=1)
df_features.loc[:, 'bare_nuclei']

# Modify class index to 0,1 instead of 2,4
df_features.loc[:, 'malignant'] = ((df_raw['malignant'] / 2) -1).astype(int)
df_features.head()

Unnamed: 0,id,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithel_size,bare_nuclei,bland_chroma,normal_nucl,mitoses,malignant
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


In [64]:
df_features.bare_nuclei.value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: bare_nuclei, dtype: int64

It looks like this dataset is already very clean, but very small. We will have to be sure to avoid overfitting, so I think it would be best to pick our best algortithm using cross validation.

Let's explore some features, shall we?

In [54]:
# Make a copy of the data to add jitter to and plot.
df_jittered = df_features.loc[:, 'clump_thickness':'mitoses']
# Making the random noise.
jitter = pd.DataFrame(
    np.random.uniform(-.3, .3, size=(df_jittered.shape)),
    columns=df_jittered.columns
)
# Combine the data and the noise.
df_jittered = df_jittered.add(jitter)

# Declare that you want to make a scatterplot matrix.
g = sns.PairGrid(df_jittered.dropna(), diag_sharey=False)
# Scatterplot.
g.map_upper(plt.scatter, alpha=.5)
# Fit line summarizing the linear relationship of the two variables.
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))
# Give information about the univariate distributions of the variables.
g.map_diag(sns.kdeplot, lw=3)
plt.show()

TypeError: must be str, not float

FLow Chart: Linear SVC, then KNeighbors, then SVC and Ensemble methods

In [None]:
r2_scores = []
best_i = None

i_start, i_stop, i_step = .05, 2, .05
for i in np.arange(i_start, i_stop, i_step):
    lin_svr = LinearSVR(C=i)
    lin_svr.fit(X_train_new, y_train)
    y_pred = lin_svr.predict(X_test_new)
    r2_scores.append(r2_score(y_test, y_pred))

best_i = np.arange(i_start, i_stop, i_step)[np.argmax(r2_scores)]
best_r2 = max(r2_scores)

plt.plot(np.arange(i_start, i_stop, i_step), r2_scores)
plt.axvline(best_i, color='red')
plt.axhline(best_r2, color='red')
plt.title(r'$R^2$ vs. $\alpha$ Value')
plt.xlabel(r'$\alpha$ Value')
plt.ylabel('$R^2$')
plt.show()

print(r'Best value of R^2: {:.3f}'.format(best_r2))