In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the CSV with explicitly supplied column names and discard the 'id'
# column in the same expression, leaving the remaining columns in file order.
dataset = pd.read_csv(
    'breastdata.csv',
    names=[
        'id', 'thickness', 'size_uniformity', 'shape_uniformity', 'adhesion',
        'cellsize', 'nuclei', 'chromatin', 'nucleoli', 'mitoses', 'type',
    ],
).drop('id', axis=1)

In [3]:
# Data cleaning: some entries of the 'nuclei' column hold the placeholder '?'.
# Overwrite those entries with NaN so they are treated as missing values.
is_placeholder = dataset['nuclei'] == '?'
dataset.loc[is_placeholder, 'nuclei'] = np.nan

In [4]:
# Drop every row that contains a missing value, then convert 'nuclei' to an
# integer column on the resulting frame.
dataset = dataset.dropna().assign(
    nuclei=lambda frame: frame['nuclei'].astype('int')
)

In [5]:
# Preview the first rows of the cleaned frame; left as a bare expression so
# the notebook renders it as a table.
dataset.head()

Unnamed: 0,thickness,size_uniformity,shape_uniformity,adhesion,cellsize,nuclei,chromatin,nucleoli,mitoses,type
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [6]:
# Print the row count, per-column non-null counts and dtypes to check the
# result of the cleaning cells above.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 10 columns):
thickness           683 non-null int64
size_uniformity     683 non-null int64
shape_uniformity    683 non-null int64
adhesion            683 non-null int64
cellsize            683 non-null int64
nuclei              683 non-null int32
chromatin           683 non-null int64
nucleoli            683 non-null int64
mitoses             683 non-null int64
type                683 non-null int64
dtypes: int32(1), int64(9)
memory usage: 56.0 KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
dataset.describe()

Unnamed: 0,thickness,size_uniformity,shape_uniformity,adhesion,cellsize,nuclei,chromatin,nucleoli,mitoses,type
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [None]:
#sns.pairplot(dataset)

In [8]:
def test_train_split(dataset, test_size = 0.25):
    train_size = 1-test_size
    #Separation of values for statified dataset
    truedf = dataset[dataset.iloc[:,-1] == 2]
    falsedf = dataset[dataset.iloc[:,-1] == 4]
    
    #contatinating 75% of true and flase data for train set and remaining for test set
    train_set = pd.concat([truedf[0:int(truedf.count()[0]*train_size)],falsedf[0:int(falsedf.count()[0]*train_size)]])
    test_set = pd.concat([truedf[int(truedf.count()[0]*train_size):],falsedf[int(falsedf.count()[0]*train_size):]])
    
    #X_train = train.drop(train.columns[-1], axis=1)
    #y_train = train.drop(train.columns[:len(df.columns)-1], axis=1)
    #X_test = test.drop(test.columns[-1], axis=1)
    #y_test = test.drop(test.columns[:len(df.columns)-1], axis=1)
    #return X_train,y_train,X_test,y_test
    
    return train_set,test_set

# Build the train/test frames, spelling out the hold-out fraction (same value
# as the function's default).
train, test = test_train_split(dataset, test_size=0.25)

In [10]:
# Per-class feature statistics from the training set. Each class is filtered
# once; the trailing entry of every statistics vector belongs to the label
# column itself and is sliced off.
train_labels = train.iloc[:, -1]
class2_rows = train[train_labels == 2]
class4_rows = train[train_labels == 4]

train_true_mean = class2_rows.mean().values[:-1]
train_true_var = class2_rows.var().values[:-1]
train_false_mean = class4_rows.mean().values[:-1]
train_false_var = class4_rows.var().values[:-1]

In [11]:
# Class priors estimated from the training set: p_true is the fraction of
# training rows whose label (last column) equals 2, p_false is the rest.
# The mean of the boolean mask gives that fraction directly and avoids the
# positional `[0]` lookup on a label-indexed Series, which newer pandas
# releases deprecate. On a frame without missing values (as produced by the
# dropna step) this equals the previous count-based ratio.
p_true = (train.iloc[:, -1] == 2).mean()
p_false = 1 - p_true

In [12]:
def probability(x,mean,var, min_var=1e-9):
    """Return the product of per-feature Gaussian densities of ``x``.

    Each feature ``i`` contributes the normal density with mean ``mean[i]``
    and variance ``var[i]`` evaluated at ``x[i]``; the densities are then
    multiplied together.

    Parameters
    ----------
    x, mean, var : array-like of matching shape (or scalars)
        Sample values, per-feature means and per-feature variances.
    min_var : float, default 1e-9
        Replacement used for variances that are exactly zero. Non-zero
        variances are used unchanged, so results for them are unaffected.

    Returns
    -------
    numpy.float64
        Product of the individual densities.
    """
    var = np.asarray(var, dtype=float)
    # A zero variance would make the formula evaluate to inf * nan = nan,
    # and any later comparison against nan is False. Substitute a tiny
    # positive variance for exact zeros only.
    var = np.where(var == 0, min_var, var)
    p = 1/(np.sqrt(2*np.pi*var)) * np.exp((-(x-mean)**2)/(2*var))
    return np.prod(p)

In [13]:
def argmax_probability(data):
    """Classify one sample as 2 or 4 by comparing prior-weighted likelihoods.

    ``data`` is the feature vector of a single sample. Its likelihood under
    each class is computed by ``probability`` from the module-level training
    statistics and scaled by the class prior. Label 2 is returned only when
    its score is strictly larger; otherwise 4 is returned.
    """
    score_true = probability(data, train_true_mean, train_true_var) * p_true
    score_false = probability(data, train_false_mean, train_false_var) * p_false
    return 2 if score_true > score_false else 4

In [14]:
# Classify every test row from its feature values, i.e. every entry of the
# row except the last one (the label).
predictions = [
    argmax_probability(sample.values[:-1])
    for _, sample in test.iterrows()
]

In [15]:
# Array forms of the predicted labels and of the labels stored in the test
# frame's 'type' column, ready for the metric functions below.
predicted = np.asarray(predictions)
actual = test.loc[:, 'type'].values

In [16]:
from sklearn.metrics import confusion_matrix,classification_report

In [17]:
# Confusion matrix of the hand-written classifier on the test set; the labels
# from the test frame are passed first, the predictions second.
print(confusion_matrix(actual,predicted))

[[108   3]
 [  1  59]]


In [18]:
# Per-class precision, recall, f1-score and support for the same predictions.
print(classification_report(actual,predicted))

              precision    recall  f1-score   support

           2       0.99      0.97      0.98       111
           4       0.95      0.98      0.97        60

   micro avg       0.98      0.98      0.98       171
   macro avg       0.97      0.98      0.97       171
weighted avg       0.98      0.98      0.98       171

