In [23]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import sklearn
from sklearn import datasets, linear_model, metrics
from math import sqrt
from sklearn.model_selection import KFold, cross_val_predict, train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import classification_report
import scipy
import scipy.stats as stats

In [None]:
# Part 1.1
#Accuracy could be described as how close we are to the centre of a target, where as Precision would
#describe how well grouped our results are. So we could have low accuracy and high presicion, and vice versa,
#but still have an inaccurate model. We need high precision and accuracy, if possible.

In [None]:
# Part 1.2
# Precision and Recall are the measure of which instances are relevant. Precision describes the relevant instances
# from the whole set, where as Recall describes the relevant instances from only the relevant instances.
# F1 = 2 * (precision * recall)/(precision + recall) closer to 1 is best

In [2]:
# Part 1
dFrame = pd.read_csv('cancerData.csv', sep=',')

logModel = linear_model.LogisticRegression()
cols = ['radius', 'perimeter', 'concavity']
X = dFrame[cols].values.reshape(-1, len(cols))
Y = dFrame['diagnosis']
stats = ['mal', 'ben']
folds = KFold(n_splits=10)
for train, test in folds.split(X,Y):
    X_Train, X_Test = X[train], X[test]
    Y_Train, Y_Test = Y[train], Y[test]
    

    logModel.fit(X_Train, Y_Train)

    pred = cross_val_predict(logModel, X_Test, Y_Test, cv=10)
    print(classification_report(Y_Test, pred, target_names=stats))

    

             precision    recall  f1-score   support

        mal       1.00      0.09      0.17        11
        ben       0.82      1.00      0.90        46

avg / total       0.86      0.82      0.76        57

             precision    recall  f1-score   support

        mal       0.79      0.94      0.86        35
        ben       0.87      0.59      0.70        22

avg / total       0.82      0.81      0.80        57

             precision    recall  f1-score   support

        mal       0.85      0.97      0.91        36
        ben       0.94      0.71      0.81        21

avg / total       0.88      0.88      0.87        57

             precision    recall  f1-score   support

        mal       0.96      0.90      0.93        29
        ben       0.90      0.96      0.93        28

avg / total       0.93      0.93      0.93        57

             precision    recall  f1-score   support

        mal       0.93      0.93      0.93        29
        ben       0.93      0.93 

In [13]:
# Part 2

bFrame = pd.read_csv('brain_size.csv', sep=';')

heights = bFrame.dropna(subset=['Height'], how='all')
heights.head(5)



Unnamed: 0.1,Unnamed: 0,Gender,FSIQ,VIQ,PIQ,Weight,Height,MRI_Count
0,1,Female,133,132,124,118,64.5,816932
1,2,Male,140,150,124,.,72.5,1001121
2,3,Male,139,123,150,143,73.3,1038437
3,4,Male,133,129,128,172,68.8,965353
4,5,Female,137,132,134,147,65.0,951545


In [None]:
# Part 2.1
# The t-test is an inference test. The result tells us if we can infer anything from our results, by comparing two sets
# of means, and telling us the significance of the difference. P value is the probablity that our results happened by c
# chance, in percent. Low values are good, they show that the results weren't just achieved by chance, but are relevant.

In [24]:
# Part 2.2
heights = heights[heights['Height'] !='.']
h_data = heights['Height'].values.astype(float)

stats.ttest_1samp(h_data, 71)

Ttest_1sampResult(statistic=-3.8682665640568583, pvalue=0.00041658142370520256)

In [None]:
#The very low P-value tells us that our results weren't just arrived at by chance. 
# The t-test score of 3 shows that there is a large difference between the two groups, so
# we can say that the height of the people in the sample set is not similar to the population. 

In [25]:
# Part 2.3
stats.ttest_1samp(h_data, 68.4)

Ttest_1sampResult(statistic=0.1964197529935458, pvalue=0.84532834985133909)

In [None]:
# The opposite is true here. We have a high p-value, indicating a high percent of probability, as well as a low
# t-test score. So the sample does match the populations height very closely.

In [30]:
# Part 3
from sklearn.linear_model import Perceptron
logModel = Perceptron()
cols = ['radius', 'perimeter', 'concavity']
X = dFrame[cols].values.reshape(-1, len(cols))
Y = dFrame['diagnosis']
stats = ['mal', 'ben']
accuracies = []
folds = KFold(n_splits=10)
for train, test in folds.split(X,Y):
    X_Train, X_Test = X[train], X[test]
    Y_Train, Y_Test = Y[train], Y[test]
    

    logModel.fit(X_Train, Y_Train)
    accuracies.append(logModel.score(X_Train, Y_Train))
    print('Accuracy : ', logModel.score(X_Train, Y_Train))
    
print('Mean Accuracy : ', np.mean(accuracies))    

Accuracy :  0.890625
Accuracy :  0.90234375
Accuracy :  0.626953125
Accuracy :  0.86328125
Accuracy :  0.640625
Accuracy :  0.609375
Accuracy :  0.6171875
Accuracy :  0.611328125
Accuracy :  0.611328125
Accuracy :  0.824561403509
Mean Accuracy :  0.719760827851
