In [1]:
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn import neighbors
from sklearn.naive_bayes import BernoulliNB
from sklearn.cluster import KMeans
import sklearn.tree
import numpy as np

In [2]:
def _loadData():
    finalList = list()
    
    with open("PYPL.csv") as f:
        for idx,line in enumerate(f):
            if idx==0:
                continue
            
            else:
                tempList = list()
                vals = line.strip().split(",")
                for i in vals:
                    tempList.append(float(i))
                finalList.append(tempList)
                
    return finalList

In [3]:
data = np.array(_loadData())
data.shape

(42512, 442)

In [4]:
X_matrix = data[:, :-1] 
Y = data[:, -1]

In [5]:
### select all rows except last column
X_matrix = data[:, :-1] 
### select last column
Y = data[:, -1]

#Splitting training and testing data (features)
X_train, X_test, y_train, y_test = train_test_split(X_matrix, Y, test_size=0.2, random_state=33)

# Linear Regression

In [6]:
# Linear regression
# Create logistic regression object
regr = linear_model.LogisticRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

# The mean square error
print("Residual sum of squares: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))



Residual sum of squares: 0.01
Variance score: 0.99


# Decision Tree

In [7]:
### Initiate classifier
infoGain_clf = sklearn.tree.DecisionTreeClassifier(criterion='entropy')

In [8]:
kf = KFold(5, shuffle=True, random_state=33)

recallList = list()
precisionList = list()
f1List = list()
accuracyList = list()

for train_index, test_index in kf.split(X_matrix):
    infoGain_clf_tree = infoGain_clf.fit(X_matrix[train_index], Y[train_index])
    
    test = Y[test_index]
    pred = infoGain_clf.predict(X_matrix[test_index])
    
    #results = calcAccuracy(pred, test)
    
    recallList.append(recall_score(test, pred))
    precisionList.append(precision_score(test, pred))
    f1List.append(f1_score(test,pred))
    accuracyList.append(accuracy_score(test,pred))


print("Mean precision: " + str(np.mean(precisionList)))
print("StDev precision: " + str(np.std(precisionList)))
print("")
print("Mean recall: " + str(np.mean(recallList)))
print("StDev recall: " + str(np.std(recallList)))
print("")
print("Mean f1: " + str(np.mean(f1List)))
print("StDev f1: " + str(np.std(f1List)))
print("")
print("Mean accuracy: " + str(np.mean(accuracyList)))
print("StDev accuracy: " + str(np.std(accuracyList)))

Mean precision: 0.14396141106046242
StDev precision: 0.05064121075838462

Mean recall: 0.12390262935765248
StDev recall: 0.035715113134485166

Mean f1: 0.13288502172424024
StDev f1: 0.04235487226751117

Mean accuracy: 0.9798175041822453
StDev accuracy: 0.0009179653928655635


# Naive Bayes Classifier

In [9]:
### Initialize classifier
infoGain_clf = BernoulliNB()

In [10]:
kf = KFold(5, shuffle=True, random_state=33)

recallList = list()
precisionList = list()
f1List = list()
accuracyList = list()

for train_index, test_index in kf.split(X_matrix):
    infoGain_clf_tree = infoGain_clf.fit(X_matrix[train_index], Y[train_index])
    
    test = Y[test_index]
    pred = infoGain_clf.predict(X_matrix[test_index])
    
    #results = calcAccuracy(pred, test)
    
    recallList.append(recall_score(test, pred))
    precisionList.append(precision_score(test, pred))
    f1List.append(f1_score(test,pred))
    accuracyList.append(accuracy_score(test,pred))


print("Mean precision: " + str(np.mean(precisionList)))
print("StDev precision: " + str(np.std(precisionList)))
print("")
print("Mean recall: " + str(np.mean(recallList)))
print("StDev recall: " + str(np.std(recallList)))
print("")
print("Mean f1: " + str(np.mean(f1List)))
print("StDev f1: " + str(np.std(f1List)))
print("")
print("Mean accuracy: " + str(np.mean(accuracyList)))
print("StDev accuracy: " + str(np.std(accuracyList)))

Mean precision: 0.07095280728754423
StDev precision: 0.005646560472607891

Mean recall: 0.7618823726050982
StDev recall: 0.043849744833308564

Mean f1: 0.1297794849398862
StDev f1: 0.009854809030931135

Mean accuracy: 0.8741296559839826
StDev accuracy: 0.002572011638140312


# K-Means Clustering

In [11]:
kmeans_cluster_algo = KMeans(n_clusters=2)

In [12]:
kmeans_cluster_algo.fit(X_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [13]:
import collections
print "Clusters (result of k-means)"
print collections.Counter(kmeans_cluster_algo.labels_) #Print the frequency of elements in a numpy array
print "Ground truth"
print collections.Counter(Y) # ditto

Clusters (result of k-means)
Counter({0: 35496, 1: 7016})
Ground truth
Counter({0.0: 41988, 1.0: 524})
