In [8]:
import pandas as pd

# Load the e-commerce consumer dataset from the CSV file next to the notebook
# and preview its first rows.
ecomm = pd.read_csv("ecommerce_consumers.csv")
ecomm.head()

Unnamed: 0,ratio,time,label
0,0.54,17.2,female
1,0.93,18.2,male
2,0.84,13.6,female
3,0.19,6.0,male
4,0.89,13.2,female


In [9]:
# Encode the target as integers (female -> 0, male -> 1).
# The guard keeps this cell idempotent: mapping an already-encoded column
# through the string dictionary would turn every label into NaN on a re-run.
if not pd.api.types.is_numeric_dtype(ecomm['label']):
    ecomm['label'] = ecomm['label'].map({ 'female': 0, 'male': 1})
ecomm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
ratio    200 non-null float64
time     200 non-null float64
label    200 non-null int64
dtypes: float64(2), int64(1)
memory usage: 4.8 KB


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [11]:
# Every column except the target is used as an independent variable.
ind_headers = ecomm.columns.tolist()
ind_headers.remove('label')

X = ecomm[ind_headers]
y = ecomm['label']

# 70 % of the rows go to training; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=100,
)



In [12]:
def train_model(clf, train_x,train_y,test_x,test_y):
    """
    Train the given classifier and report its accuracy on both data splits.

    The classifier is fitted on the training data, then used to predict the
    training and the test features. The fitted estimator, a short preview of
    actual vs. predicted test outcomes, the train/test accuracy and the
    train/test confusion matrices are printed.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` (returning the fitted estimator)
        and ``predict(X)``.
    train_x, train_y : features and labels used for fitting.
    test_x, test_y : features and labels used for evaluation.

    Returns
    -------
    The predictions produced for ``test_x``.

    @Author Aryan Singh
    """
    trained_model = clf.fit(train_x, train_y)
    print(trained_model)
    predictions_train = trained_model.predict(train_x)
    predictions = trained_model.predict(test_x)
    # Preview at most the first five rows. zip() stops at the shorter input,
    # so a test set with fewer than five samples no longer raises IndexError,
    # and the labels are materialised once instead of on every iteration.
    for actual, predicted in zip(list(test_y)[:5], predictions[:5]):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(actual, predicted))
    print("Train Accuracy :: ", accuracy_score(train_y, predictions_train))
    print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
    print(" Confusion matrix for train dataset", confusion_matrix(train_y, predictions_train))
    print(" Confusion matrix for test dataset", confusion_matrix(test_y, predictions))
    return predictions

In [13]:

def measure_accuracy(test_y, predictions):
    """
    Print sensitivity, specificity, precision, recall and F1 score for a
    set of binary predictions.

    Sensitivity, specificity and precision are derived by hand from the
    confusion matrix; precision, recall and F1 are additionally reported via
    the scikit-learn scorers.

    Parameters
    ----------
    test_y : true labels, expected to be encoded as 0 (negative) and
        1 (positive).
    predictions : predicted labels, using the same encoding.

    Returns
    -------
    None. All metrics are printed.

    @Author Aryan Singh
    """
    def _ratio(numerator, denominator):
        # A metric with an empty denominator is undefined: report NaN
        # explicitly instead of triggering a NumPy divide-by-zero warning.
        return numerator / denominator if denominator else float("nan")

    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # test set in which only one class occurs (in both the truth and the
    # predictions) yields a 1x1 matrix and the indexing below would fail.
    confusion = confusion_matrix(test_y, predictions, labels=[0, 1])

    # Rows are true classes, columns are predicted classes.
    TP = confusion[1,1]
    TN = confusion[0,0]
    FP = confusion[0,1]
    FN = confusion[1,0]

    sensitivity = _ratio(TP, TP + FN)
    print("Sensitivity ", sensitivity)

    specificity = _ratio(TN, TN + FP)
    print("specificity", specificity)

    precision = _ratio(TP, TP + FP)
    print("pricision ", precision)
    print("Precision Score:",precision_score(test_y, predictions))

    print("Recall Score:", recall_score(test_y, predictions))

    print("F1 Score:", f1_score(test_y, predictions))

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Estimators under comparison; kept as module-level names so they stay
# available for inspection after the cell has run.
lr = LogisticRegression()
dt = DecisionTreeClassifier(min_samples_leaf=10, max_depth=6)
svc = SVC(kernel = 'rbf')

# Each entry pairs the banner printed before a run with the estimator to fit.
experiments = [
    ('=============================Logistic Regression Model================================', lr),
    ('=============================Decision Tree Classifier================================', dt),
    ('=============================SVC Model================================', svc),
]

# Train and evaluate every estimator on the same train/test split.
for banner, estimator in experiments:
    print(banner)
    predictions = train_model(estimator, X_train, y_train, X_test, y_test)
    measure_accuracy(y_test, predictions)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Actual outcome :: 1 and Predicted outcome :: 1
Actual outcome :: 1 and Predicted outcome :: 1
Actual outcome :: 1 and Predicted outcome :: 1
Actual outcome :: 0 and Predicted outcome :: 1
Actual outcome :: 1 and Predicted outcome :: 1
Train Accuracy ::  0.6642857142857143
Test Accuracy  ::  0.7
 Confusion matrix for train dataset [[ 0 47]
 [ 0 93]]
 Confusion matrix for test dataset [[ 0 18]
 [ 0 42]]
Sensitivity  1.0
specificity 0.0
pricision  0.7
Precision Score: 0.7
Recall Score: 1.0
F1 Score: 0.8235294117647058
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_l