In [4]:
import pandas as pd
import sklearn

In [5]:
training_dataset = pd.read_csv("../features/data/train_dataset.csv")

In [6]:
training_dataset.shape

(78, 11)

In [4]:
training_dataset.head(5)

Unnamed: 0.1,Unnamed: 0,user,notifications_viewed_three_weeks_ago,notifications_viewed_four_weeks_ago,notifications_viewed_two_weeks_ago,notifications_viewed_one_week_ago,notifications_viewed_zero_weeks_ago,v1,simulado_delayed_amount,simulado_incompleted_amount,notes_delayed_amount
0,0,11,0,0,9,0,9,0.0,4.0,52.0,152.0
1,1,12,5,5,1,1,0,4.0,0.0,0.0,1.0
2,2,13,9,4,2,6,1,6.5,0.0,0.0,10.0
3,3,14,6,3,6,1,3,6.5,0.0,0.0,0.0
4,4,15,7,0,12,3,1,6.0,0.0,0.0,38.0


# Cleaning data for classification

In [7]:
training_dataset["passed"] = (training_dataset["v1"] >= 7.0) * 1

# Analysis by label

In [22]:
training_dataset.passed.value_counts()

0    49
1    29
Name: passed, dtype: int64

# First model - Logistic regression
It is the simplest of the models we might use and can provide great performance given the nature of numerical variables
available. 

In [51]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [9]:
feature_columns = ["simulado_delayed_amount", "simulado_incompleted_amount", "notes_delayed_amount",\
                   "notifications_viewed_one_week_ago", "notifications_viewed_zero_weeks_ago",\
                  "notifications_viewed_two_weeks_ago", "notifications_viewed_three_weeks_ago", "notifications_viewed_four_weeks_ago"]
label_column = ["passed"]


In [10]:
kf = KFold(n_splits=5)

In [19]:
lr = LogisticRegression(solver = 'lbfgs')
dt = DecisionTreeClassifier()

In [28]:
def apply_k_fold_and_return_classification_results(model, training_dataset, feature_columns, label_column):
    classification_results = []
    X = training_dataset[feature_columns].values
    Y = training_dataset[label_column].values
    for train_index, test_index in kf.split(training_dataset):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        clf = model.fit(X_train, y_train.ravel())
        classification_results.append(clf.score(X_test, y_test.ravel()))
    return classification_results

In [32]:
feature_columns = ["simulado_delayed_amount", "simulado_incompleted_amount", "notes_delayed_amount",]
print("logistic regression: ")
print(apply_k_fold_and_return_classification_results(lr, training_dataset, feature_columns, label_column))
print("decision tree:")
print(apply_k_fold_and_return_classification_results(dt, training_dataset, feature_columns, label_column))

logistic regression: 
[0.5, 0.5625, 0.5, 0.6, 0.4]
decision tree:
[0.5625, 0.75, 0.625, 0.4666666666666667, 0.2]


In [33]:
feature_columns = ["simulado_delayed_amount", "simulado_incompleted_amount", "notes_delayed_amount",\
                  "notifications_viewed_one_week_ago", "notifications_viewed_zero_weeks_ago",]
print("logistic regression: ")
print(apply_k_fold_and_return_classification_results(lr, training_dataset, feature_columns, label_column))
print("decision tree:")
print(apply_k_fold_and_return_classification_results(dt, training_dataset, feature_columns, label_column))

logistic regression: 
[0.5625, 0.6875, 0.6875, 0.8666666666666667, 0.3333333333333333]
decision tree:
[0.4375, 0.375, 0.6875, 0.4, 0.5333333333333333]


In [36]:
feature_columns = ["simulado_delayed_amount", "simulado_incompleted_amount", "notes_delayed_amount",\
                  "notifications_viewed_one_week_ago"]
print("logistic regression: ")
print(apply_k_fold_and_return_classification_results(lr, training_dataset, feature_columns, label_column))
print("decision tree:")
print(apply_k_fold_and_return_classification_results(dt, training_dataset, feature_columns, label_column))

logistic regression: 
[0.5625, 0.625, 0.6875, 0.8, 0.6]
decision tree:
[0.4375, 0.625, 0.625, 0.4, 0.6]


In [39]:
feature_columns = ["simulado_delayed_amount", "simulado_incompleted_amount", "notes_delayed_amount",\
                  "notifications_viewed_zero_weeks_ago"]
print("logistic regression: ")
print(apply_k_fold_and_return_classification_results(lr, training_dataset, feature_columns, label_column))
print("decision tree:")
print(apply_k_fold_and_return_classification_results(dt, training_dataset, feature_columns, label_column))

logistic regression: 
[0.5, 0.5625, 0.5, 0.5333333333333333, 0.26666666666666666]
decision tree:
[0.5, 0.5, 0.625, 0.26666666666666666, 0.4666666666666667]


In [40]:
feature_columns = ["simulado_delayed_amount", "simulado_incompleted_amount", "notes_delayed_amount",\
                  "notifications_viewed_one_week_ago"]
print("logistic regression: ")
print(apply_k_fold_and_return_classification_results(lr, training_dataset, feature_columns, label_column))
print("decision tree:")
print(apply_k_fold_and_return_classification_results(dt, training_dataset, feature_columns, label_column))

logistic regression: 
[0.5625, 0.625, 0.6875, 0.8, 0.6]
decision tree:
[0.4375, 0.625, 0.625, 0.4666666666666667, 0.6]


In [50]:
feature_columns = ["simulado_delayed_amount", "simulado_incompleted_amount", "notes_delayed_amount",\
                  "notifications_viewed_zero_weeks_ago", "notifications_viewed_one_week_ago"]
print("logistic regression: ")
print(apply_k_fold_and_return_classification_results(lr, training_dataset, feature_columns, label_column))
print("decision tree:")
print(apply_k_fold_and_return_classification_results(dt, training_dataset, feature_columns, label_column))

logistic regression: 
[0.5625, 0.6875, 0.6875, 0.8666666666666667, 0.3333333333333333]
decision tree:
[0.4375, 0.375, 0.6875, 0.4, 0.5333333333333333]
