In [27]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas as pd
import sklearn
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
import warnings
warnings.filterwarnings('ignore')

# Path to the cleaned CSV, relative to the notebook's working directory.
dataset_path = "../input/decisiondata3/cleanedData.csv"
# Read the whole file into the notebook-wide frame `df`.
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Replace pandas' mask with this
# https://stackoverflow.com/a/11872393/1558159
def mask(df, key, value):
    """Return the rows of ``df`` whose ``key`` column equals ``value``.

    Args:
        df: DataFrame to filter.
        key: Column label to compare.
        value: Value that the ``key`` column must equal for a row to be kept.

    Returns:
        The result of indexing ``df`` with the boolean comparison
        ``df[key] == value``.
    """
    matches = df[key] == value
    return df[matches]


# NOTE(review): this overwrites the built-in ``DataFrame.mask`` for every
# DataFrame in the kernel, so ``frame.mask(key, value)`` calls the filter
# above and the stock pandas ``mask`` is no longer reachable by that name.
pd.DataFrame.mask = mask

def encode_target(df, target_column, dest_col_name):
    """Add an integer-coded version of a column to a copy of ``df``.

    Each distinct value of ``target_column`` gets an integer code
    (0, 1, 2, ...) following the order of the array returned by
    ``Series.unique()``.

    Args:
        df: Source DataFrame; it is copied, not modified.
        target_column: Name of the column whose values are encoded.
        dest_col_name: Name of the column that receives the integer codes.

    Returns:
        Tuple ``(df_mod, targets)``. ``df_mod`` is the copy of ``df`` with
        ``dest_col_name`` added; ``targets`` is the array of distinct values,
        where ``targets[code]`` is the original value for ``code``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    # Use ``map`` (a plain per-value lookup) rather than ``replace``.
    # ``replace`` with a dict is a substitution API whose object-to-int
    # downcasting is deprecated as of pandas 2.2, which can leave the code
    # column as object dtype; ``map`` yields an integer column directly.
    df_mod[dest_col_name] = df_mod[target_column].map(map_to_int)

    return (df_mod, targets)

# Preview the first rows of the frame loaded above (cell output).
df.head()

In [28]:
# List every column name on a single line.
column_names = ', '.join(df.columns)
print(column_names)
# Histograms of the SCHOOL column, one panel per distinct ONTASK value.
df.hist(by='ONTASK', column='SCHOOL', bins=50)

In [29]:
# One figure per school: histogram of Activity, one panel per ONTASK value.
schools = df.groupby("SCHOOL")
for school, sdf in schools:
    sdf.hist(column='Activity', by='ONTASK', bins=50)
    # hist(by=...) opens a fresh figure, which becomes pyplot's current one.
    # Title it so the otherwise identical-looking per-school figures can be
    # told apart in the cell output.
    plt.suptitle("SCHOOL = {}".format(school))

In [30]:
# Encode each of the four columns into a sibling "int<name>" column and print
# its distinct values; the value at position i is the label for code i.
for col in ("ONTASK", "SCHOOL", "Class", "Activity"):
    df, targets = encode_target(df, col, "int" + col)
    print(col, targets)

In [31]:
# Print the column index (the int* columns added by encode_target should now
# be present), then preview the first rows as the cell output.
print(df.columns)
df.head()

In [32]:
# Model inputs are the three integer-coded columns; the label is intONTASK.
features = ["intSCHOOL", "intClass", "intActivity"]
X = df[features]
Y = df["intONTASK"]

In [33]:
# Tree fitted on the complete data set (no hold-out); this is the model that
# gets exported to Graphviz further down.
dt = DecisionTreeClassifier(
    max_leaf_nodes=30,
    min_samples_split=20,
    random_state=99,
)
dt.fit(X, Y)

In [34]:
import graphviz

# Write the fitted tree to disk as Graphviz DOT text, then read it back.
export_graphviz(dt, out_file="DecisionTree2.dot", feature_names=features)
with open("DecisionTree2.dot") as f:
    dot_graph = f.read()

# Last expression of the cell, so Jupyter renders the tree inline. Rendering
# needs the Graphviz `dot` executable to be available to the kernel.
graphviz.Source(dot_graph)

In [35]:
features = ["intSCHOOL", "intClass", "intActivity"]
# 70/30 shuffle split. random_state is fixed (same seed as the estimators in
# this notebook) so the split, and every score computed from it, is the same
# on each Restart & Run All.
features_train, features_test, ontask_train, ontask_test = train_test_split(
    df[features], df['intONTASK'],
    train_size=0.7, test_size=0.3, random_state=99)

# Sanity check: the two parts together account for every row.
print(len(features_train), len(features_test), len(features_train) + len(features_test))

In [36]:
# Rebind `dt` to a smaller tree (20 leaves at most) trained on the training
# split only.
dt = DecisionTreeClassifier(
    max_leaf_nodes=20,
    min_samples_split=20,
    random_state=99,
)
dt.fit(features_train, ontask_train)

In [37]:
# 10-fold cross-validation on the training split. The scorer returns the
# *negated* mean squared error, hence the abs() when reporting the mean.
scores = cross_val_score(dt, features_train, ontask_train, scoring='neg_mean_squared_error', cv=10)
# Field {2} is the standard deviation; the earlier template used {1} twice and
# therefore printed the mean in the "std" slot.
print("Folds: {0}, mean squared error: {1:.2} std: {2:.2}".format(
    len(scores), np.mean(np.abs(scores)), np.std(scores)))

In [38]:
# (1 + mean negated MSE, std of fold scores).
# NOTE(review): the first value equals mean accuracy only if intONTASK is a
# binary 0/1 label — confirm against the encode_target output.
(scores.mean() + 1, scores.std())

In [39]:
# Raw per-fold scores from cross_val_score (negated MSE, one entry per fold).
print(scores)

In [40]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation score against training-set size.

    Opens a new pyplot figure, runs ``learning_curve`` on ``estimator`` and
    draws, for both the training and the validation scores, the mean across
    folds as a line with a shaded band of one standard deviation either side.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Title of the figure.
        X: Feature data handed to ``learning_curve``.
        y: Target data handed to ``learning_curve``.
        ylim: Optional sequence unpacked into ``plt.ylim``; the axis limits
            are left automatic when ``None``.
        cv: Forwarded to ``learning_curve`` as ``cv``.
        n_jobs: Forwarded to ``learning_curve`` as ``n_jobs``.
        train_sizes: Forwarded to ``learning_curve`` as ``train_sizes``.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure as current.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    plt.grid()

    # (per-fold scores, colour, legend label) for each curve, training first.
    curves = (
        (fit_scores, "r", "Training score"),
        (val_scores, "g", "Cross-validation score"),
    )
    # Collapse the fold axis to a mean and a standard deviation per size.
    summaries = [
        (np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1), colour, label)
        for fold_scores, colour, label in curves
    ]

    # Shaded bands first, then the mean lines, so drawing order is unchanged.
    for avg, spread, colour, _ in summaries:
        plt.fill_between(sizes, avg - spread, avg + spread, alpha=0.1,
                         color=colour)
    for avg, _, colour, label in summaries:
        plt.plot(sizes, avg, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [41]:
%time plot_learning_curve(dt, "accuracy vs. training set size", features_train, ontask_train, cv=4)