# features:
- entity
- entity length
- fast number
- n-gram
- readability

In [21]:
import os
# Show the kernel's working directory: the relative "feature/..." and
# "model/..." paths used by later cells are resolved against it.
os.getcwd()

'/Users/loewi/Documents/GitHub/text-formality-classifier'

In [22]:
from joblib import load
from time import time
t0 = time()

print("features loading...")

# Load the five pre-computed feature matrices, in the same order as before:
# entity, n-gram, entity length, fast number, readability.
feature_ent, feature_ngram, feature_len, feature_fast, feature_read = (
    load(feature_file)
    for feature_file in (
        "feature/ent_feature.jbl",
        "feature/ngram_feature.jbl",
        "feature/entity_length_feature.jbl",
        "feature/fast_num_feature.jbl",
        "feature/readability_feature.jbl",
    )
)
# The timer stops here, so it covers only the five loads above.
duration = time() - t0

# (matrix, display name) pairs; the classifier cells iterate over this list.
features = list(zip(
    (feature_ent, feature_len, feature_fast, feature_ngram, feature_read),
    ("entity", "entity_length", "fast_number", "n_gram", "readability"),
))

# One summary line per feature set.
for feature, name in features:
    print("feature: {}, dimension: {}".format(name, feature.shape))

print("features prepared in %ds\n" % duration)

features loading...
feature: entity, dimension: (209124, 6)
feature: entity_length, dimension: (209124, 1)
feature: fast_number, dimension: (209124, 6)
feature: n_gram, dimension: (209124, 1359848)
feature: readability, dimension: (209124, 2)
features prepared in 0s



In [23]:
print("data loading...")
t0 = time()
# Path is relative to the repository root, like the "feature/..." paths used
# when loading the feature matrices, instead of one user's absolute home path.
# NOTE(review): joblib's load unpickles the file, which can execute arbitrary
# code -- only point this at a corpus file you produced yourself.
with open("preprocessing/corpus_dict.pkl", "rb") as corpus_file:
    # The context manager closes the handle; the original left it open.
    sent_dict = load(corpus_file)
# Target vector used by every classifier cell below.
label = sent_dict["label"]
duration = time() - t0
print("data prepared in %ds\n" % duration)

data loading...
data prepared in 0s



In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.utils import shuffle
from joblib import dump

In [25]:
def get_non_neg_feature(feature):
    """Return ``feature`` rescaled by a freshly fitted ``MinMaxScaler``.

    The scaler is created with its default settings, fitted on the matrix
    passed in, and discarded afterwards; only the transformed matrix is
    returned.
    """
    return MinMaxScaler().fit_transform(feature)

# Naïve Bayes

In [26]:
# Train, evaluate and save one Multinomial Naive Bayes model per feature set.
for feature, name in features:
    print("-"*30)
    print("feature: {}, classifier: Naíve Bayes".format(name))

    # Split BEFORE any preprocessing so the held-out rows cannot influence it.
    X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.2, random_state=0)

    if name == 'readability':
        # Readability is the only feature set rescaled here, presumably because
        # MultinomialNB needs non-negative inputs. The scaler is fitted on the
        # training rows only; fitting it on the full matrix (as before) leaks
        # the test rows' min/max into preprocessing.
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        # Test values below the training minimum would map to negatives, so
        # clip at 0 to keep the matrix non-negative.
        X_test = scaler.transform(X_test).clip(0.0)
        # NOTE(review): the fitted scaler is not saved with the model below;
        # anything that loads model/readability_nb.jbl must rescale its inputs
        # the same way.

    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # The timer covers the 5-fold cross-validation only, not the fit above.
    # NOTE(review): for readability the folds share one scaler fitted on all of
    # X_train; wrap scaler + classifier in a Pipeline if per-fold scaling matters.
    t0 = time()
    # using cross validation
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    t1 = time() - t0
    print("Training time: %0.3fs" % t1)

    # Hold-out accuracy of the fitted model vs. mean accuracy over the CV folds.
    score0 = clf.score(X_test, y_test)
    score = scores.mean()
    print("Accuracy_split: %0.3f" % score0)
    print("Accuracy_cross_validation: %0.3f (+/- %0.3f)" % (score, scores.std()*2))
    # Persist the model fitted on the training split.
    path = "model/" + name + '_nb.jbl'
    dump(clf, path)

------------------------------
feature: entity, classifier: Naíve Bayes
Training time: 1.078s
Accuracy_split: 0.537
Accuracy_cross_validation: 0.540 (+/- 0.004)
------------------------------
feature: entity_length, classifier: Naíve Bayes
Training time: 0.913s
Accuracy_split: 0.499
Accuracy_cross_validation: 0.500 (+/- 0.000)
------------------------------
feature: fast_number, classifier: Naíve Bayes
Training time: 0.915s
Accuracy_split: 0.795
Accuracy_cross_validation: 0.797 (+/- 0.006)
------------------------------
feature: n_gram, classifier: Naíve Bayes
Training time: 2.695s
Accuracy_split: 0.679
Accuracy_cross_validation: 0.687 (+/- 0.002)
------------------------------
feature: readability, classifier: Naíve Bayes
Training time: 0.910s
Accuracy_split: 0.535
Accuracy_cross_validation: 0.536 (+/- 0.004)


# Logistic Regression

In [27]:
# Train, evaluate and save one Logistic Regression model per feature set.
for feature, name in features:
    print("-"*30)
    print("feature: {}, classifier: Logistic Regression".format(name))

    # 80/20 hold-out split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        feature,
        label,
        test_size=0.2,
        random_state=0,
    )

    # This fitted model is the one scored on the hold-out rows and saved below.
    clf = LogisticRegression().fit(X_train, y_train)

    # 5-fold cross-validation on the training rows; only this step is timed.
    t0 = time()
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    t1 = time() - t0
    print("Training time: %0.3fs" % t1)

    # Hold-out accuracy next to the mean cross-validation accuracy.
    score0, score = clf.score(X_test, y_test), scores.mean()
    print("Accuracy_split: %0.3f" % score0)
    print("Accuracy_cross_validation: %0.3f (+/- %0.3f)" % (score, scores.std()*2))

    # Persist the model fitted on the training split.
    path = "model/" + name + '_lr.jbl'
    dump(clf, path)

------------------------------
feature: entity, classifier: Logistic Regression
Training time: 1.292s
Accuracy_split: 0.537
Accuracy_cross_validation: 0.541 (+/- 0.003)
------------------------------
feature: entity_length, classifier: Logistic Regression
Training time: 1.085s
Accuracy_split: 0.523
Accuracy_cross_validation: 0.526 (+/- 0.002)
------------------------------
feature: fast_number, classifier: Logistic Regression
Training time: 2.115s
Accuracy_split: 0.804
Accuracy_cross_validation: 0.808 (+/- 0.004)
------------------------------
feature: n_gram, classifier: Logistic Regression
Training time: 104.841s
Accuracy_split: 0.728
Accuracy_cross_validation: 0.729 (+/- 0.003)
------------------------------
feature: readability, classifier: Logistic Regression
Training time: 1.394s
Accuracy_split: 0.569
Accuracy_cross_validation: 0.570 (+/- 0.005)


# Decision Tree

In [28]:
# Train, evaluate and save one Decision Tree model per feature set (n_gram excluded).
for feature, name in features:
    print("-"*30)
    print("feature: {}, classifier: Decision Tree".format(name))

    if name == 'n_gram':
        # Say so explicitly instead of leaving a header with no results under it.
        # NOTE(review): the reason for excluding n_gram is not recorded here --
        # presumably training cost on such a wide matrix; confirm.
        print("Skipped: no decision tree is trained on the n_gram feature")
        continue

    X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.2, random_state=0)

    # Seed the tree so the reported scores and the dumped model are reproducible
    # across runs (same seed value as the split above).
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)

    # The timer covers the 5-fold cross-validation only, not the fit above.
    t0 = time()
    # using cross validation
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    t1 = time() - t0
    print("Training time: %0.3fs" % t1)

    # Hold-out accuracy of the fitted model vs. mean accuracy over the CV folds.
    score0 = clf.score(X_test, y_test)
    score = scores.mean()
    print("Accuracy_split: %0.3f" % score0)
    print("Accuracy_cross_validation: %0.3f (+/- %0.3f)" % (score, scores.std()*2))
    # Persist the model fitted on the training split.
    path = "model/" + name + '_dt.jbl'
    dump(clf, path)

------------------------------
feature: entity, classifier: Decision Tree
Training time: 0.615s
Accuracy_split: 0.539
Accuracy_cross_validation: 0.544 (+/- 0.003)
------------------------------
feature: entity_length, classifier: Decision Tree
Training time: 0.590s
Accuracy_split: 0.523
Accuracy_cross_validation: 0.526 (+/- 0.002)
------------------------------
feature: fast_number, classifier: Decision Tree
Training time: 1.115s
Accuracy_split: 0.801
Accuracy_cross_validation: 0.803 (+/- 0.004)
------------------------------
feature: n_gram, classifier: Decision Tree
------------------------------
feature: readability, classifier: Decision Tree
Training time: 0.826s
Accuracy_split: 0.600
Accuracy_cross_validation: 0.598 (+/- 0.004)
