In [68]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
# Load the blog/gender dataset; later cells read its `Blog` (text) and
# `Gender` (label) columns.
blogs = pd.read_csv('../data/blog-gender-dataset_csv.csv')

In [156]:
# Character-level n-gram vectoriser: every n-gram of length 2 to 7 becomes a
# column, and binary=True records presence/absence rather than counts.
cv = CountVectorizer(
    analyzer='char',
    ngram_range=(2, 7),
    binary=True,
)

In [157]:
# Learn the n-gram vocabulary from every blog post and build the sparse
# document-term matrix in one step.
# NOTE(review): astype(str) presumably turns missing posts into the literal
# text 'nan' — confirm that is intended. The vocabulary is also fitted on the
# full corpus, before any train/test split is made.
char_fit = cv.fit_transform(blogs.Blog.astype(str))

In [158]:
from pickle import load, dump

In [159]:
# Load the previously dumped feature matrix from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only
# load dumps that were produced locally by this project.
with open('../data/feature_dumps/both_binary_and_scaled.pkl', 'rb') as pkldump:
    features = load(pkldump)

In [160]:
from scipy.sparse import hstack

In [161]:
# Append the char n-gram columns to the right of the existing feature columns
# (row order is unchanged) and store the result in CSR format.
new_features = hstack((features, char_fit), format='csr')

In [162]:
# Dimensions of the loaded feature matrix, before the char n-grams are added.
features.shape

(3212, 52542)

In [163]:
# Dimensions after stacking: same number of rows, many more columns.
new_features.shape

(3212, 3264525)

In [164]:
# Persist the combined feature matrix to disk with pickle.
with open('../data/feature_dumps/both_binary_and_scaled_with_char_ngrams.pkl', 'wb') as pkldump:
    dump(new_features, pkldump)

In [144]:
# Reload the combined matrix from the same path the dump cell writes to.
# In a top-to-bottom run `new_features` is already in memory, so this only
# matters when resuming from the cached file.
# NOTE: pickle.load can execute arbitrary code — only load trusted dumps.
with open('../data/feature_dumps/both_binary_and_scaled_with_char_ngrams.pkl', 'rb') as pkldump:
    new_features = load(pkldump)

In [145]:
from sklearn.feature_selection import GenericUnivariateSelect, chi2

In [165]:
# Univariate feature selector: score each column with the chi-squared test and
# keep the 50,000 best-scoring ones.
# `mode` is keyword-only in current scikit-learn releases, so it is passed by
# name; the keyword form also works on older versions.
selector = GenericUnivariateSelect(score_func=chi2, mode='k_best', param=50000)

In [166]:
# Score every column against the gender label and keep only the selected ones.
# NOTE(review): the selector is fitted on ALL rows and labels here, while the
# train/test split and the cross-validation runs happen later on
# `selected_features`. Held-out rows therefore influence which features are
# kept, so the scores reported below are likely optimistic — consider doing
# the selection inside a Pipeline that is fitted per training fold.
selected_features = selector.fit_transform(new_features, blogs.Gender)

In [112]:
# Persist the reduced feature matrix to disk with pickle.
with open('../data/feature_dumps/both_binary_and_scaled_with_char_ngrams_50k_best.pkl', 'wb') as pkldump:
    dump(selected_features, pkldump)

In [148]:
# Dimensions after feature selection (column count should equal the selector's k).
selected_features.shape

(3212, 50000)

In [10]:
from sklearn.neural_network import MLPClassifier

In [11]:
from sklearn.model_selection import cross_val_score

In [21]:
# Two hidden layers of 95 units with identity activation, trained with Adam at
# a constant learning rate. Early stopping is enabled and max_iter caps
# training at 2500 iterations. No random_state is set, so results vary between runs.
mlp = MLPClassifier(
    hidden_layer_sizes=(95, 95),
    activation='identity',
    solver='adam',
    learning_rate='constant',
    max_iter=2500,
    early_stopping=True,
)

In [22]:
# 10-fold cross-validated score of the MLP on the selected features, run in a
# single process. Only clones are trained; `mlp` itself stays unfitted.
score = cross_val_score(
    estimator=mlp,
    X=selected_features,
    y=blogs.Gender,
    cv=10,
    n_jobs=1,
)

In [23]:
# Report the per-fold scores followed by their mean.
print("Score = ", score)
print("Average score = ", score.mean())

Score =  [0.86996904 0.92211838 0.84423676 0.81931464 0.894081   0.79127726
 0.82242991 0.87227414 0.86915888 0.82242991]
Average score =  0.8527289912521822


In [24]:
from joblib import dump as jdump

In [25]:
# cross_val_score only fits clones of `mlp`, so the estimator itself is still
# unfitted at this point. Train it on the full selected feature matrix before
# persisting, otherwise the saved model cannot make predictions when reloaded.
mlp.fit(selected_features, blogs.Gender)
jdump(mlp, '../models/mlp_v4_2')

['../models/mlp_v4_2']

In [15]:
from itertools import product
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for the MLP grid search.
# Despite their names, `n_layers` and `n_units` both hold candidate layer
# widths (75, 85, 95). Their cartesian product yields every two-hidden-layer
# architecture as a (first width, second width) tuple.
n_layers = [*range(75, 105, 10)]
n_units = [*range(75, 105, 10)]
depth = [*product(n_layers, n_units)]

activations = [
    'relu',
    'logistic',
    'tanh',
    'identity',
]
solvers = ['adam']
learning_rates = ['constant']
max_iters = [400, 1000, 1600, 2000]
early_stopping = [True]

# Keys are MLPClassifier constructor arguments; values are the candidates to try.
params_grid = dict(
    hidden_layer_sizes=depth,
    activation=activations,
    solver=solvers,
    learning_rate=learning_rates,
    max_iter=max_iters,
    early_stopping=early_stopping,
)

In [16]:
# Exhaustive search over `params_grid`, scored by accuracy with 10-fold
# cross-validation on all available cores (verbose=2 prints progress).
# This is expensive: the recorded run took roughly 11 hours on 4 workers.
grid_search = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=params_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_result = grid_search.fit(selected_features, blogs.Gender)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 60.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 159.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 296.8min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 467.6min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 677.1min finished


In [17]:
# Show the winning parameter set, the estimator built from it, and its mean
# cross-validated accuracy, one per line.
print(
    grid_result.best_params_,
    grid_result.best_estimator_,
    grid_result.best_score_,
    sep='\n',
)

{'activation': 'identity', 'early_stopping': True, 'hidden_layer_sizes': (95, 95), 'learning_rate': 'constant', 'max_iter': 2000, 'solver': 'adam'}
MLPClassifier(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(95, 95), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
0.8642590286425903


In [26]:
from sklearn.model_selection import train_test_split

In [167]:
# Hold out a stratified 10% test set (bte/gte); the rest is the training
# partition (btr/gtr). A fixed random_state makes the split — and every test
# metric computed from it below — reproducible across kernel restarts.
btr, bte, gtr, gte = train_test_split(
    selected_features,
    blogs.Gender,
    test_size=0.1,
    stratify=blogs.Gender,
    shuffle=True,
    random_state=42,
)

In [28]:
# Fresh, unfitted MLP: two hidden layers of 95 units, identity activation,
# Adam at a constant learning rate, early stopping, at most 2500 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(95, 95),
    activation='identity',
    solver='adam',
    learning_rate='constant',
    max_iter=2500,
    early_stopping=True,
)

In [29]:
# 10-fold cross-validation restricted to the training partition, so the
# held-out test rows take no part in these scores.
score = cross_val_score(estimator=mlp, X=btr, y=gtr, cv=10, n_jobs=1)
print("Score = ", score)
print("Average score = ", score.mean())

Score =  [0.9        0.88965517 0.85517241 0.85467128 0.86851211 0.82352941
 0.87889273 0.88888889 0.85416667 0.83333333]
Average score =  0.8646822011427966


In [31]:
# Train the estimator itself on the whole training partition
# (the cross-validation above only fitted clones of it).
mlp.fit(btr, gtr)

MLPClassifier(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(95, 95), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [32]:
# Score of the fitted model on the held-out test partition.
mlp.score(bte, gte)

0.8571428571428571

In [33]:
# Persist the model fitted on the training partition with joblib.
jdump(mlp, '../models/mlp_v4_3')

['../models/mlp_v4_3']

In [34]:
# Same test-set score as computed two cells above, kept in a variable.
# NOTE(review): `scores` is not read by any later cell visible here.
scores = mlp.score(bte, gte)

In [35]:
# Predicted gender labels for the held-out test rows, used by the metric cells below.
predictions = mlp.predict(bte)

In [38]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [39]:
# Confusion matrix on the test set: rows are true labels, columns predicted.
# The recall values printed below imply the label order is 'F' then 'M'.
confusion_matrix(gte, predictions)

array([[127,  27],
       [ 19, 149]])

In [41]:
# Per-class precision: each gender in turn is treated as the positive label.
print(
    "Male precision = ",
    precision_score(y_true=gte, y_pred=predictions, pos_label='M'),
)
print(
    "Female precision = ",
    precision_score(y_true=gte, y_pred=predictions, pos_label='F'),
)

Male precision =  0.8465909090909091
Female precision =  0.8698630136986302


In [42]:
# Per-class recall: each gender in turn is treated as the positive label.
print(
    "Male recall = ",
    recall_score(y_true=gte, y_pred=predictions, pos_label='M'),
)
print(
    "Female recall = ",
    recall_score(y_true=gte, y_pred=predictions, pos_label='F'),
)

Male recall =  0.8869047619047619
Female recall =  0.8246753246753247


In [43]:
# Per-class F1: each gender in turn is treated as the positive label.
print(
    "Male F1 = ",
    f1_score(y_true=gte, y_pred=predictions, pos_label='M'),
)
print(
    "Female F1 = ",
    f1_score(y_true=gte, y_pred=predictions, pos_label='F'),
)

Male F1 =  0.8662790697674417
Female F1 =  0.8466666666666668


In [48]:
# Micro-averaged precision, recall and F1 over both classes.
# In the recorded output all three equal the test accuracy from mlp.score,
# so they add no information beyond that number.
print("Global Precision", precision_score(gte, predictions, average='micro'))
print("Global Recall", recall_score(gte, predictions, average='micro'))
print("Global F1 = ", f1_score(gte, predictions, average='micro'))

Global Precision 0.8571428571428571
Global Recall 0.8571428571428571
Global F1 =  0.8571428571428571


In [168]:
# Carve a stratified validation set (bv/gv, 18% of the training partition)
# out of btr/gtr; btr2/gtr2 is what the next model is trained on.
# A fixed random_state keeps the split, and the validation score derived
# from it, reproducible across kernel restarts.
btr2, bv, gtr2, gv = train_test_split(
    btr,
    gtr,
    test_size=0.18,
    stratify=gtr,
    shuffle=True,
    random_state=42,
)

In [169]:
# Smaller variant: two hidden layers of 75 units (the earlier models used 95),
# otherwise the same settings — identity activation, Adam, constant learning
# rate, early stopping, at most 2500 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(75, 75),
    activation='identity',
    solver='adam',
    learning_rate='constant',
    max_iter=2500,
    early_stopping=True,
)

In [170]:
# Train on the reduced training partition; the validation rows (bv) are withheld.
mlp.fit(btr2, gtr2)

MLPClassifier(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(75, 75), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [171]:
# Score on the validation partition.
mlp.score(bv, gv)

0.8733205374280231

In [172]:
# Score on the held-out test partition.
mlp.score(bte, gte)

0.8726708074534162

In [111]:
# Persist the (75, 75) model fitted on btr2/gtr2 with joblib.
jdump(mlp, '../models/mlp_v6')

['../models/mlp_v6']

In [173]:
# 10-fold cross-validation of the (75, 75) configuration on the full training
# partition. Only clones are trained, so the fitted `mlp` is left untouched.
score = cross_val_score(estimator=mlp, X=btr, y=gtr, cv=10, n_jobs=1)
print("Score = ", score)
print("Average score = ", score.mean())

Score =  [0.89655172 0.86206897 0.87586207 0.89965398 0.8650519  0.85467128
 0.87543253 0.90972222 0.86805556 0.89583333]
Average score =  0.8802903558313115
