In [9]:
import pandas as pd
import numpy as np
from pickle import dump, load
from joblib import dump as jdump
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import chi2, GenericUnivariateSelect
from sklearn.metrics import f1_score, precision_score, recall_score, auc
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack

In [2]:
features = pd.read_pickle('../data/feature_dumps/scaled_features.pkl')

In [3]:
with open('../data/feature_dumps/word_pos_binary_ngrams_50k_best', 'rb') as pkldump:
    feature1 = load(pkldump)

In [5]:
blogs = pd.read_csv('../data/blog-gender-dataset_csv.csv')

In [6]:
cv = CountVectorizer(analyzer='char', binary=True, ngram_range=(2, 7))
char_fit = cv.fit_transform(blogs.Blog.astype(str))

In [7]:
new_features = hstack((feature1, char_fit), format='csr')

In [8]:
print(features.shape)
print(feature1.shape)
print(new_features.shape)

(3212, 295)
(3212, 50000)
(3212, 3261983)


In [10]:
selector = GenericUnivariateSelect(chi2, 'k_best', param=50000)
selected_features = selector.fit_transform(new_features, blogs.Gender)

In [11]:
selected_features.shape

(3212, 50000)

In [12]:
mlp = MLPClassifier(hidden_layer_sizes=(75, 75), activation='identity', early_stopping=True, solver='adam', 
                    max_iter=2500, learning_rate='constant')

In [13]:
score = cross_val_score(mlp, selected_features, blogs.Gender, cv=10, n_jobs=1)
print("Score = ", score)
print("Average score = ", np.mean(score))

Score =  [0.92879257 0.90031153 0.86604361 0.87850467 0.90654206 0.86604361
 0.90342679 0.91588785 0.90342679 0.87850467]
Average score =  0.8947484158444491


In [14]:
btr, bte, gtr, gte = train_test_split(selected_features, blogs.Gender, test_size=0.1, stratify=blogs.Gender, shuffle=True)
btr2, bv, gtr2, gv = train_test_split(btr, gtr, test_size=0.18, stratify=gtr, shuffle=True)

In [15]:
mlp = MLPClassifier(hidden_layer_sizes=(75, 75), activation='identity', early_stopping=True, solver='adam', 
                    max_iter=2500, learning_rate='constant')
mlp.fit(btr2, gtr2)
print(mlp.score(bv, gv))
print(mlp.score(bte, gte))

0.8714011516314779
0.8757763975155279


In [16]:
score = cross_val_score(mlp, btr, gtr, cv=10, n_jobs=1)
print("Score = ", score)
print("Average score = ", np.mean(score))

Score =  [0.87586207 0.91034483 0.88275862 0.89273356 0.8615917  0.88581315
 0.91695502 0.87152778 0.90972222 0.93402778]
Average score =  0.8941336720624694
