In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2, GenericUnivariateSelect
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from pickle import dump, load
from scipy.sparse import hstack, vstack

In [8]:
with open('../data/feature_dumps/scaled_features.pkl', 'rb') as pkldump:
    features = load(pkldump)

In [9]:
features = features.drop(columns=['NN', 'NNPS', 'VBD', 'VBZ', 'MD',
       'EX', 'IN', 'VB', 'JJR', 'JJS', 'PRP', 'WDT', 'JJ', 'VBP', 'NNS',
       'VBN', 'DT', 'RB', 'WP', 'VBG', 'NNP', 'RBR', 'PRP$', 'JJ NN',
       'VBP VB', 'VBD PRP', 'IN IN', 'NNP VBZ', 'RB DT', 'NN VBG',
       'IN JJ', 'NN NN', 'RB VBZ', 'VBG DT', 'NN NNS', 'VBZ JJ', 'IN RB',
       'JJ JJ', 'NN VBZ', 'IN VBG', 'VBP DT', 'VB NN', 'NNS VBP',
       'DT NNP', 'PRP VBZ', 'PRP VBD', 'PRP VB', 'NN PRP', 'NN DT',
       'VBZ VB', 'PRP VBP', 'PRP$ JJ', 'VBD IN', 'VB JJ', 'NN JJ',
       'RB VB', 'JJ NNP', 'RB VBG', 'VBZ PRP', 'VBD DT', 'RB RB', 'JJ VB',
       'PRP RB', 'JJ IN', 'VBD VB', 'VB IN', 'VBP PRP', 'VBD RB',
       'VBG IN', 'PRP IN', 'VB PRP', 'NN RB', 'NNP NN', 'VB VBN',
       'NN NNP', 'IN NN', 'VBP VBN', 'NN WDT', 'RB IN', 'DT JJ', 'RB VBD',
       'VBZ IN', 'NN MD', 'VB DT', 'NNS IN', 'NNP RB', 'VB PRP$',
       'VBP IN', 'RB VBP', 'NNS RB', 'DT NN', 'VBZ VBN', 'MD RB',
       'NNP IN', 'NN VBD', 'JJ NNS', 'NN VB', 'IN PRP$', 'MD VB',
       'RB PRP', 'NNS VB', 'VBZ DT', 'VBG NN', 'VBN IN', 'PRP$ NNS',
       'VB VB', 'VBP RB', 'NNP NNP', 'NN IN', 'VB RB', 'VBG PRP',
       'PRP MD', 'IN DT', 'NNP VBD', 'IN NNS', 'IN NNP', 'RB JJ',
       'IN PRP', 'VBD JJ', 'RB VBN', 'DT NNS', 'VBD VBN', 'PRP$ NN',
       'VBZ RB', 'VBP JJ', 'DT NN VBZ', 'PRP VBD IN', 'JJ NNP NN',
       'DT NNP NNP', 'RB IN PRP', 'IN JJ NN', 'JJ NN VB', 'IN NN IN',
       'IN DT NNS', 'NNS IN PRP', 'NN IN NNS', 'VBN IN DT', 'PRP VBP JJ',
       'DT NNP NN', 'VBZ DT NN', 'PRP VBP RB', 'IN DT JJ', 'MD RB VB',
       'JJ NN IN', 'RB DT NN', 'VBG DT NN', 'NNP NN NN', 'DT NNS IN',
       'DT NN IN', 'JJ NN NN', 'VB DT NN', 'DT JJ NNS', 'NN IN PRP$',
       'IN JJ NNS', 'NNS IN NN', 'VBZ DT JJ', 'PRP VBP DT', 'NNP NNP NN',
       'PRP VBP PRP', 'VB DT JJ', 'VB PRP$ NN', 'DT JJ JJ', 'IN PRP VBP',
       'DT NN VB', 'IN NNP NNP', 'NNS IN DT', 'VB IN DT', 'IN DT NNP',
       'NN IN NNP', 'IN PRP$ NN', 'DT JJ NN', 'NN NN IN', 'IN NN NN',
       'VBP DT NN', 'PRP MD VB', 'PRP$ NN NN', 'RB JJ NN', 'DT NN NN',
       'IN PRP$ JJ', 'VBD DT JJ', 'PRP VBP VB', 'NN NN NN', 'VBD DT NN',
       'NNP NNP NNP', 'PRP VBD VB', 'JJ NNS IN', 'IN DT NN', 'NN IN NN',
       'PRP VBP IN', 'PRP VBD DT', 'IN PRP VBD', 'NN IN JJ', 'DT NN RB',
       'NN IN PRP', 'NN IN DT', 'RB IN DT', 'JJ JJ NN', 'PRP$ NN IN',
       'RB PRP VBP', 'PRP$ JJ NN', 'IN DT JJ NN', 'NN IN DT NN',
       'NN IN PRP$ NN', 'NN IN DT JJ', 'IN DT NN VB', 'DT JJ NN IN',
       'IN DT NN NN', 'DT JJ NN NN', 'VB DT JJ NN', 'IN DT NNP NN',
       'DT NN IN PRP', 'JJ NN IN DT', 'DT NN IN NN', 'NNS IN DT NN',
       'VB DT NN IN', 'NN IN JJ NN', 'DT NN IN DT', 'JJ NN IN NN',
       'IN DT NN IN', 'DT NN IN DT NN', 'NN IN DT JJ NN',
       'DT JJ NN IN NN', 'JJ NN IN DT NN', 'UpperCaseChars',
       'UpperCaseWords', 'TitleCaseWords', 'DT VBZ', 'DT RB', 'VBD NN',
       'NNP VB', 'IN NNP NN', 'JJ NNS VB', 'PRP$ NN VB', 'VBN IN NN',
       'NN NN VB', 'PRP RB VB', 'VBZ NN', 'DT IN', 'DT VB', 'RB NN',
       'WP VB', 'VBP NN', 'DT DT', 'VBN NN', 'IN VB', 'WDT VB',
       'IN PRP VB', 'NNP NNP VB', 'NN IN VB', 'RB PRP VB', 'NN PRP VB'])

In [10]:
gender = features.Gender.values
features = features.drop(columns=['Gender'])

In [11]:
btr, bte, gtr, gte = train_test_split(features, gender, test_size=0.25, shuffle=True, stratify=gender)

In [12]:
blogs = btr.Blog.values
pos = btr.POS.values

In [13]:
cvw = CountVectorizer(ngram_range=(2, 7), binary=True, max_features=None)
cvp = CountVectorizer(ngram_range=(2, 7), binary=True, max_features=None)

In [14]:
b = cvw.fit_transform(blogs.astype(str))
p = cvp.fit_transform(pos.astype(str))

In [15]:
blogs_test = bte.Blog.values
pos_test = bte.POS.values

In [16]:
bt = cvw.transform(blogs_test.astype(str))
pt = cvp.transform(pos_test.astype(str))

In [17]:
btr = btr.drop(columns=['Blog', 'POS'])
bte = bte.drop(columns=['Blog', 'POS'])

In [18]:
trainf = hstack((b,p,btr), format='csr')
testf = hstack((bt,pt,bte), format='csr')

In [27]:
transformer = GenericUnivariateSelect(chi2, 'k_best', param=50000)
trainfbest = transformer.fit_transform(trainf, gtr)
testfbest = transformer.transform(testf)

In [29]:
clf = MLPClassifier((75,75), solver='adam', activation='identity', max_iter=2500)
clf.fit(trainfbest, gtr)
clf.score(testfbest, gte)

0.6189290161892902

In [19]:
transformer = GenericUnivariateSelect(chi2, 'k_best', param=50000)
fbest = transformer.fit_transform(vstack((trainf, testf), format='csr'), vstack((gtr, gte), format='csr'))

ValueError: blocks[:,0] has incompatible row dimensions. Got blocks[1,0].shape[1] == 803, expected 2409.