In [56]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import utils
from tqdm import tqdm
from nltk import sent_tokenize
tqdm.pandas(desc="progress-bar")

In [2]:
# Load the blog/gender corpus; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/blog-gender-dataset_csv.csv')

In [3]:
# Discard the leftover index column written by a previous `to_csv`.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(
    data,
    random_state=24,
    test_size=0.25,
)

In [5]:
def _tag_blog(entry):
    """Turn one data row into a TaggedDocument.

    The blog text is split on single spaces to form the word list and the
    row's 'Gender' value becomes the document's only tag.
    """
    # `tags` has to be a list: with a bare string, `doc.tags[0]` yields only the
    # first character of the label and Doc2Vec iterates the string character by
    # character, registering each character as a separate tag.
    return TaggedDocument(str(entry['Blog']).split(" "), [entry['Gender']])


# Build one tagged document per row for each split.
documents_train = train_data.apply(_tag_blog, axis=1)
documents_test = test_data.apply(_tag_blog, axis=1)

In [13]:
# Train a Doc2Vec model on the tagged training documents only.
# dm=1 selects the distributed-memory variant and dm_concat=1 concatenates
# (rather than sums/averages) the context vectors, per gensim's Doc2Vec docs.
# NOTE(review): no seed is set and workers=4, so repeated runs are not expected
# to produce identical vectors.
model_dm = Doc2Vec(
    documents_train,
    vector_size=600,
    window=5,
    min_count=5,
    sample=0,
    negative=5,
    hs=1,
    dm=1,
    dm_concat=1,
    workers=4,
)

In [58]:
def vec_for_learning(model, tagged_docs, epochs=20):
    """Infer a feature vector for every tagged document.

    Parameters
    ----------
    model
        Object exposing ``infer_vector(words, epochs=...)`` (here a trained
        Doc2Vec model).
    tagged_docs
        Container whose ``.values`` attribute is a sequence of documents, each
        with ``.words`` and ``.tags`` (here a pandas Series of TaggedDocument).
    epochs : int, default 20
        Number of inference epochs forwarded to ``model.infer_vector``.

    Returns
    -------
    (targets, regressors)
        Two tuples of equal length: the first tag of each document and the
        vector inferred from its words, in input order. Both tuples are empty
        when there are no documents.
    """
    docs = tagged_docs.values
    # zip(*[]) produces nothing to unpack, so handle the empty case explicitly
    # instead of failing with "not enough values to unpack".
    if len(docs) == 0:
        return (), ()

    pairs = [
        (doc.tags[0], model.infer_vector(doc.words, epochs=epochs))
        for doc in tqdm(docs)
    ]
    # Transpose [(target, vector), ...] into (targets...), (vectors...).
    targets, regressors = zip(*pairs)
    return targets, regressors

In [15]:
# Infer vectors for the training split first, then the test split, keeping
# each split's labels paired with its own vectors.
(y_train, X_train), (y_test, X_test) = (
    vec_for_learning(model_dm, split_docs)
    for split_docs in (documents_train, documents_test)
)

100%|██████████| 2409/2409 [22:51<00:00,  2.59it/s]  
100%|██████████| 803/803 [07:32<00:00,  2.31it/s]


In [16]:
# Fit a logistic regression on the inferred training vectors and label the
# held-out vectors. C=1e5 is a very large inverse regularisation strength,
# i.e. almost no regularisation.
logreg = LogisticRegression(C=1e5, n_jobs=1)
y_pred = logreg.fit(X_train, y_train).predict(X_test)



In [17]:
from sklearn.metrics import accuracy_score, f1_score

# Report accuracy, then the support-weighted F1, of the logistic-regression
# predictions on the held-out split.
test_accuracy = accuracy_score(y_test, y_pred)
print('Testing accuracy %s' % test_accuracy)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print('Testing F1 score: {}'.format(test_f1))

Testing accuracy 0.6376089663760897
Testing F1 score: 0.6344668918616911


In [18]:
# Support-vector classifier on the same features; `fit` returns the estimator,
# so construction and training are chained. The cell displays test accuracy.
svc = SVC(gamma='auto', C=13500).fit(X_train, y_train)
svc.score(X_test, y_test)

0.6475716064757161

In [19]:
# Two-hidden-layer perceptron with identity activations, trained with early
# stopping. The cell displays test accuracy.
mlp = MLPClassifier(
    hidden_layer_sizes=(45, 45),
    activation='identity',
    early_stopping=True,
    max_iter=2500,
).fit(X_train, y_train)
mlp.score(X_test, y_test)

0.676214196762142

In [57]:
def _tag_blog_sentences(entry):
    """Turn one data row into a TaggedDocument whose tokens are whole sentences.

    The blog text is split with nltk's ``sent_tokenize`` and the row's
    'Gender' value becomes the document's only tag.
    """
    # `tags` has to be a list: with a bare string, `doc.tags[0]` yields only the
    # first character of the label instead of the label itself.
    return TaggedDocument(sent_tokenize(str(entry['Blog'])), [entry['Gender']])


# NOTE(review): each "word" here is an entire sentence. A Doc2Vec model whose
# vocabulary was built from space-split words will almost never contain such
# tokens, so infer_vector has essentially nothing to work with -- consistent
# with the chance-level scores reported further down. If sentence-aware
# word tokens were intended, tokenise each sentence into words -- TODO confirm.
documents_train_sent = train_data.apply(_tag_blog_sentences, axis=1)
documents_test_sent = test_data.apply(_tag_blog_sentences, axis=1)

In [60]:
# Retrain the Doc2Vec model with the same hyper-parameters as before.
# NOTE(review): this trains on `documents_train` (space-split words), not on
# `documents_train_sent`, and replaces the earlier `model_dm` with a fresh,
# unseeded fit -- confirm whether the sentence-level documents were intended.
model_dm = Doc2Vec(
    documents_train,
    vector_size=600,
    window=5,
    min_count=5,
    sample=0,
    negative=5,
    hs=1,
    dm=1,
    dm_concat=1,
    workers=4,
)

In [63]:
# Infer vectors for the sentence-tokenised documents: training split first,
# then the test split.
(y_train_sent, X_train_sent), (y_test_sent, X_test_sent) = (
    vec_for_learning(model_dm, split_docs)
    for split_docs in (documents_train_sent, documents_test_sent)
)







  0%|          | 0/2409 [00:00<?, ?it/s][A[A[A[A[A[A





 13%|█▎        | 308/2409 [00:00<00:00, 3079.99it/s][A[A[A[A[A[A





 31%|███▏      | 758/2409 [00:00<00:00, 3400.96it/s][A[A[A[A[A[A





 47%|████▋     | 1125/2409 [00:00<00:00, 3477.28it/s][A[A[A[A[A[A





 61%|██████▏   | 1480/2409 [00:00<00:00, 3497.21it/s][A[A[A[A[A[A





 78%|███████▊  | 1876/2409 [00:00<00:00, 3620.95it/s][A[A[A[A[A[A





 93%|█████████▎| 2234/2409 [00:00<00:00, 3608.03it/s][A[A[A[A[A[A





100%|██████████| 2409/2409 [00:00<00:00, 3648.78it/s][A[A[A[A[A[A





  0%|          | 0/803 [00:00<?, ?it/s][A[A[A[A[A[A





 42%|████▏     | 341/803 [00:00<00:00, 3406.67it/s][A[A[A[A[A[A





 83%|████████▎ | 665/803 [00:00<00:00, 3354.88it/s][A[A[A[A[A[A





100%|██████████| 803/803 [00:00<00:00, 3278.10it/s][A[A[A[A[A[A

In [65]:
# Same three classifiers as above, now on the sentence-level features.
logreg = LogisticRegression(n_jobs=1, C=1e5)
svc = SVC(C=13500, gamma='auto')
mlp = MLPClassifier(hidden_layer_sizes=(45, 45), max_iter=2500, early_stopping=True, activation='identity')

# Fit and score each model in turn. The labels come from the same inference
# pass as the features (y_*_sent), so features and targets are guaranteed to
# be aligned instead of relying on y_train / y_test from an earlier cell.
for clf in (logreg, svc, mlp):
    clf.fit(X_train_sent, y_train_sent)
    print(clf.score(X_test_sent, y_test_sent))



0.48941469489414696
0.5180572851805728
0.5130759651307597
