In [26]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import GenericUnivariateSelect, chi2
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
from keras import backend as K
from scipy.sparse import hstack

In [2]:
data = pd.read_csv("/home/binoy/OneDrive/UH/COSC 6342 - Machine Learning/Project/GenderClassification/data/blog-gender-dataset_csv.csv")

In [3]:
male_count = len(data[data.Gender == 'M'].index)
female_count = len(data[data.Gender == 'F'].index)
frac = female_count / male_count

data_new = data.copy()
data_new = data_new.drop(data_new[data_new.Gender == 'M'].sample(frac=1 - frac).index)
data_new['Blog'] = data_new['Blog'].values.astype(str)

In [4]:
data_train, data_holdout, gender_train, gender_holdout = train_test_split(data_new.Blog.astype(str), data_new.Gender.astype(str), test_size=0.1, shuffle=True, stratify=data_new.Gender)

In [5]:
data_train, data_val, gender_train, gender_val = train_test_split(data_train, gender_train, test_size=0.15, shuffle=True, stratify=gender_train)

In [6]:
cv_char = CountVectorizer(binary=True, ngram_range=(2, 7), analyzer="char", min_df=0.2)
data_train_char = cv_char.fit_transform(data_train)
data_val_char = cv_char.transform(data_val)
data_holdout_char = cv_char.transform(data_holdout)

In [7]:
cv_word = CountVectorizer(binary=True, ngram_range=(2, 7), min_df=0.2)
data_train_word = cv_word.fit_transform(data_train)
data_val_word = cv_word.transform(data_val)
data_holdout_word = cv_word.transform(data_holdout)

In [8]:
data_train = hstack((data_train_char, data_train_word), format='csr')
data_val = hstack((data_val_char, data_val_word), format='csr')
data_holdout = hstack((data_holdout_char, data_holdout_word), format='csr')

In [107]:
n_features = 50000
transformer = GenericUnivariateSelect(chi2, 'k_best', param=n_features)
data_train = transformer.fit_transform(data_train, gender_train)
data_val = transformer.transform(data_val)
data_holdout = transformer.transform(data_holdout)

ValueError: k should be >=0, <= n_features = 3632; got 50000. Use k='all' to return all features.

In [9]:
gender_train_coded = [1 if x == 'M' else 0 for x in gender_train]
gender_val_coded = [1 if x == 'M' else 0 for x in gender_val]
gender_holdout_coded = [1 if x == 'M' else 0 for x in gender_holdout]

In [21]:
def gen_model(n_layers=10, n_units=45, dropout=0.1, activation='relu'):
    K.get_session().graph.get_collection('variables')
    K.clear_session()
    model = Sequential([Dense(n_units, input_shape=(3701,)), Activation('relu')])
    model.add(BiD)
    while n_layers > 0:
        model.add(Dense(n_units))
        model.add(Dropout(dropout))
        model.add(Activation(activation))
        n_layers -= 1
    
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [22]:
def print_acc(model, threshold, p=False):
    correct = 0.
    total = 0.
    incorrect = 0.
    from math import exp
    for f, g in zip(data_val, gender_val_coded):
        total += 1
        prediction = model.predict(f)
        prediction = exp(prediction) / (1 + exp(prediction))
        prediction = 1 if prediction > threshold else 0
        if p:
            print(prediction)
        if prediction == g:
            correct += 1
        else: 
            incorrect += 1

    print(correct / total)
    print(incorrect / total)

In [27]:
model = gen_model(5, 5, 0.1)
model.fit(data_train, gender_train_coded, epochs=100, batch_size=32)
print_acc(model, 0.5)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
0.5096153846153846
0.49038461538461536


In [49]:
correct = 0.
total = 0.
incorrect = 0.
from math import exp
for f, g in zip(data_holdout, gender_holdout_coded):
    total += 1
    prediction = model.predict(f)
    prediction = exp(prediction) / (1 + exp(prediction))
    prediction = 1 if prediction > 0.5 else 0
    if prediction == g:
        correct += 1
    else: 
        incorrect += 1

print(correct / total)
print(incorrect / total)

0.49838187702265374
0.5016181229773463


In [197]:
model.save('../models/keras_model_char_ngrams_2_7.h5')

In [61]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [117]:
svc = SVC(C=100, gamma='auto')
mlp = MLPClassifier(hidden_layer_sizes=(65, 45), activation='identity', early_stopping=True, max_iter=2500, random_state=50)

In [118]:
svc.fit(data_train, gender_train)
svc.score(data_val, gender_val)

0.6274038461538461

In [119]:
mlp.fit(data_train, gender_train)
mlp.score(data_val, gender_val)

0.6418269230769231

In [120]:
mlp2 = MLPClassifier(hidden_layer_sizes=(55, 45), activation='identity', early_stopping=True, max_iter=2500, random_state=65)
mlp3 = MLPClassifier(hidden_layer_sizes=(45, 65), activation='identity', early_stopping=True, max_iter=2500, random_state=75)
mlp4 = MLPClassifier(hidden_layer_sizes=(25, 45), activation='identity', early_stopping=True, max_iter=2500, random_state=40)
mlp5 = MLPClassifier(hidden_layer_sizes=(45, 25), activation='identity', early_stopping=True, max_iter=2500, random_state=51)

In [121]:
mlp2.fit(data_train, gender_train)
mlp3.fit(data_train, gender_train)
mlp4.fit(data_train, gender_train)
mlp5.fit(data_train, gender_train)

MLPClassifier(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(45, 25), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=51, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [122]:
correct = 0.
total = 0.
for f, g in zip(data_holdout, gender_holdout):
    total += 1
    cm = 0
    cf = 0
    predictions = [mlp.predict(f), mlp2.predict(f), mlp3.predict(f), mlp4.predict(f), mlp5.predict(f)]
    for pred in predictions:
        if pred == 'M':
            cm += 1
        else:
            cf += 1
    
    if cm > cf:
        prediction = 'M'
    else:
        prediction = 'F'
        
    if prediction == g:
        correct += 1
        
print("Voting accuracy = ", correct/total)

Voting accuracy =  0.6440129449838188


In [99]:
from keras.models import load_model

In [100]:
msaved = load_model("/home/binoy/OneDrive/UH/COSC 6342 - Machine Learning/Project/GenderClassification/models/keras/keras_model_char_ngrams_2_7.h5")

In [123]:
correct = 0.
total = 0.
incorrect = 0.
from math import exp
for f, g in zip(data_holdout, gender_holdout_coded):
    total += 1
    prediction = msaved.predict(f)
    prediction = exp(prediction) / (1 + exp(prediction))
    prediction = 1 if prediction > 0.5 else 0
    if prediction == g:
        correct += 1
    else: 
        incorrect += 1

print(correct / total)
print(incorrect / total)

ValueError: Error when checking input: expected dense_76_input to have shape (50000,) but got array with shape (3632,)