In [1]:
%config IPCompleter.greedy=True

In [2]:
import pandas as pd

In [3]:
blog_data_df = pd.read_excel('../data/blog-gender-dataset.xlsx')

In [4]:
print(blog_data_df.columns)

Index(['Blog', 'Gender'], dtype='object')


In [5]:
print(blog_data_df['Gender'].value_counts())

M    1679
F    1548
Name: Gender, dtype: int64


In [6]:
def modify_gender(gender):
    if gender == 'M':
        return 1
    else:
        return 0

In [7]:
blog_data_df['Gender'] = blog_data_df['Gender'].map(modify_gender)

In [8]:
from nltk.corpus import stopwords

In [9]:
T^T

In [10]:
def remove_stop(blog):
    tokens = str(blog).split(" ")
    final_tokens = []
    for token in tokens:
        if (token != " " or len(token) != 0) and token not in stop:
            final_tokens.append(token)
    return " ".join(final_tokens)

In [11]:
blog_data_df['Blog'] = blog_data_df['Blog'].map(remove_stop)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
blogs = blog_data_df['Blog'].values
genders = blog_data_df['Gender'].values

blog_train, blog_test, gender_train, gender_test = train_test_split(blogs, genders, test_size=0.25, random_state=100)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(blog_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [15]:
blog_train_transformed = vectorizer.transform(blog_train)
blog_test_transformed = vectorizer.transform(blog_test)

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
classifier = LogisticRegression()
classifier.fit(blog_train_transformed, gender_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
accuracy = classifier.score(blog_test_transformed, gender_test)
print('Accuracy = {:.4f}'.format(accuracy))

Accuracy = 0.6989


In [19]:
from sklearn.svm import SVC

In [20]:
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(blog_train_transformed, gender_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [21]:
accuracy = svm_classifier.score(blog_test_transformed, gender_test)
print('Accuracy = {:.4f}'.format(accuracy))

Accuracy = 0.6828


In [22]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [23]:
names = ["Logistic Regression", "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree", 
         "Random Forest", "Neural Net", "AdaBoost"]

In [24]:
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier()]

In [25]:
for name, classifier in zip(names, classifiers):
    classifier.fit(blog_train_transformed, gender_train)
    accuracy = classifier.score(blog_test_transformed, gender_test)
    print('Accuracy for {} classifier = {:.4f}'.format(name, accuracy))

Accuracy for Logistic Regression classifier = 0.6989
Accuracy for Nearest Neighbors classifier = 0.5366
Accuracy for Linear SVM classifier = 0.6976
Accuracy for RBF SVM classifier = 0.5006
Accuracy for Decision Tree classifier = 0.6208
Accuracy for Random Forest classifier = 0.4994
Accuracy for Neural Net classifier = 0.7100
Accuracy for AdaBoost classifier = 0.6592


In [32]:
neural_classifier = MLPClassifier((100,5), alpha=1e-4)
neural_classifier.fit(blog_train_transformed, gender_train)
accuracy = neural_classifier.score(blog_test_transformed, gender_test)
print('Accuracy for MLPClassifier classifier = {:.4f}'.format(accuracy))

Accuracy for MLPClassifier classifier = 0.7063


In [34]:
from itertools import product

In [66]:
n_layers = list(range(5, 50, 10))
n_units = list(range(5, 50, 10))
depth = list(product(n_layers, n_units))
activations = ['relu']
solvers = ['sgd', 'adam']
learning_rates = ['constant', 'adaptive']
max_iters = [200, 400]
random_state = [42]
early_stopping = [True]

In [67]:
params_grid = {
    'hidden_layer_sizes': depth,
    'activation': activations,
    'solver': solvers,
    'learning_rate': learning_rates,
    'max_iter': max_iters,
    'random_state': random_state,
    'early_stopping': early_stopping
}

In [68]:
from sklearn.model_selection import GridSearchCV

In [69]:
grid = GridSearchCV(MLPClassifier(), params_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

In [70]:
grid_result = grid.fit(blog_train_transformed, gender_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 31.3min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 82.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 178.1min finished


In [71]:
print(grid_result.best_score_)
print(grid_result.best_params_)

0.7016528925619835
{'activation': 'relu', 'early_stopping': True, 'hidden_layer_sizes': (25, 35), 'learning_rate': 'constant', 'max_iter': 200, 'random_state': 42, 'solver': 'adam'}


In [72]:
test_accuracy = grid.score(blog_test_transformed, gender_test)
print(test_accuracy)

0.7298636926889716
