In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import LinearSVC


In [2]:
# Hyper-parameters for the three estimators compared in this notebook.
# Each dict is keyed by the constructor argument name of its estimator.

# GradientBoostingClassifier settings.
gbc_params = dict(
    learning_rate=0.85,
    max_features=0.05,
    max_depth=10,
    min_samples_split=520,
    min_samples_leaf=15,
    n_estimators=400,
)

# AdaBoostClassifier settings.
# NOTE(review): newer scikit-learn releases deprecate the "SAMME.R" algorithm;
# confirm against the installed version.
abc_params = dict(
    algorithm="SAMME.R",
    learning_rate=0.90,
    n_estimators=500,
)

# LinearSVC settings.
lsvc_params = dict(C=0.1)

In [3]:
# Training data: nine CSV files, gender_final_small1.csv .. gender_final_small9.csv,
# located under data/subsets two directory levels above the working directory.
base = Path().resolve().parents[1] / r'data/subsets/gender_final_small'
locations = [f"{base}{i}.csv" for i in range(1, 10)]

# Have 9000 names sampled at random.
frames = [pd.read_csv(csv_path) for csv_path in locations]
X = pd.concat(frames, axis=0)
# Split the label column off; `pop` removes "Gender" from X and returns it.
y = X.pop("Gender")

# Validation data: a single CSV, loaded and split the same way.
# NOTE(review): confirm this file shares no rows with the training files above.
validation_data = Path().resolve().parents[1] / 'data/gender_final_small.csv'
X_val = pd.read_csv(validation_data)
y_val = X_val.pop("Gender")

In [4]:
# Turn each name into binary character n-gram indicators (n = 2..10), then
# pass the result through MaxAbsScaler to convert it to floats.
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(2, 10),
    decode_error='replace',
    binary=True,
)
scaler = MaxAbsScaler()

# Vocabulary and scaling are learned from the training names only; the
# validation names are pushed through the already-fitted transformers.
# X and X_val are rebound from DataFrames to matrices here, so the load cell
# has to be re-run before this cell can be executed again.
X = scaler.fit_transform(vectorizer.fit_transform(X['Name']))
X_val = scaler.transform(vectorizer.transform(X_val['Name']))

In [5]:
# Build the three candidate models from the hyper-parameter dicts defined in
# the configuration cell. Unpacking each dict keeps the constructor call in
# sync with its config instead of repeating every key by hand.
abc_clf = AdaBoostClassifier(**abc_params, random_state=0)

gbc_clf = GradientBoostingClassifier(**gbc_params, random_state=0)

# LinearSVC is seeded like the two ensembles above so that its scores are
# reproducible from one run to the next.
lsvc_clf = LinearSVC(**lsvc_params, random_state=0)


In [11]:
# Fit gradient boosting on the full feature matrix (fit returns the estimator
# itself) and wrap it in a selector; prefit=True makes the selector use the
# model as fitted here.
selector = SelectFromModel(gbc_clf.fit(X, y), prefit=True)

# Reduce the training matrix to the selected columns and report both shapes.
X_selected = selector.transform(X)
print("Before: ", X.shape)
print("After: ", X_selected.shape)

Before:  (9000, 63889)
After:  (9000, 1088)


In [12]:
# Mean accuracy of gbc_clf on the training data (X, y).
gbc_clf.score(X, y)

0.9297777777777778

In [15]:
# Apply the existing selector to the validation matrix so that it is reduced
# to the same columns as the training matrix, then show the resulting shape.
X_val_selected = selector.transform(X_val)
print(X_val_selected.shape)

(1000, 1088)


In [22]:
# Baseline: linear SVM trained and scored on the full feature matrix.
lsvc_clf.fit(X, y)
print("Using all features:Training ", lsvc_clf.score(X, y))
print("Using all features:Validation ", lsvc_clf.score(X_val, y_val))

# Use the fitted model to pick features, and reduce both matrices with the
# same selector so their columns stay aligned.
selector = SelectFromModel(lsvc_clf, prefit=True)
X_selected = selector.transform(X)
X_val_selected = selector.transform(X_val)
print("Shaped reduced from {} to {}, difference is {}".format(X.shape[1],
                                                              X_selected.shape[1],
                                                              X.shape[1] - X_selected.shape[1]))

# Refit an unfitted copy on the reduced matrix. With prefit=True the selector
# keeps a reference to lsvc_clf rather than a copy, so refitting lsvc_clf
# itself would leave `selector` pointing at a model with a different feature
# count and would silently turn lsvc_clf into the reduced model.
lsvc_selected_clf = clone(lsvc_clf).fit(X_selected, y)
print("Using selected features:Training ", lsvc_selected_clf.score(X_selected, y))
print("Using selected features:Validation ", lsvc_selected_clf.score(X_val_selected, y_val))

Using all features:Training  0.9934444444444445
Using all features:Validation  0.838
Shaped reduced from 63889 to 23805, difference is 40084
Using selected features:Training  0.9921111111111112
Using selected features:Validation  0.841


In [23]:
# Baseline: AdaBoost trained and scored on the full feature matrix.
abc_clf.fit(X, y)
print("Using all features:Training ", abc_clf.score(X, y))
print("Using all features:Validation ", abc_clf.score(X_val, y_val))

# Use the fitted model to pick features, and reduce both matrices with the
# same selector so their columns stay aligned.
selector = SelectFromModel(abc_clf, prefit=True)
X_selected = selector.transform(X)
X_val_selected = selector.transform(X_val)
print("Shaped reduced from {} to {}, difference is {}".format(X.shape[1],
                                                              X_selected.shape[1],
                                                              X.shape[1] - X_selected.shape[1]))

# Refit an unfitted copy on the reduced matrix. With prefit=True the selector
# keeps a reference to abc_clf rather than a copy, so refitting abc_clf itself
# would leave `selector` pointing at a model with a different feature count
# and would silently turn abc_clf into the reduced model.
abc_selected_clf = clone(abc_clf).fit(X_selected, y)
print("Using selected features:Training ", abc_selected_clf.score(X_selected, y))
print("Using selected features:Validation ", abc_selected_clf.score(X_val_selected, y_val))

Using all features:Training  0.8517777777777777
Using all features:Validation  0.786
Shaped reduced from 63889 to 434, difference is 63455
Using selected features:Training  0.8517777777777777
Using selected features:Validation  0.786


In [19]:
# Baseline: gradient boosting trained and scored on the full feature matrix.
gbc_clf.fit(X, y)
print("Using all features:Training ", gbc_clf.score(X, y))
print("Using all features:Validation ", gbc_clf.score(X_val, y_val))

# Use the fitted model to pick features, and reduce both matrices with the
# same selector so their columns stay aligned.
selector = SelectFromModel(gbc_clf, prefit=True)
X_selected = selector.transform(X)
X_val_selected = selector.transform(X_val)
print("Shaped reduced from {} to {}, difference is {}".format(X.shape[1],
                                                              X_selected.shape[1],
                                                              X.shape[1] - X_selected.shape[1]))

# Refit an unfitted copy on the reduced matrix. With prefit=True the selector
# keeps a reference to gbc_clf rather than a copy, so refitting gbc_clf itself
# would leave `selector` pointing at a model with a different feature count
# and would silently turn gbc_clf into the reduced model.
gbc_selected_clf = clone(gbc_clf).fit(X_selected, y)
print("Using selected features:Training ", gbc_selected_clf.score(X_selected, y))
print("Using selected features:Validation ", gbc_selected_clf.score(X_val_selected, y_val))

Using all features:Training  0.9297777777777778
Using all features:Validation  0.8
Using selected features:Training  0.9295555555555556
Using selected features:Validation  0.816
