In [1]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
import pandas as pd
import numpy as np

In [2]:
df = pd.read_pickle('../data/feature_dumps/features.pkl')

In [3]:
df.columns.values

array(['Blog', 'Gender', 'POS', 'FMeasure', 'CharLength', 'TFPunctuation',
       'TFStopWords', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8',
       'f9', 'f10', 'f11', 'ConversationCount', 'AtHomeCount',
       'FamilyCount', 'TimeCount', 'WorkCount', 'PastActionsCount',
       'GamesCount', 'InternetCount', 'LocationCount', 'FunCount',
       'Food/ClothesCount', 'PoeticCount', 'Books/MoviesCount',
       'ReligionCount', 'RomanceCount', 'SwearingCount', 'PoliticsCount',
       'MusicCount', 'SchoolCount', 'BusinessCount', 'PositiveCount',
       'NegativeCount', 'EmotionCount', 'ProperNounCount',
       'SentenceCount', 'AvgSentLength', 'NN', 'NNPS', 'VBD', 'VBZ', 'MD',
       'EX', 'IN', 'VB', 'JJR', 'JJS', 'PRP', 'WDT', 'JJ', 'VBP', 'NNS',
       'VBN', 'DT', 'RB', 'WP', 'VBG', 'NNP', 'RBR', 'PRP$', 'JJ NN',
       'VBP VB', 'VBD PRP', 'IN IN', 'NNP VBZ', 'RB DT', 'NN VBG',
       'IN JJ', 'NN NN', 'RB VBZ', 'VBG DT', 'NN NNS', 'VBZ JJ', 'IN RB',
       'JJ JJ', 'NN VBZ', 'IN V

In [77]:
dfother = df.loc[:, (df.columns != 'Gender') & (df.columns != 'Blog')]

In [78]:
dftarget = df.Gender

In [79]:
dfBlog = df.Blog

In [80]:
cv = CountVectorizer()
blog_train = cv.fit_transform(dfBlog)

In [81]:
bt = hstack((blog_train, dfother), format='csr')

In [82]:
clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1)

In [83]:
clf.fit(bt, dftarget)
clf.score(bt, dftarget)

1.0

In [31]:
clf.feature_importances_

array([4.06360093e-05, 2.91184892e-04, 6.86725953e-06, ...,
       5.47559813e-04, 1.21165605e-03, 8.69622028e-04])

In [32]:
bt.shape

(3212, 52749)

In [33]:
model = SelectFromModel(clf, prefit=True)
bt_new = model.transform(bt)

In [34]:
bt_new.shape

(3212, 7954)

In [41]:
bt_new.todense()

0.0

In [52]:
from sklearn.svm import LinearSVC, SVC

In [53]:
svmclf = LinearSVC(max_iter=-1)

In [54]:
svmclf.fit(bt_new, dftarget)
svmclf.score(bt_new, dftarget)



0.47976338729763385

In [63]:
svc = SVC(gamma='auto', C=20)
svc.fit(bt_new, dftarget)
svc.score(bt_new, dftarget)

0.9772727272727273

In [64]:
svcdefault = SVC(gamma='auto', C=20)
svcdefault.fit(bt, dftarget)
svcdefault.score(bt, dftarget)

0.8029265255292652

In [4]:
from sklearn.model_selection import train_test_split
from random import randint

In [5]:
male_count = len(df[df.Gender == 'M'].index)
female_count = len(df[df.Gender == 'F'].index)
frac = female_count / male_count

blog_new = df.copy()
blog_new = blog_new.drop(blog_new[blog_new.Gender == 'M'].sample(frac=1 - frac).index)
blog_new['Blog'] = blog_new['Blog'].values.astype(str)

blog_train, blog_test, gender_train, gender_test = train_test_split(blog_new.loc[:, blog_new.columns != 'Gender'],
                                                                    blog_new.Gender, test_size=0.25,
                                                                    random_state=randint(1, 101), shuffle=True,
                                                                    stratify=blog_new.Gender)

In [6]:
cv = CountVectorizer()
cv.fit(blog_train.Blog)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
btr = cv.transform(blog_train.Blog)
bte = cv.transform(blog_test.Blog)

In [13]:
blog_train = hstack((btr, blog_train.loc[:, (blog_train.columns != 'Blog') | (blog_train.columns != 'POS')]), format='csr')
blog_test = hstack((bte, blog_test.loc[:, blog_test.columns != 'Blog']), format='csr')

TypeError: no supported conversion for types: (dtype('int64'), dtype('O'))

In [134]:
clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1)
clf.fit(blog_train, gender_train)
clf.score(blog_test, gender_test)

0.6459143968871596

In [135]:
model = SelectFromModel(clf, prefit=True)

In [136]:
blog_train = model.transform(blog_train)
blog_test = model.transform(blog_test)

In [137]:
clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1)
clf.fit(blog_train, gender_train)
clf.score(blog_test, gender_test)

0.6083009079118028

In [138]:
svm = SVC(gamma='auto', C=20)
svm.fit(blog_train, gender_train)
svm.score(blog_test, gender_test)

0.5758754863813229

In [139]:
from sklearn.neural_network import MLPClassifier

In [140]:
mlp = MLPClassifier(hidden_layer_sizes=(35, 45), activation='relu', learning_rate='constant',
                                   max_iter=300, early_stopping=True, solver='adam')

In [141]:
mlp.fit(blog_train, gender_train)
mlp.score(blog_test, gender_test)

0.6575875486381323

In [142]:
from xgboost import XGBClassifier

In [173]:
xgb = XGBClassifier(n_jobs=-1, max_depth=3, n_estimators=150, booster='dart', importance_type='total_gain')

In [174]:
xgb.fit(blog_train, gender_train)
xgb.score(blog_test, gender_test)

0.6731517509727627