In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from random import randint

In [3]:
# Load the feature dump that the rest of the notebook works on.
# Presumably a DataFrame holding 'Gender', 'Blog' and the engineered feature
# columns, since that is how get_features() consumes it -- TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load dumps that
# were produced by a trusted source.
features = pd.read_pickle('../data/feature_dumps/features.pkl')

In [4]:
def get_features(df, use_og_words=True, random_state=None):
    """Balance the genders, split into train/test and build the model inputs.

    The larger of the two gender groups ('M' / 'F') is randomly down-sampled
    so that both groups have exactly the same number of rows. The balanced
    frame is then split 75% / 25% (stratified on ``Gender``) and a
    ``CountVectorizer`` is fitted on the training blogs only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Gender`` column, a ``Blog`` column and every column
        named in ``feature_set`` below. ``df`` itself is not modified.
    use_og_words : bool, default True
        If True, the returned matrices are the word counts of ``Blog``
        horizontally stacked with the ``feature_set`` columns (word-count
        columns first). If False, only the ``feature_set`` columns are
        returned.
    random_state : int or None, default None
        Seed used for both the down-sampling and the train/test split. With
        the default ``None`` the down-sampling is unseeded and the split seed
        is drawn with ``randint(1, 101)``, i.e. every call gives a different
        result (the historical behaviour). Pass an int for reproducible runs.

    Returns
    -------
    tuple
        ``(blog_train, blog_test, gender_train, gender_test, test_blogs,
        test_gender)``. ``blog_train`` / ``blog_test`` are CSR sparse
        matrices when ``use_og_words`` is True and DataFrames otherwise.
        ``test_blogs`` and ``test_gender`` are copies of the untransformed
        test rows and their labels.

    Raises
    ------
    KeyError
        If any column listed in ``feature_set`` is missing from ``df``.
    ValueError
        If ``df`` has no 'M' rows or no 'F' rows.
    """
    feature_set = ['FMeasure', 'CharLength', 'TFPunctuation', 'TFStopWords', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7',
                   'f8', 'f9', 'f10', 'f11', 'ConversationCount', 'AtHomeCount', 'FamilyCount', 'TimeCount',
                   'WorkCount', 'PastActionsCount', 'GamesCount', 'InternetCount', 'LocationCount', 'FunCount',
                   'Food/ClothesCount', 'PoeticCount', 'Books/MoviesCount', 'ReligionCount', 'RomanceCount',
                   'SwearingCount', 'PoliticsCount', 'MusicCount', 'SchoolCount', 'BusinessCount', 'PositiveCount',
                   'NegativeCount', 'EmotionCount', 'ProperNounCount', 'SentenceCount', 'AvgSentLength', 'NN', 'NNPS',
                   'VBD', 'VBZ', 'MD', 'EX', 'IN', 'VB', 'JJR', 'JJS', 'PRP', 'WDT', 'JJ', 'VBP', 'NNS', 'VBN', 'DT',
                   'RB', 'WP', 'VBG', 'NNP', 'RBR', 'PRP$', 'JJ NN', 'VBP VB', 'VBD PRP', 'IN IN', 'NNP VBZ', 'RB DT',
                   'NN VBG', 'IN JJ', 'NN NN', 'RB VBZ', 'VBG DT', 'NN NNS', 'VBZ JJ', 'IN RB', 'JJ JJ', 'NN VBZ',
                   'IN VBG', 'VBP DT', 'VB NN', 'NNS VBP', 'DT NNP', 'PRP VBZ', 'PRP VBD', 'PRP VB', 'NN PRP', 'NN DT',
                   'VBZ VB', 'PRP VBP', 'PRP$ JJ', 'VBD IN', 'VB JJ', 'NN JJ', 'RB VB', 'JJ NNP', 'RB VBG', 'VBZ PRP',
                   'VBD DT', 'RB RB', 'JJ VB', 'PRP RB', 'JJ IN', 'VBD VB', 'VB IN', 'VBP PRP', 'VBD RB', 'VBG IN',
                   'PRP IN', 'VB PRP', 'NN RB', 'NNP NN', 'VB VBN', 'NN NNP', 'IN NN', 'VBP VBN', 'NN WDT', 'RB IN',
                   'DT JJ', 'RB VBD', 'VBZ IN', 'NN MD', 'VB DT', 'NNS IN', 'NNP RB', 'VB PRP$', 'VBP IN', 'RB VBP',
                   'NNS RB', 'DT NN', 'VBZ VBN', 'MD RB', 'NNP IN', 'NN VBD', 'JJ NNS', 'NN VB', 'IN PRP$', 'MD VB',
                   'RB PRP', 'NNS VB', 'VBZ DT', 'VBG NN', 'VBN IN', 'PRP$ NNS', 'VB VB', 'VBP RB', 'NNP NNP', 'NN IN',
                   'VB RB', 'VBG PRP', 'PRP MD', 'IN DT', 'NNP VBD', 'IN NNS', 'IN NNP', 'RB JJ', 'IN PRP', 'VBD JJ',
                   'RB VBN', 'DT NNS', 'VBD VBN', 'PRP$ NN', 'VBZ RB', 'VBP JJ', 'DT NN VBZ', 'PRP VBD IN', 'JJ NNP NN',
                   'DT NNP NNP', 'RB IN PRP', 'IN JJ NN', 'JJ NN VB', 'IN NN IN', 'IN DT NNS', 'NNS IN PRP',
                   'NN IN NNS', 'VBN IN DT', 'PRP VBP JJ', 'DT NNP NN', 'VBZ DT NN', 'PRP VBP RB', 'IN DT JJ',
                   'MD RB VB', 'JJ NN IN', 'RB DT NN', 'VBG DT NN', 'NNP NN NN', 'DT NNS IN', 'DT NN IN', 'JJ NN NN',
                   'VB DT NN', 'DT JJ NNS', 'NN IN PRP$', 'IN JJ NNS', 'NNS IN NN', 'VBZ DT JJ', 'PRP VBP DT',
                   'NNP NNP NN', 'PRP VBP PRP', 'VB DT JJ', 'VB PRP$ NN', 'DT JJ JJ', 'IN PRP VBP', 'DT NN VB',
                   'IN NNP NNP', 'NNS IN DT', 'VB IN DT', 'IN DT NNP', 'NN IN NNP', 'IN PRP$ NN', 'DT JJ NN',
                   'NN NN IN', 'IN NN NN', 'VBP DT NN', 'PRP MD VB', 'PRP$ NN NN', 'RB JJ NN', 'DT NN NN', 'IN PRP$ JJ',
                   'VBD DT JJ', 'PRP VBP VB', 'NN NN NN', 'VBD DT NN', 'NNP NNP NNP', 'PRP VBD VB', 'JJ NNS IN',
                   'IN DT NN', 'NN IN NN', 'PRP VBP IN', 'PRP VBD DT', 'IN PRP VBD', 'NN IN JJ', 'DT NN RB',
                   'NN IN PRP', 'NN IN DT', 'RB IN DT', 'JJ JJ NN', 'PRP$ NN IN', 'RB PRP VBP', 'PRP$ JJ NN',
                   'IN DT JJ NN', 'NN IN DT NN', 'NN IN PRP$ NN', 'NN IN DT JJ', 'IN DT NN VB', 'DT JJ NN IN',
                   'IN DT NN NN', 'DT JJ NN NN', 'VB DT JJ NN', 'IN DT NNP NN', 'DT NN IN PRP', 'JJ NN IN DT',
                   'DT NN IN NN', 'NNS IN DT NN', 'VB DT NN IN', 'NN IN JJ NN', 'DT NN IN DT', 'JJ NN IN NN',
                   'IN DT NN IN', 'DT NN IN DT NN', 'NN IN DT JJ NN', 'DT JJ NN IN NN', 'JJ NN IN DT NN',
                   'UpperCaseChars', 'UpperCaseWords', 'TitleCaseWords', 'DT VBZ', 'DT RB', 'VBD NN',
                   'NNP VB', 'IN NNP NN', 'JJ NNS VB', 'PRP$ NN VB', 'VBN IN NN', 'NN NN VB', 'PRP RB VB', 'VBZ NN',
                   'DT IN', 'DT VB', 'RB NN', 'WP VB', 'VBP NN', 'DT DT', 'VBN NN', 'IN VB', 'WDT VB', 'IN PRP VB',
                   'NNP NNP VB', 'NN IN VB', 'RB PRP VB', 'NN PRP VB']

    # Fail fast: without this check a missing column is only discovered after
    # the (comparatively expensive) vectorizer fit further down.
    missing = [col for col in feature_set if col not in df.columns]
    if missing:
        raise KeyError('Missing feature columns: {}'.format(missing))

    is_male = df.Gender == 'M'
    is_female = df.Gender == 'F'
    male_count = int(is_male.sum())
    female_count = int(is_female.sum())
    if male_count == 0 or female_count == 0:
        raise ValueError("Both genders must be present to balance the data "
                         "(found {} 'M' rows and {} 'F' rows)".format(male_count, female_count))

    # Down-sample whichever gender is larger by the exact surplus. The former
    # fraction-based version only worked when males were the majority (the
    # fraction went negative otherwise) and balanced the groups only
    # approximately.
    surplus = abs(male_count - female_count)
    majority_mask = is_male if male_count >= female_count else is_female

    blog_new = df.copy()
    if surplus:
        dropped = blog_new[majority_mask].sample(n=surplus, random_state=random_state).index
        blog_new = blog_new.drop(dropped)
    blog_new['Blog'] = blog_new['Blog'].values.astype(str)

    # Keep the historical "random seed per call" behaviour unless the caller
    # explicitly asks for a reproducible split.
    split_seed = randint(1, 101) if random_state is None else random_state

    blog_train, blog_test, gender_train, gender_test = train_test_split(
        blog_new.loc[:, blog_new.columns != 'Gender'],
        blog_new.Gender,
        test_size=0.25,
        random_state=split_seed,
        shuffle=True,
        stratify=blog_new.Gender)

    # Untransformed copies of the test split, returned alongside the matrices.
    test_blogs = blog_test.copy()
    test_gender = gender_test.copy()

    # The vocabulary is learned from the training blogs only, so no test-set
    # words leak into the features.
    cvectorizer = CountVectorizer()
    cvectorizer.fit(blog_train['Blog'])

    if use_og_words:
        blog_train = hstack((cvectorizer.transform(blog_train.Blog),
                             blog_train[feature_set]), format='csr')
        blog_test = hstack((cvectorizer.transform(blog_test.Blog),
                            blog_test[feature_set]), format='csr')
    else:
        blog_train = blog_train[feature_set]
        blog_test = blog_test[feature_set]

    return blog_train, blog_test, gender_train, gender_test, test_blogs, test_gender

In [5]:
# Build the train/test inputs (word counts + engineered features, the default)
# together with untransformed copies of the test rows and labels.
(blog_train, blog_test,
 gender_train, gender_test,
 test_blogs, test_gender) = get_features(features)

In [7]:
# Linear-kernel SVM used as the ranking estimator; max_iter bounds the solver
# iterations of every individual fit.
svc = SVC(kernel='linear', max_iter=1000)
# Recursive feature elimination scored by 2-fold stratified CV accuracy.
# step=0.1 removes 10% of the features per round. The default (step=1) removes
# a single feature per round, which on the word-count + engineered feature
# matrix means one SVM fit per feature per fold -- the recorded run of the fit
# cell had to be interrupted. Lower `step` for a finer search if time allows.
rfecv = RFECV(estimator=svc, step=0.1, cv=StratifiedKFold(2), scoring='accuracy')

In [8]:
# Run the cross-validated recursive feature elimination on the training split.
# NOTE(review): the recorded execution of this cell ended in a
# KeyboardInterrupt, so the cells below have no saved results -- expect this
# fit to be the slow step of the notebook.
rfecv.fit(blog_train, gender_train)



KeyboardInterrupt: 

In [None]:
# Number of features selected by the cross-validated elimination.
print(rfecv.n_features_)
# Mean cross-validation score for each evaluated feature count.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `cv_results_`, so prefer the new attribute and fall back to the
# old one only on releases that do not have it.
if hasattr(rfecv, 'cv_results_'):
    print(rfecv.cv_results_['mean_test_score'])
else:
    print(rfecv.grid_scores_)