In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
import time
import glob
import datetime
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

  from pandas.core import datetools


In [2]:
# Load the dataset from CSV. Later cells read its `gender` and `line` columns.
data = pd.read_csv(filepath_or_buffer="../Data/data.csv")

In [3]:
# Target label: True where the `gender` string contains 'female'.
# NOTE(review): rows with a missing `gender` would yield NaN here rather than
# a boolean -- confirm the column has no gaps before modelling.
data['g'] = data['gender'].str.contains(pat='female')

In [4]:
# Text of each row; missing entries are replaced by empty strings so that the
# vectorizer only ever receives `str` values.
texts = data['line'].fillna('')

# Count-based n-gram features:
#   * bigrams and trigrams only (no single words),
#   * English stop words removed before the n-grams are formed,
#   * vocabulary capped at the 1000 most frequent n-grams,
#   * raw counts rather than 0/1 presence flags (binary=False).
# `ngram_range` is documented as a (min_n, max_n) tuple; a list is rejected by
# the parameter validation in current scikit-learn releases.
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(2, 3),
                             stop_words='english',
                             binary=False)

In [5]:
# Random forest used to check whether the n-gram counts predict the `g` label.
# `random_state` is pinned so the CV scores (and the feature importances
# computed from this model later) are repeatable from run to run.
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Learn the n-gram vocabulary and build the sample x feature count matrix
# (one column per n-gram) in a single pass.
# `.toarray()` gives a plain ndarray; `.todense()` returns an `np.matrix`,
# which current scikit-learn releases refuse as estimator input.
X = vectorizer.fit_transform(texts).toarray()
y = data['g']

# The fold count is stated explicitly: the library default changed from 3 to 5
# in scikit-learn 0.22, and the recorded output below has three scores.
# NOTE(review): the vocabulary is learned from all rows before the folds are
# split, so the held-out folds are not fully unseen by the feature extraction.
# NOTE(review): `cross_val_score` is imported from `sklearn.cross_validation`
# in the import cell; that module was removed in scikit-learn 0.20 in favour of
# `sklearn.model_selection`.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.51712971  0.51577926  0.52175943], Average AUC 0.5182228010501405


In [6]:
# Refit on the full data set so the importances reflect every row.
model.fit(X, y)

# Recover the feature names in column order from the fitted vocabulary
# (term -> column index). This works on every scikit-learn version, whereas
# `get_feature_names()` was removed in 1.2.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# One row per n-gram, paired with the forest's importance score for its column.
feature_importances = pd.DataFrame({'Features': feature_names,
                                    'Importance Score': model.feature_importances_})
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

             Features  Importance Score
362      king landing          0.008629
639       night watch          0.008590
581    master kraznys          0.005926
240         gift gods          0.005917
509    lord commander          0.005563
505      lord baelish          0.005335
163          don know          0.005024
907           ve seen          0.004901
442       little dove          0.004794
181          don want          0.004776
774    seven kingdoms          0.004447
15         arya stark          0.004340
44          bran wake          0.004202
791          son king          0.004122
17          aunt lysa          0.004080
328          jon snow          0.003998
62       castle black          0.003880
821         sun stars          0.003725
432       like father          0.003597
140         didn want          0.003524
364        king north          0.003524
523        lord light          0.003267
319       iron throne          0.003253
754     ser barristan          0.003252
