In [6]:
import pandas as pd
import numpy as np
from pickle import dump, load

In [7]:
features = pd.read_pickle('../data/feature_dumps/features.pkl')

In [3]:
with open('../data/POS_Tags.pkl', 'rb') as pkldump:
    pos_tags = load(pkldump)

In [4]:
pos_tags = list(map(lambda tags: ' '.join(tags), pos_tags))

In [5]:
features.insert(2, 'POS', pos_tags)

ValueError: cannot insert POS, already exists

In [34]:
pd.to_pickle(features, '../data/feature_dumps/features.pkl')

In [35]:
noun_set = {'NN', 'NNS', 'NNP', 'NNPS'}
adjective_set = {'JJ', 'JJR', 'JJS', 'WDT'}
preposition_set = {'IN'}
article_set = {'DET', 'DT'}
pronoun_set = {'PRP', 'PRP$', 'WP', 'WP$'}
adverb_set = {'RB', 'RBR', 'RBS', 'EX'}
interjection_set = {'UH'}
verb_set = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD'}

In [36]:
fm = []
for tags in features.POS.values:
    fnoun = fadj = fprep = fart = fpron = fverb = fadv = fint = 0
    for tag in tags.split():
        if tag in noun_set:
            fnoun += 1
        elif tag in adjective_set:
            fadj += 1
        elif tag in preposition_set:
            fprep += 1
        elif tag in article_set:
            fart += 1
        elif tag in pronoun_set:
            fpron += 1
        elif tag in adverb_set:
            fadv += 1
        elif tag in interjection_set:
            fint += 1
        elif tag in verb_set:
            fverb += 1
    f_measure = 0.5 * ((fnoun + fadj + fprep + fart) - (fpron + fverb + fadv + fint) + 100)
    fm.append(f_measure)

In [40]:
features['FMeasure'] = fm

In [42]:
pd.to_pickle(features, '../data/feature_dumps/features.pkl')

In [8]:
features.columns.values

array(['Blog', 'Gender', 'POS', 'FMeasure', 'CharLength', 'TFPunctuation',
       'TFStopWords', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8',
       'f9', 'f10', 'f11', 'ConversationCount', 'AtHomeCount',
       'FamilyCount', 'TimeCount', 'WorkCount', 'PastActionsCount',
       'GamesCount', 'InternetCount', 'LocationCount', 'FunCount',
       'Food/ClothesCount', 'PoeticCount', 'Books/MoviesCount',
       'ReligionCount', 'RomanceCount', 'SwearingCount', 'PoliticsCount',
       'MusicCount', 'SchoolCount', 'BusinessCount', 'PositiveCount',
       'NegativeCount', 'EmotionCount', 'ProperNounCount',
       'SentenceCount', 'AvgSentLength', 'NN', 'NNPS', 'VBD', 'VBZ', 'MD',
       'EX', 'IN', 'VB', 'JJR', 'JJS', 'PRP', 'WDT', 'JJ', 'VBP', 'NNS',
       'VBN', 'DT', 'RB', 'WP', 'VBG', 'NNP', 'RBR', 'PRP$', 'JJ NN',
       'VBP VB', 'VBD PRP', 'IN IN', 'NNP VBZ', 'RB DT', 'NN VBG',
       'IN JJ', 'NN NN', 'RB VBZ', 'VBG DT', 'NN NNS', 'VBZ JJ', 'IN RB',
       'JJ JJ', 'NN VBZ', 'IN V

In [9]:
features[features.Gender == 'M'].FMeasure.values.mean()

104.69269898264513

In [10]:
features[features.Gender == 'F'].FMeasure.values.mean()

88.00778715120052

In [11]:
n_features = ['CharLength', 'TFPunctuation',
       'TFStopWords', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8',
       'f9', 'f10', 'f11', 'ConversationCount', 'AtHomeCount',
       'FamilyCount', 'TimeCount', 'WorkCount', 'PastActionsCount',
       'GamesCount', 'InternetCount', 'LocationCount', 'FunCount',
       'Food/ClothesCount', 'PoeticCount', 'Books/MoviesCount',
       'ReligionCount', 'RomanceCount', 'SwearingCount', 'PoliticsCount',
       'MusicCount', 'SchoolCount', 'BusinessCount', 'PositiveCount',
       'NegativeCount', 'EmotionCount', 'ProperNounCount',
       'SentenceCount', 'AvgSentLength', 'UpperCaseChars',
       'UpperCaseWords', 'TitleCaseWords', 'FMeasure']

In [12]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [14]:
scaler = MinMaxScaler()

In [15]:
scaled_features = scaler.fit_transform(features[['CharLength', 'TFPunctuation',
       'TFStopWords', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8',
       'f9', 'f10', 'f11', 'ConversationCount', 'AtHomeCount',
       'FamilyCount', 'TimeCount', 'WorkCount', 'PastActionsCount',
       'GamesCount', 'InternetCount', 'LocationCount', 'FunCount',
       'Food/ClothesCount', 'PoeticCount', 'Books/MoviesCount',
       'ReligionCount', 'RomanceCount', 'SwearingCount', 'PoliticsCount',
       'MusicCount', 'SchoolCount', 'BusinessCount', 'PositiveCount',
       'NegativeCount', 'EmotionCount', 'ProperNounCount',
       'SentenceCount', 'AvgSentLength', 'UpperCaseChars',
       'UpperCaseWords', 'TitleCaseWords', 'FMeasure']])

  return self.partial_fit(X, y)


In [16]:
features[['CharLength', 'TFPunctuation',
       'TFStopWords', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8',
       'f9', 'f10', 'f11', 'ConversationCount', 'AtHomeCount',
       'FamilyCount', 'TimeCount', 'WorkCount', 'PastActionsCount',
       'GamesCount', 'InternetCount', 'LocationCount', 'FunCount',
       'Food/ClothesCount', 'PoeticCount', 'Books/MoviesCount',
       'ReligionCount', 'RomanceCount', 'SwearingCount', 'PoliticsCount',
       'MusicCount', 'SchoolCount', 'BusinessCount', 'PositiveCount',
       'NegativeCount', 'EmotionCount', 'ProperNounCount',
       'SentenceCount', 'AvgSentLength', 'UpperCaseChars',
       'UpperCaseWords', 'TitleCaseWords', 'FMeasure']] = scaled_features

In [17]:
for feature in n_features:
    print("Means for ", feature)
    print("Male = ", features[features.Gender == 'M'][feature].values.mean(), 
         "Female = ", features[features.Gender == 'F'][feature].values.mean())
    print()

Means for  CharLength
Male =  0.07346503932184074 Female =  0.0689864719147496

Means for  TFPunctuation
Male =  0.04771450239140285 Female =  0.04805362792636811

Means for  TFStopWords
Male =  0.06028191885751045 Female =  0.058479253226439556

Means for  f1
Male =  0.016130154621240587 Female =  0.01260130087680908

Means for  f2
Male =  0.026290107259586613 Female =  0.018289821794039835

Means for  f3
Male =  0.02492091989398991 Female =  0.03198294243070362

Means for  f4
Male =  0.014961101137043686 Female =  0.01058565866320571

Means for  f5
Male =  0.030864751645721125 Female =  0.02310188189487346

Means for  f6
Male =  0.037159634949132256 Female =  0.031108046722907203

Means for  f7
Male =  0.030847070344377347 Female =  0.022948498613651114

Means for  f8
Male =  0.03920436236414693 Female =  0.039185344182099534

Means for  f9
Male =  0.011316032860018498 Female =  0.011090791103769688

Means for  f10
Male =  0.01286654697785757 Female =  0.016872160934458143

Means for

In [19]:
from math import fabs
for feature in n_features:
    mm = features[features.Gender == 'M'][feature].values.mean() 
    fm = features[features.Gender == 'F'][feature].values.mean()
    if 0.05 < fabs(mm - fm) < 0.08:
        print(feature, fabs(mm - fm))

In [20]:
pd.to_pickle(features, '../data/feature_dumps/scaled_features.pkl')