In [369]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from scipy.sparse import hstack

In [370]:
def clean(x):
    x = " ".join(filter(lambda x:x[0]!='@', x.split()))
    return x 

In [371]:
data = pd.read_csv('final.csv', index_col=0)
data = data.drop('Unnamed: 0.1',1)
class_names = ['gender', 'age']
train = data.iloc[0:900,:]
#train
test = data.iloc[901:,:]
#test

In [372]:
train_text = train['text']
test_text = test['text']
all_text = pd.concat([train_text, test_text])

In [373]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text.values.astype('U'))
train_word_features = word_vectorizer.transform(train_text.values.astype('U'))
test_word_features = word_vectorizer.transform(test_text.values.astype('U'))

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text.values.astype('U'))
train_char_features = char_vectorizer.transform(train_text.values.astype('U'))
test_char_features = char_vectorizer.transform(test_text.values.astype('U'))

#.values.astype('U')

train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

# train_features = train_word_features
# test_features = test_word_features

# train_features = train_char_features
# test_features = test_char_features

In [374]:
scores = []
result = pd.DataFrame.from_dict({'id': test['id']})

for class_name in class_names:
    train_target = train[class_name]
    test_target = test[class_name]
    if class_name == "gender":
        classifier = LogisticRegression(solver='liblinear', multi_class='auto')
    else:
        classifier = LogisticRegression(solver='lbfgs', multi_class='multinomial')

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    result[class_name] = classifier.predict_proba(test_features)[:, 1]
    print("{} Training score: {}".format( class_name.capitalize(),classifier.score(train_features, train_target)))
    print("{} Testing score: {}".format( class_name.capitalize() ,classifier.score(test_features, test_target)))
    print("{} F1 Micro score: {}".format(class_name.capitalize() , f1_score(classifier.predict(test_features),test_target, average='micro')))
    print("{} F1 Macro score: {}".format(class_name.capitalize() , f1_score(classifier.predict(test_features),test_target, average='macro')))

# Word-Char features
# CV score for class gender is 0.608852727993274
# Gender Training score: 0.9922222222222222
# Gender Testing score: 0.5802469135802469
# CV score for class age is 0.4643698273125281
# Age Training score: 0.9822222222222222
# Age Testing score: 0.49382716049382713

# Word features
# CV score for class gender is 0.5866525060154989
# Gender Training score: 0.9422222222222222
# Gender Testing score: 0.5679012345679012
# CV score for class age is 0.4689288061012795
# Age Training score: 0.9633333333333334
# Age Testing score: 0.48148148148148145

# Char features
# CV score for class gender is 0.5922339754392333
# Gender Testing score: 0.9244444444444444
# Gender Testing score: 0.6296296296296297
# CV score for class age is 0.471092013850436
# Age Testing score: 0.9155555555555556
# Age Testing score: 0.49382716049382713

CV score for class gender is 0.608852727993274
Gender Training score: 0.9922222222222222
Gender Testing score: 0.5802469135802469
Gender F1 Micro score: 0.5802469135802469
Gender F1 Macro score: 0.4535714285714285
CV score for class age is 0.4643698273125281
Age Training score: 0.9822222222222222
Age Testing score: 0.49382716049382713
Age F1 Micro score: 0.49382716049382713
Age F1 Macro score: 0.4355130993062028


In [375]:
from sklearn import svm

#print(test_features)

scores = []
result = pd.DataFrame.from_dict({'id': test['id']})

for class_name in class_names:
    train_target = train[class_name]
    test_target = test[class_name]
    classifier = svm.SVC(gamma='scale')

#     cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3))
#     scores.append(cv_score)
#     print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    result[class_name] = classifier.predict(test_features)
    print("{} Training score: {}".format( class_name.capitalize() ,classifier.score(train_features, train_target)))
    print("{} Testing score: {}".format( class_name.capitalize() ,classifier.score(test_features, test_target)))
     print("{} F1 Micro score: {}".format(class_name.capitalize() , f1_score(classifier.predict(test_features),test_target, average='micro')))
    print("{} F1 Macro score: {}".format(class_name.capitalize() , f1_score(classifier.predict(test_features),test_target, average='macro')))

# Word-Char features
# Gender Training score: 0.9966666666666667
# Gender Testing score: 0.6049382716049383
# Age Training score: 0.9833333333333333
# Age Testing score: 0.43209876543209874

# Word features
# Gender Testing score: 0.9966666666666667
# Gender Testing score: 0.6049382716049383
# Age Testing score: 0.9822222222222222
# Age Testing score: 0.4444444444444444

# Char features 
# Training score 0.9977777777777778
# Testing score 0.6296296296296297
# Training score 0.9844444444444445
# Testing score 0.4444444444444444

IndentationError: unexpected indent (<ipython-input-375-c1e5649ef7c9>, line 21)