In [150]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score, KFold
from scipy import sparse
%matplotlib inline

In [212]:
train = pd.read_csv('linear_train.txt', sep=',', header=None)
train.columns = ['word', 'y']
test = pd.read_csv('linear_test.txt', sep=',', header=None)
test.columns = ['word']
print(train.head())
print(test.head())
test.shape

        word  y
0  ﻿Аалтонен  1
1        Аар  0
2      Аарон  0
3      ААРОН  0
4     Аарона  0
     word
0  ﻿Аалто
1     ААР
2    Аара
3    Ааре
4   Аарон


(188920, 1)

In [213]:
train['word'] = train['word'].apply(lambda x: '*' + x + '#')
test['word'] = test['word'].apply(lambda x: '*' + x + '#')

In [214]:
train.head()

Unnamed: 0,word,y
0,*﻿Аалтонен#,1
1,*Аар#,0
2,*Аарон#,0
3,*ААРОН#,0
4,*Аарона#,0


In [141]:
def get_lens_features(df):
    lens = pd.DataFrame()
    for i in range(5, 20):
        lens[str(i)] = df['word'].apply(lambda x: 1 if (len(x) == i) else 0)
    return lens
get_lens_features(train).as_matrix().shape

(101408, 15)

In [3]:
not_surnames_set = set(train[train['y'] == 0]['word'])
surnames_set = set(train[train['y'] == 1]['word'])
print(len(not_surnames_set))
len(surnames_set)

90770


10638

In [217]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=5, max_df=.7,
                             max_features=None,
                             ngram_range=(2, 5),
                             lowercase=False,
                             analyzer='char_wb', 
                             binary=True)
vectorizer.fit(train['word'])
#vectorizer.fit(pd.concat([train['word'], test['word']], axis=0))
X = vectorizer.transform(train['word'])
print(X.shape)
X = sparse.hstack([X, get_lens_features(train).as_matrix()])
X_test = vectorizer.transform(test['word'])
print(X_test.shape)
X_test = sparse.hstack([X_test, get_lens_features(test).as_matrix()])
print(X_test.shape)
X.shape

(101408, 82979)
(188920, 82979)
(188920, 82994)


(101408, 82994)

In [225]:
for C in [.2, .15, .1, .05]:
    print(cross_val_score(LogisticRegression(penalty='l2',class_weight='balanced',max_iter=300,C=C), X, train['y'], 
                cv=ShuffleSplit(n_splits=5, test_size=0.6, random_state=0), 
                scoring='roc_auc').mean())

0.904442748052
0.904831956423
0.905006757268
0.904107448837


In [73]:
from collections import Counter
def generate_features(W_train, n_features=4, n_top=1000):
    cntr = Counter(W_train.apply(lambda x: x[-4:]))
    print(cntr.most_common(10))
    for i in range(n_top):
        X_i = W_train.apply(lambda w: 1 if w.endswith(cntr.most_common(n_top)[i][0]) else 0)
        print(X_i)
    #X = W_train.apply()

generate_features(train['word'], n_top=1)

[('ость', 1357), ('ками', 1132), ('ание', 1072), ('нием', 959), ('ости', 934), ('ение', 855), ('ания', 768), ('стью', 710), ('ения', 617), ('иями', 561)]


KeyboardInterrupt: 

In [237]:
clf = LogisticRegression(penalty='l2', class_weight='balanced', max_iter=300, C=.1)
clf.fit(X, train['y'])
#roc_auc_score(test['y'], clf.predict(X_test))
roc_auc_score(train['y'], clf.predict(X))

0.92274761998943544

In [238]:
res = clf.predict_proba(X_test)
#res = np.array(res).max(axis=1)
res[:10]

array([[ 0.34198266,  0.65801734],
       [ 0.23194685,  0.76805315],
       [ 0.33672616,  0.66327384],
       [ 0.61904632,  0.38095368],
       [ 0.33843261,  0.66156739],
       [ 0.45350457,  0.54649543],
       [ 0.25894732,  0.74105268],
       [ 0.40797426,  0.59202574],
       [ 0.40797426,  0.59202574],
       [ 0.40461412,  0.59538588]])

In [240]:
submission = pd.read_csv("linear_ans_example.txt")
submission['Id'] = [i for i in range(len(res))]
submission['Answer'] = 1 - res
#submission.head(50)
submission.to_csv('logregr_ngram_2_5_suf_pref_len_c01_balanced.txt', index=False)
submission.head()

Unnamed: 0,Id,Answer
0,0,0.658017
1,1,0.768053
2,2,0.663274
3,3,0.380954
4,4,0.661567


In [182]:
# SGDClassifier
clf = SGDClassifier(penalty='l2', loss='log')
clf.fit(X, train['y'])
X_test = vectorizer.transform(test['word'])
test['y'] = clf.predict(X_test)

In [196]:
test2 = test.copy()
def check_in_train(df):
    if (df[0] in not_surnames_set):
        df[1] = 0
    else:
        if (df[0] in surnames_set):
            df[1] = 1
    return df

test2[['word', 'y']] = test2[['word', 'y']].apply(lambda row: check_in_train(row), axis=1)
test2

Unnamed: 0,word,y
0,﻿Аалто,0
1,ААР,0
2,Аара,0
3,Ааре,0
4,Аарон,0
5,Аароне,0
6,Ааронов,0
7,Аароном,0
8,Аароном,0
9,Аарону,0


In [197]:
test[test2['y'] != test['y']]

Unnamed: 0,word,y
545,АВГУСТ,0
891,автокаско,0
948,автоматов,0
1265,агент,0
1290,агентство,0
1433,агрегирования,0
1928,Азов,0
2474,акт,0
2608,акты,0
2838,алгоритмы,0


In [199]:
test2[test2['y'] != test['y']]

Unnamed: 0,word,y
545,АВГУСТ,1
891,автокаско,1
948,автоматов,1
1265,агент,1
1290,агентство,1
1433,агрегирования,1
1928,Азов,1
2474,акт,1
2608,акты,1
2838,алгоритмы,1


In [200]:
ans = test2.copy()
ans.drop('word', inplace=True, axis=1)
ans.columns = ['Answer']
ans.to_csv('SGD_3_8_l2_log_useknown.txt')

In [170]:
cross_val_score(RidgeClassifier(normalize=True), X, train['y'], 
                cv=ShuffleSplit(n_splits=3, test_size=0.6, random_state=0), 
                scoring='roc_auc').mean()

0.89222263755976072

In [172]:
cross_val_score(svm.SVC(), X, train['y'], 
                cv=ShuffleSplit(n_splits=3, test_size=0.6, random_state=0), 
                scoring='roc_auc').mean()

0.87390083894683634

In [171]:
from sklearn import svm

In [None]:
svm.SVC()