In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import auc
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import collections
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the train and test sets. header=None makes pandas label the columns
# with integers (0, 1, ...) instead of taking names from the first row.
TRAIN_PATH = 'linear_train.txt'
TEST_PATH = 'linear_test.txt'
data = pd.read_csv(TRAIN_PATH, header=None)
tst = pd.read_csv(TEST_PATH, header=None)

In [3]:
def feature_fabric(data):
    """Add string-derived feature columns to ``data`` in place.

    The strings are read from column ``0``. The following columns are added:

    * ``is_first_upper`` -- 1.0 when the first character is upper-case but
      the string as a whole is not, otherwise 0.0;
    * ``last_2chars`` .. ``last_5chars`` -- the trailing 2 to 5 characters of
      the lower-cased string (the whole string when it is shorter);
    * one column per letter in the 32 consecutive code points starting at
      'а' (i.e. 'а'..'я'; 'ё' lies outside that range) holding how many
      times the letter occurs in the lower-cased string.

    Column ``0`` itself is overwritten with its lower-cased version.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column ``0`` holds strings. It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Not idempotent: a second call sees the already lower-cased column ``0``
    and therefore resets ``is_first_upper`` to 0.0 everywhere.
    """
    # Slice (x[:1]) instead of indexing (x[0]) so an empty string yields 0.0
    # rather than raising IndexError.
    data['is_first_upper'] = data[0].map(
        lambda x: float(x[:1].isupper() and not x.isupper()))
    data[0] = data[0].map(lambda x: x.lower())
    for i in range(2, 6):
        # A negative slice already returns the whole string when it is
        # shorter than i, so no explicit length check is needed.
        data['last_' + str(i) + 'chars'] = data[0].map(lambda x: x[-i:])
    for i in range(32):
        letter = chr(i + ord('а'))
        # Count directly on the string; no temporary column of Counter
        # objects has to be created and dropped again.
        data[letter] = data[0].map(lambda x: x.count(letter))

In [4]:
# Enrich the train frame first, then the test frame; feature_fabric mutates
# its argument in place, so nothing is reassigned here.
for frame in (data, tst):
    feature_fabric(frame)

In [5]:
# Suffix columns that take part in the categorical encoding (last 2, 3 and 4
# characters; last_5chars is not used here).
ending_columns = ['last_' + str(i) + 'chars' for i in np.arange(2, 5)]

# Endings are pooled across the three columns of each frame, and only those
# present in BOTH the test and the train pool are kept.
# `.values.ravel()` replaces `DataFrame.as_matrix().reshape(...)`:
# `as_matrix` was removed in pandas 1.0, while `.values` exists in old and
# new releases alike and yields the same flattened array of strings.
active_endings = set(tst[ending_columns].values.ravel())
active_endings = active_endings.intersection(set(data[ending_columns].values.ravel()))

In [6]:
# Every ending that is not in `active_endings` is collapsed into the single
# placeholder value '  A', in both frames.
for suffix_len in np.arange(2, 5):
    column = 'last_' + str(suffix_len) + 'chars'
    for frame in (data, tst):
        frame[column] = frame[column].map(
            lambda ending: ending if ending in active_endings else '  A')

In [7]:
# Keep an independent copy of the frame while the suffix columns still hold
# strings; a later cell overwrites them with integer labels.
oldata = data.copy(deep=True)
data.head(5)

Unnamed: 0,0,1,is_first_upper,last_2chars,last_3chars,last_4chars,last_5chars,а,б,в,...,ц,ч,ш,щ,ъ,ы,ь,э,ю,я
0,аалтонен,1,1.0,ен,нен,A,тонен,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,аар,0,1.0,ар,аар,аар,аар,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,аарон,0,1.0,он,рон,арон,аарон,2,0,0,...,0,0,0,0,0,0,0,0,0,0
3,аарон,0,0.0,он,рон,арон,аарон,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,аарона,0,1.0,на,она,рона,арона,3,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# A single label space shared by all three suffix columns: every active
# ending plus the '  A' placeholder gets one integer code.
encoder1 = LabelEncoder().fit(list(active_endings) + ['  A'])
for column in ['last_%dchars' % n for n in (2, 3, 4)]:
    data[column] = encoder1.transform(data[column])
    tst[column] = encoder1.transform(tst[column])

In [9]:
# One-hot encode the three integer-coded suffix columns. The encoder is
# fitted on the training frame only and then reused for the test frame.
# sparse=False asks for a dense array instead of a sparse matrix.
# NOTE(review): the `sparse` keyword was renamed `sparse_output` in
# scikit-learn 1.2 -- confirm the version this notebook is pinned to.
onehot_inputs = ['last_{}chars'.format(n) for n in range(2, 5)]
encoder = OneHotEncoder(sparse=False)
new_features_train = encoder.fit_transform(data[onehot_inputs])
new_features_test = encoder.transform(tst[onehot_inputs])

In [10]:
# Wrap both encoded matrices in DataFrames so their columns can be labelled
# and further feature columns can be attached.
new_features_train, new_features_test = map(
    pd.DataFrame, (new_features_train, new_features_test))

In [11]:
# Label the one-hot columns of both frames with the encoder's active feature
# indices.
# NOTE(review): `active_features_` was deprecated in scikit-learn 0.20 and
# removed in 0.22 -- confirm the version this notebook is pinned to.
onehot_column_ids = encoder.active_features_
new_features_train.columns = onehot_column_ids
new_features_test.columns = onehot_column_ids

In [12]:
# Append the 32 per-letter count columns and then the capitalisation flag to
# both feature frames, after the one-hot columns.
letter_columns = [chr(ord('а') + offset) for offset in range(32)]
for letter in letter_columns:
    new_features_train[letter] = data[letter]
    new_features_test[letter] = tst[letter]
for target, source in ((new_features_train, data), (new_features_test, tst)):
    target['is_first_upper'] = source['is_first_upper']

In [13]:
# Preview the first five rows of the assembled training features.
new_features_train.head(5)

Unnamed: 0,0,1,16,29,30,33,35,36,51,107,...,ч,ш,щ,ъ,ы,ь,э,ю,я,is_first_upper
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1.0


In [14]:
# Show the first 100 rows of the training frame; the suffix columns now hold
# integer labels instead of strings.
data.head(100)

Unnamed: 0,0,1,is_first_upper,last_2chars,last_3chars,last_4chars,last_5chars,а,б,в,...,ц,ч,ш,щ,ъ,ы,ь,э,ю,я
0,аалтонен,1,1.0,4370,9717,0,тонен,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,аар,0,1.0,922,49,49,аар,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,аарон,0,1.0,11035,13051,1016,аарон,2,0,0,...,0,0,0,0,0,0,0,0,0,0
3,аарон,0,0.0,11035,13051,1016,аарон,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,аарона,0,1.0,9470,11036,13052,арона,3,0,0,...,0,0,0,0,0,0,0,0,0,0
5,аарона,1,1.0,9470,11036,13052,арона,3,0,0,...,0,0,0,0,0,0,0,0,0,0
6,аароне,0,1.0,9671,11058,13053,ароне,2,0,0,...,0,0,0,0,0,0,0,0,0,0
7,ааронов,0,1.0,10391,9954,11096,ронов,2,0,1,...,0,0,0,0,0,0,0,0,0,0
8,аахена,0,1.0,9470,4371,16407,ахена,3,0,0,...,0,0,0,0,0,0,0,0,0,0
9,абабков,1,1.0,10391,7491,1745,абков,2,2,1,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Zero out a single one-hot column of the test features.
# NOTE(review): 4953 is an unexplained positional index into
# `encoder.active_features_`, and the execution count jumps from 14 to 21, so
# the cells that motivated this choice are not in the notebook -- confirm why
# this column is blanked.
zeroed_column = encoder.active_features_[4953]
new_features_test[zeroed_column] = new_features_test[zeroed_column] * 0.

In [22]:
# Design matrix: the assembled training features. Target: column 1 of the
# training frame.
X, y = new_features_train, data[1]

In [None]:
# L1-penalised logistic regression, scored with 5-fold cross-validated ROC AUC.
# solver='liblinear' is spelled out because the default solver became 'lbfgs'
# in scikit-learn 0.22, and 'lbfgs' rejects penalty='l1'. liblinear was the
# default before that, so this is the solver the original call relied on.
# random_state pins liblinear's data shuffling so the scores are repeatable.
algo = LogisticRegression(penalty='l1', C=0.1, solver='liblinear', random_state=0)
arr = cross_val_score(algo, X, y, cv=5, scoring='roc_auc')
arr