In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.utils import resample
from sklearn.grid_search import GridSearchCV
import re
import xgboost as xgb

In [29]:
data = pd.read_csv("train.csv")
df_majority = data[data.Label==0]
df_minority = data[data.Label==1]
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
df_majority_downsampled = resample(df_majority,
                                  replace=True,
                                  n_samples=df_minority.shape[0],
                                  random_state=123)
data = pd.concat([df_majority, df_minority_upsampled])
# size = 100000
# a = resample(df_minority,
#              replace=True,
#              n_samples=size,
#              random_state=123)
# b = resample(df_majority,
#             replace=True,
#             n_samples=size,
#             random_state=123)
# # data = pd.concat([df_minority, df_majority_downsampled])
# data = pd.concat([a,b])
data = data.sample(frac=1)
data = data.reset_index(drop=False).drop(["index"], axis=1)


In [30]:
data["Word"] = data["Word"].apply(lambda x: x.lower())
data = data[data["Word"].apply(lambda x: is_only_ru(x))==1]

vowel = u'уеёыаоэяию'
consonant = u'йцкнгшщзхъфвпрлджчсмтьб'
ru_letter = vowel + consonant + u"-’'"
last_letter = list(set(data["Word"].apply(lambda x: x[-1]).get_values()))
last_2_letter = list(set(data["Word"].apply(lambda x: x[-2:]).get_values()))
last_3_letter = list(set(data["Word"].apply(lambda x: x[-3:]).get_values()))

def count_vow(word):
    vow = 0
    for i in word:
        if i in vowel:
            vow += 1
    return vow

def count_con(word):
    cons = 0
    for i in word:
        if i in consonant:
            cons += 1
    return cons

def is_only_ru(word):
    return len(set(word) - set(ru_letter)) == 0


def last_letters(data):
    for letter in  last_letter:
        data["last1" + letter] = data["Word"].apply(lambda x: x[-1] == letter)
    for letter in last_2_letter:
        data["last2" + letter] = data["Word"].apply(lambda x: x[-2:] == letter)
#     for letter in last_3_letter:
#         data["last2" + letter] = data["Word"].apply(lambda x: x[-3:] == letter)
    return data

In [31]:
def add_features(data):
    data["len"] = data["Word"].apply(lambda x: len(x))
    data["is_len_more_11"] = data["len"].apply(lambda x: x > 11)
    data["vow_count"] = data["Word"].apply(lambda x: count_vow(x))
    data["cons_count"] = data["Word"].apply(lambda x: count_con(x))
    data["last_vow"] = data["Word"].apply(lambda x: x[-1] in vowel)
    data["prelast_vow"] = data["Word"].apply(lambda x: len(x) > 2 and x[-2] in vowel)
    data["ohara"] = data["Word"].apply(lambda x: x[:2] == "о'")
    data = last_letters(data)
    return data

In [32]:
data = add_features(data)
X = data.drop(["Label", "Word"], axis=1)
y = data["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
regr = xgb.XGBClassifier(colsample_bytree=0.97, max_depth=10, n_estimators=90, subsample=0.97)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print(roc_auc_score(y_pred, y_test.get_values()))

0.76144292942


In [33]:
# classifier = xgb.XGBClassifier()
# # {'max_depth': 11, 'subsample': 0.9}
# # {'colsample_bytree': 0.95, 'max_depth': 10, 'n_estimators': 80, 'subsample': 0.97}
# # 0.7825313337289386
# params = {
#     'max_depth': [9, 10, 11],
#     'subsample': [0.97, 0.99, 1],
#     'n_estimators': [70, 80, 90],
#     'colsample_bytree': [0.95, 0.97, 1]
# }
# grid_search = GridSearchCV(classifier, params, n_jobs=-1, verbose=1, scoring='roc_auc')
# grid_search.fit(X, y)
# print(grid_search.best_params_)
# print(grid_search.best_score_)

In [34]:
# classifier = RandomForestClassifier()
# params = {
#     'max_depth': [22, 25, 30, 40, 50, 70, 100],
#     'n_estimators' : [40, 50, 60, 100],
#     'max_features' : ['log2', None, 'sqrt']
# }
# grid_search = GridSearchCV(classifier, params, n_jobs=-1, verbose=1, scoring='roc_auc')
# grid_search.fit(X, y)
# print(grid_search.best_params_)
# print(grid_search.best_score_)

In [35]:
test = pd.read_csv("test.csv")
test["Word"] = test["Word"].apply(lambda x: x.lower())
test = add_features(test).drop(["Word"], axis=1)
pred = regr.predict(test)
# test
pred

array([1, 1, 1, ..., 0, 0, 0])

In [36]:
pd.Series(pred).to_csv("submission_xgboost.csv")