In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.grid_search import GridSearchCV
import re
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import xgboost as xgb
pd.options.mode.chained_assignment = None

In [4]:
data = pd.read_csv("train.csv")
df_majority = data[data.Label==0]
df_minority = data[data.Label==1]
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
# df_majority_downsampled = resample(df_majority,
#                                   replace=True,
#                                   n_samples=df_minority.shape[0],
#                                   random_state=123)
data = pd.concat([df_majority, df_minority_upsampled])
size = 3000
a = resample(df_minority,
             replace=True,
             n_samples=size,
             random_state=123)
b = resample(df_majority,
            replace=True,
            n_samples=size,
            random_state=123)
# data = pd.concat([df_minority, df_majority_downsampled])
data = pd.concat([a,b])
data = data.sample(frac=1)
data = data.reset_index(drop=False).drop(["index"], axis=1)


In [3]:
# data["Word"] = data["Word"].apply(lambda x: x.lower())

vowel = u'уеёыаоэяию'
consonant = u'йцкнгшщзхъфвпрлджчсмтьб'
ru_letter = vowel + consonant + u"-’'"
last_letter = list(set(data["Word"].apply(lambda x: x[-1]).get_values()))
last_2_letter = list(set(data["Word"].apply(lambda x: x[-2:]).get_values()))
last_3_letter = list(set(data["Word"].apply(lambda x: x[-3:]).get_values()))

def count_vow(word):
    vow = 0
    for i in word:
        if i in vowel:
            vow += 1
    return vow

def count_con(word):
    cons = 0
    for i in word:
        if i in consonant:
            cons += 1
    return cons

def is_only_ru(word):
    return len(set(word) - set(ru_letter)) == 0


def last_letters(data):
    for letter in  last_letter:
        data["last1" + letter] = data["Word"].apply(lambda x: x[-1] == letter)
    for letter in last_2_letter:
        data["last2" + letter] = data["Word"].apply(lambda x: x[-2:] == letter)
#     for letter in last_3_letter:
#         data["last2" + letter] = data["Word"].apply(lambda x: x[-3:] == letter)
    return data

In [4]:
def add_onotoles_old_features(data):
    data = data[data["Word"].apply(lambda x: is_only_ru(x))==1]
    data["len"] = data["Word"].apply(lambda x: len(x))
    data["is_len_more_11"] = data["len"].apply(lambda x: x > 11)
    data["vow_count"] = data["Word"].apply(lambda x: count_vow(x))
    data["cons_count"] = data["Word"].apply(lambda x: count_con(x))
    data["last_vow"] = data["Word"].apply(lambda x: x[-1] in vowel)
    data["prelast_vow"] = data["Word"].apply(lambda x: len(x) > 2 and x[-2] in vowel)
    data["ohara"] = data["Word"].apply(lambda x: x[:2] == "о'")
    data = last_letters(data)
    return data

In [8]:
X = data["Word"]
y = data["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2,7))),
                     ('tfidf', TfidfTransformer(use_idf=True)),
#                      ('clf', xgb.XGBClassifier(colsample_bytree=0.97, max_depth=10, n_estimators=90, subsample=0.97))
                     ('clf', SGDClassifier(alpha=1e-4, penalty='l2'))#alpha=1e-4, penalty='l2', class_weight='balanced')),
                    ])
# print( cross_val_score(text_clf, X=X, y=y, scoring='roc_auc').mean())
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(roc_auc_score(y_pred, y_test))

0.786957779626173




In [19]:
vect = CountVectorizer(analyzer='char_wb', ngram_range=(2,7))
tfidf = TfidfTransformer(use_idf=True)
clf = LogisticRegression(C=2, class_weight='balanced')

count_transformer = vect.fit(X_train)
counts = count_transformer.transform(X_train)
tfidf_transformer = tfidf.fit(counts)
data_tfidf = tfidf_transformer.transform(counts)
clf.fit(data_tfidf, y_train)

c_2 = count_transformer.transform(X_test)
tfidf_2 = tfidf_transformer.transform(c_2)

y_pred = clf.predict(tfidf_2)
print(roc_auc_score(y_pred, y_test))

0.7908117305575018


In [21]:
y_pred = clf.predict_proba(tfidf_2)
y_pred

array([[0.71650913, 0.28349087],
       [0.50270216, 0.49729784],
       [0.11621142, 0.88378858],
       ...,
       [0.83080795, 0.16919205],
       [0.23931248, 0.76068752],
       [0.27013066, 0.72986934]])

In [None]:
text_clf = text_clf.fit(X, y)
test = pd.read_csv("test.csv")
test = test["Word"].apply(lambda x: x.lower())
pred = text_clf.predict(test)
pd.Series(pred).to_csv("submission.csv")

In [None]:
pd.Series(pred).value_counts()

In [None]:
# {'vect__ngram_range': (2, 7), 'clf__alpha': 1e-05}
# 0.896830120806
# parameters = {'vect__ngram_range': [(2,7)],
# #               'tfidf__use_idf': True,
#               'clf__alpha': [1e-4],
#               'clf__penalty': ['l2', 'l1'],
#               'clf__class_weight': ['balanced']
#              }
parameters = {
    'clf__n_jobs': [-1],
    'clf__max_depth': [22, 25, 30, 40, 50, 70, 100, 120, 130],
    'clf__n_estimators' : [40, 50, 60, 100, 200]
#     'clf__max_features' : ['log2', None, 'sqrt']
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1, scoring='roc_auc')
gs_clf = gs_clf.fit(X, y)
print(gs_clf.best_params_)
print(gs_clf.best_score_)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 83.8min


In [None]:
data = add_features(data)
X = data.drop(["Label", "Word"], axis=1)
y = data["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
regr = xgb.XGBClassifier(colsample_bytree=0.97, max_depth=10, n_estimators=90, subsample=0.97)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print(roc_auc_score(y_pred, y_test.get_values()))

In [None]:
# classifier = xgb.XGBClassifier()
# # {'max_depth': 11, 'subsample': 0.9}
# # {'colsample_bytree': 0.95, 'max_depth': 10, 'n_estimators': 80, 'subsample': 0.97}
# # 0.7825313337289386
# params = {
#     'max_depth': [9, 10, 11],
#     'subsample': [0.97, 0.99, 1],
#     'n_estimators': [70, 80, 90],
#     'colsample_bytree': [0.95, 0.97, 1]
# }
# grid_search = GridSearchCV(classifier, params, n_jobs=-1, verbose=1, scoring='roc_auc')
# grid_search.fit(X, y)
# print(grid_search.best_params_)
# print(grid_search.best_score_)

In [None]:
# classifier = RandomForestClassifier()
# params = {
#     'max_depth': [22, 25, 30, 40, 50, 70, 100],
#     'n_estimators' : [40, 50, 60, 100],
#     'max_features' : ['log2', None, 'sqrt']
# }
# grid_search = GridSearchCV(classifier, params, n_jobs=-1, verbose=1, scoring='roc_auc')
# grid_search.fit(X, y)
# print(grid_search.best_params_)
# print(grid_search.best_score_)

In [None]:
test = pd.read_csv("test.csv")
test["Word"] = test["Word"].apply(lambda x: x.lower())
test = add_features(test).drop(["Word"], axis=1)
pred = regr.predict(test)
# test
pred

In [None]:
pd.Series(pred).to_csv("submission_xgboost.csv")

In [25]:
pd.Series([1,2,3]).to_csv("test.csv", index_label=["Id","Prediction"], header=True)

In [26]:
! cat test.csv

Id,Prediction,0
0,1
1,2
2,3
