In [142]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.utils import resample
from sklearn.grid_search import GridSearchCV
import re
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [125]:
data = pd.read_csv("train.csv")
df_majority = data[data.Label==0]
df_minority = data[data.Label==1]
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
data = pd.concat([df_majority, df_minority_upsampled])

In [126]:
vowel = u'уеёыаоэяию'
consonant = u'йцкнгшщзхъфвпрлджчсмтьб'
ru_letter = vowel + consonant + "-’'"
last_letter = list(set(data["Word"].apply(lambda x: x[-1]).get_values()))
last_2_letter = list(set(data["Word"].apply(lambda x: x[-2:]).get_values()))

def count_vow(word):
    vow = 0
    for i in word:
        if i in vowel:
            vow += 1
    return vow

def count_con(word):
    cons = 0
    for i in word:
        if i in consonant:
            cons += 1
    return cons

def is_only_ru(word):
    return len(set(word) - set(ru_letter)) == 0


def last_letters(data):
    for letter in  last_letter:
        data["last1" + letter] = data["Word"].apply(lambda x: x[-1] == letter)
    for letter in last_2_letter:
        data["last2" + letter] = data["Word"].apply(lambda x: x[-2:] == letter)
    return data

In [127]:
def add_features(data):
    data["Word"] = data["Word"].apply(lambda x: x.lower())#.decode('utf-8').lower())
    data["len"] = data["Word"].apply(lambda x: len(x))
    data["vow_count"] = data["Word"].apply(lambda x: count_vow(x))
    data["cons_count"] = data["Word"].apply(lambda x: count_con(x))
    data["last_vow"] = data["Word"].apply(lambda x: x[-1] in vowel)
    data["prelast_vow"] = data["Word"].apply(lambda x: len(x) > 2 and x[-2] in vowel)
    data["only_ru"] = data["Word"].apply(lambda x: is_only_ru(x))
#     last_2_letter = pd.get_dummies(data["Word"].apply(lambda x: x[-2:]), prefix="last2")
#     last_1_letter = pd.get_dummies(data["Word"].apply(lambda x: x[-1:]), prefix="last1")
#     data = pd.concat([data,last_2_letter,last_1_letter], axis=1)
    data = last_letters(data)
    return data

In [133]:
data = add_features(data)
X = data.drop(["Label", "Word"], axis=1)
y = data["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
regr = SGDClassifier()
regr = RandomForestClassifier(max_depth=3)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print(roc_auc_score(y_pred, y_test.get_values()))

0.706467681283


In [141]:
classifier = RandomForestClassifier()
params = {
    'max_depth': [22, 25, 30, 40,50 ],
    'n_estimators' : [40, 50, 60]
}
grid_search = GridSearchCV(classifier, params, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  6.0min finished


{'max_depth': 50, 'n_estimators': 60}
0.7926004336066128


In [131]:
test = add_features(pd.read_csv("test.csv")).drop(["Word"], axis=1)
pred = regr.predict(test)
# test
pred

array([0, 0, 0, ..., 0, 0, 0])

In [132]:
pd.Series(pred).to_csv("submission.csv")