In [None]:
import pandas as pd
from sklearn.preprocessing  import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook

In [None]:
# Give Colab access to Google Drive so that the dataset can be loaded from it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the train and test CSV files from the mounted Drive folder.
data_train, data_test = map(
    pd.read_csv,
    (
        "/content/drive/My Drive/Colab Notebooks/train.csv",
        "/content/drive/My Drive/Colab Notebooks/test.csv",
    ),
)

Реализуем пайплайн из TfidfVectorizer и SGDClassifier с логистической функцией потерь. Для улучшения модели используем бэггинг.

Так как данных чрезвычайно много, для ускорения вычислений используем только половину датасета: три непересекающиеся части по 1/6 каждая, по одной на каждую модель для бэггинга.

In [None]:
# Keep the first sixth of the training rows as the subset for this model.
part = data_train.head(data_train.shape[0] // 6)

In [None]:
# Column 0 of the subset is used as the model input, column 1 as the target.
# The fitted encoder is kept so that predicted class ids can be mapped back
# to the original labels later.
label_encoder = LabelEncoder()
x_train = part.values[:, 0]
y_train = label_encoder.fit_transform(part.values[:, 1])

In [None]:
# Character n-gram TF-IDF features feeding a linear classifier that is trained
# with SGD on the logistic loss.
pipe = Pipeline(
    steps=[
        (
            'vectorizer',
            TfidfVectorizer(
                lowercase=True,
                ngram_range=(3, 5),
                analyzer='char',
                min_df=5,
                max_df=0.5,
            ),
        ),
        (
            'model',
            SGDClassifier(
                # 'log_loss' is the current name of the logistic loss; the old
                # 'log' alias was deprecated in scikit-learn 1.1 and removed in 1.3.
                loss='log_loss',
                class_weight='balanced',
                # SGD shuffles the training data, so pin the seed to make
                # reruns of the notebook produce the same model.
                random_state=42,
            ),
        ),
    ],
    verbose=True,
)

In [None]:
# Fit the vectorizer and then the classifier on the training subset. In the
# recorded run below this took about 3.2 min for the vectorizer step and
# 6.1 min for the model step.
pipe.fit(x_train, y_train)

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total= 3.2min
[Pipeline] ............. (step 2 of 2) Processing model, total= 6.1min


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=None,
                                 min_df=5, ngram_range=(3, 5), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                 SGDClassifier(alpha=0.0001, average=False,
                               class_weight='balanced', early_stopping=False,
                               epsilon=0.1, eta0=0.0, fit_intercept=True,
                               l1_ratio=0.15, learning_rate='optimal',
                   

In [None]:
# Predict encoded class ids for the test sentences.
submit_predictions = pipe.predict(data_test.sentence.values)

# Build the submission as a new frame instead of writing a column into
# data_test: the in-place assignment is what triggered the recorded
# SettingWithCopyWarning, and it also mutated the loaded test data.
# Indexing classes_ with the predicted ids maps them back to the labels.
submission = data_test[['index']].assign(
    language=label_encoder.classes_[submit_predictions]
)
submission.to_csv('language_predicted.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Далее аналогичным образом обучаем модель на второй и третьей 1/6 датасета.

Получаем три файла с предсказаниями — lang1.csv, lang2.csv, lang3.csv.

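Следующий этап — голосование: если хотя бы два из трёх предсказаний совпадают, берём совпавший язык; если все три различны, выбираем тот из них, который чаще встречается в обучающей выборке. Реализуем следующим образом: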

In [None]:
# Number of training rows per language label, with the labels moved out of
# the index into a regular column.
lang_count = data_train['language'].value_counts().reset_index()

lang_count.head()

In [None]:
# Map every language label to its share of the training rows; this is the
# tie-breaker used when all three models disagree.
# Computed straight from data_train: the previous loop read a 'lang1' column
# that value_counts() on the training labels never produces, and the names of
# the columns created by reset_index() differ between pandas versions.
lang_freq_dict = (
    data_train['language'].value_counts() / data_train.shape[0]
).to_dict()

In [None]:
def voting(a, b, c, d):
    """Pick one label out of three predictions.

    A label that occurs at least twice among ``a``, ``b`` and ``c`` wins.
    When all three differ, the label with the largest value in ``d`` is
    returned; on equal values the earlier argument takes precedence
    (``a`` before ``b`` before ``c``).

    Args:
        a, b, c: the three candidate labels.
        d: mapping from label to a comparable weight; only consulted when
            all three labels differ.

    Returns:
        The selected label (one of ``a``, ``b``, ``c``).
    """
    if a == b or a == c:
        return a
    if b == c:
        return b
    # max() keeps the first of several equal maxima, which gives the
    # a > b > c precedence on ties.
    return max((a, b, c), key=lambda label: d[label])

In [None]:
def summ_vote(df1, df2, df3, d):
    """Combine three prediction frames row by row with ``voting``.

    Rows are matched by position, so the three frames must list the same
    samples in the same order.

    Args:
        df1, df2, df3: frames with a 'language' column.
        d: mapping handed to ``voting`` as its tie-breaker.

    Returns:
        A new frame equal to ``df1`` except that 'language' holds the voted
        label for each row. The inputs are not modified.

    Raises:
        ValueError: if the frames do not have the same number of rows.
    """
    n_rows = df1.shape[0]
    if not (n_rows == df2.shape[0] == df3.shape[0]):
        raise ValueError(
            "prediction frames must have the same number of rows, got "
            f"{n_rows}, {df2.shape[0]} and {df3.shape[0]}"
        )

    # Collect the votes in one pass and attach them with assign(); the former
    # per-cell chained assignment (df['language'][i] = ...) raises
    # SettingWithCopyWarning and writes nothing under pandas Copy-on-Write.
    candidates = zip(df1['language'], df2['language'], df3['language'])
    voted = [
        voting(a, b, c, d)
        for a, b, c in tqdm_notebook(candidates, total=n_rows)
    ]
    return df1.assign(language=voted)

In [None]:
# Load the prediction files produced by the three separately trained models.
lang1, lang2, lang3 = map(
    pd.read_csv,
    (
        "/content/drive/My Drive/Colab Notebooks/lang1.csv",
        "/content/drive/My Drive/Colab Notebooks/lang2.csv",
        "/content/drive/My Drive/Colab Notebooks/lang3.csv",
    ),
)

In [None]:
# Vote across the three prediction frames, then export only the two
# submission columns.
final_predict = summ_vote(lang1, lang2, lang3, lang_freq_dict)
final_predict.to_csv('predicted.csv', columns=['index', 'language'], index=False)