In [1]:
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
sns.set(font_scale=1.5)

import matplotlib.pyplot as plt
import matplotlib.style as style
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

  from pandas import Panel


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!unzip '/content/drive/MyDrive/dataset/introml2020-3.zip' -d data

Archive:  /content/drive/MyDrive/dataset/introml2020-3.zip
  inflating: data/test.csv           
  inflating: data/train.csv          


In [3]:
train = pd.read_csv('/content/data/train.csv', nrows=1000000)

In [4]:
train.sentence.head(5)

0    Pensez à la communication , le discours , les ...
1    Můžete si ji pronajmout , vzít na splátky , ko...
2    Každý starosta pochopil , že když mají tyto fo...
3    Det är ytterligare bevis , men ändå — Jag krit...
4                                  كان الأمر لا يصدق .
Name: sentence, dtype: object

In [5]:
train.shape

(1000000, 2)

In [6]:
train.drop_duplicates(inplace=True)
train.shape

(869705, 2)

In [7]:
sent1 = 'гэта што калі мы не яны як але ён каб дзякуй больш пра на вы'
sent2 = 'що це тому ми чи дуже які щоб дякую якщо було вона від мене ось'
sent3 = 'это что мы не как на вы они но из то он так для аплодисменты'
sent4 = '笑聲 掌聲 謝謝 所以 現在 事實上 當然 因此 謝謝大家 對吧 但是 鼓掌 謝謝各位 他說 我說'
sent = set([sent1, sent2, sent3, sent4])

In [8]:
train = train[train['sentence'].map(lambda x: not x in sent)]

In [9]:
train.sentence = train.sentence.str.lower()

In [10]:
# все символы/токены, создающие шум (non-related chars)
punc_list_0 = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
    ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`', '{',
    '}', '~', '/', '—', '。', '・', '„' , '”',
    '？', '：', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
    '，',
] # `|` is not present here
punc_list_1 = [
    '\\', '&quot;', 'TED', '&apos;', 'quot', 'apos',
]

In [11]:
transtab = str.maketrans(dict.fromkeys(punc_list_0, ''))
train['sentence'] = '|'.join(train['sentence'].tolist()).translate(transtab).split('|')

In [12]:
for token in punc_list_1:
    train['sentence'] = train['sentence'].str.replace(token, '')

In [13]:
import scipy.sparse as sp
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    CountVectorizer,
)
from sklearn.linear_model import LogisticRegression

In [14]:
X_train, y_train = train.sentence, train.language

In [15]:
labels = LabelEncoder()
y_train = labels.fit_transform(y_train)

In [16]:
tfidf = TfidfVectorizer(min_df=3)
tfidf_ngrams = TfidfVectorizer(analyzer='char', ngram_range=(1, 2), min_df=3)

In [17]:
X_train = sp.hstack(
    (tfidf.fit_transform(X_train), 
     tfidf_ngrams.fit_transform(X_train)), 
    format='csr')

In [18]:
X_train.shape

(869699, 463430)

In [19]:
baseline_model = LogisticRegression(
    C=1.0,
    solver='saga',
    multi_class='multinomial',
    max_iter=50,
    n_jobs=-1,
)

In [20]:
baseline_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=50,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
del train, X_train, y_train

In [22]:
test = pd.read_csv('/content/data/test.csv')
test.shape

(2784634, 2)

In [27]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=17, shuffle=False)

In [24]:
test.sentence = test.sentence.str.lower()

In [25]:
test['sentence'] = '|'.join(test['sentence'].tolist()).translate(transtab).split('|')

In [26]:
for token in punc_list_1:
    test['sentence'] = test['sentence'].str.replace(token, '')

In [28]:
prediction = np.zeros(len(test))
for _, test_index in tqdm(kf.split(test.sentence)):
    X_test = test.sentence[test_index]
    X_test = sp.hstack(
       (tfidf.transform(X_test), 
        tfidf_ngrams.transform(X_test)), 
        format='csr',
    )
    prediction[test_index] = baseline_model.predict(X_test)

17it [06:27, 22.77s/it]


In [37]:
df = pd.DataFrame()
df['index'] = np.arange(len(prediction))
df['language'] = labels.inverse_transform(prediction.astype(int))

In [40]:
df.to_csv('submission_0.csv', index=None)