In [31]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV


### Loading the data

In [2]:
data = pd.read_csv('train_test_data.csv')

These were the columns - ${\bf translated\_from\_uk, translated\_from\_en}$ - that represented whether the pages existed in both wikipedias, and if so - which one appeared faster (so to say, was it translated from Ukrainian or from English). They could lead to overfitting because there is obvious connection with the target variable "is_translated". Also, the sign of the variable ${\bf days\_to\_translate}$ gives too much information to the classifier, so we decided to drop it while training.

In [3]:
data.drop(columns=['Unnamed: 0', 'translated_from_uk', 'translated_from_en', 'days_to_translate'], inplace=True)

In [4]:
data.head()

Unnamed: 0,uk_page_title,uk_translations_count,uk_incoming_links,uk_langlinks_count,outcoming_links,uk_revisions_count,uk_minor_revisions_count,uk_deleted_revisions,outcoming_links_translated,is_translated
0,!_(альбом_С.К.А.Й.),0.0,31.0,0.0,40.0,43.0,13.0,0.0,0.0,0
1,!_(альбом),0.0,0.0,12.0,12.0,10.0,1.0,0.0,0.0,1
2,!_(значення),0.0,1.0,16.0,32.0,7.0,5.0,0.0,0.0,1
3,!!_(значення),0.0,0.0,17.0,18.0,5.0,0.0,0.0,0.0,1
4,!!!,24.0,8.0,25.0,36.0,28.0,20.0,0.0,36.0,1


In [5]:
X, y = data.iloc[:,:-1].copy(), data.iloc[:, -1].copy()

In [6]:
X.drop(columns = ['uk_page_title'], inplace=True)

In [7]:
X.fillna(0, inplace=True)

### Data normalisation

In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Splitting the data

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y.values, test_size=0.7, random_state=42)

In [10]:
sum_all = X_test.shape[0]

#### Function for the results

In [18]:
def results_report(y_test, y_pred):
    #Confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print('true negative:', round(tn * 100 / sum_all, 2), "%")
    print('false positive:', round(fp * 100 / sum_all, 2), '%')
    print('false negative:', round(fn * 100 / sum_all, 2), '%')
    print('true positive:', round(tp * 100 / sum_all, 2), '%')
    print('\n')
    
    #Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", round(accuracy * 100, 2), "%")
    print('\n')
    
    #Classification report
    print('Classification report:')
    print('\n')
    target_names = ['not translated', 'translated']
    print(classification_report(y_test, y_pred, target_names=target_names))

### XGBoost classifier

In [12]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [16]:
xgb_y_pred = xgb.predict(X_test)

  if diff:


In [19]:
results_report(y_test, xgb_y_pred)

true negative: 54.81 %
false positive: 3.94 %
false negative: 3.1 %
true positive: 38.15 %


Accuracy: 92.96 %


Classification report:


                precision    recall  f1-score   support

not translated       0.95      0.93      0.94    420573
    translated       0.91      0.92      0.92    295307

   avg / total       0.93      0.93      0.93    715880



### Logistic regression

In [20]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
log_y_pred = logistic.predict(X_test)

In [22]:
results_report(y_test, log_y_pred)

true negative: 55.86 %
false positive: 2.89 %
false negative: 8.73 %
true positive: 32.52 %


Accuracy: 88.38 %


Classification report:


                precision    recall  f1-score   support

not translated       0.86      0.95      0.91    420573
    translated       0.92      0.79      0.85    295307

   avg / total       0.89      0.88      0.88    715880



### Linear SVM

In [23]:
clf = svm.LinearSVC()
clf.fit(X_train, y_train)  

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [24]:
svm_y_pred = clf.predict(X_test)

In [25]:
results_report(y_test, svm_y_pred)

true negative: 56.47 %
false positive: 2.28 %
false negative: 12.32 %
true positive: 28.93 %


Accuracy: 85.4 %


Classification report:


                precision    recall  f1-score   support

not translated       0.82      0.96      0.89    420573
    translated       0.93      0.70      0.80    295307

   avg / total       0.86      0.85      0.85    715880



### Tuning XGBoost

In [32]:
clf = XGBClassifier(
                subsample=0.5,
                colsample_bytree=0.65,
                objective='reg:linear',
                n_estimators=999,
                learning_rate=0.1)

param_grid = {"max_depth": [3,4,5],
             #"gamma":list(np.linspace(0,0.1,num=5)),
             "min_child_weight":[2,4,6]}

grid_search = GridSearchCV(clf, param_grid=param_grid)
grid_search.fit(X_train, y_train)
print(grid_search.grid_scores_)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


[mean: 0.93244, std: 0.00029, params: {'max_depth': 3, 'min_child_weight': 2}, mean: 0.93225, std: 0.00050, params: {'max_depth': 3, 'min_child_weight': 4}, mean: 0.93232, std: 0.00039, params: {'max_depth': 3, 'min_child_weight': 6}, mean: 0.93421, std: 0.00022, params: {'max_depth': 4, 'min_child_weight': 2}, mean: 0.93399, std: 0.00033, params: {'max_depth': 4, 'min_child_weight': 4}, mean: 0.93386, std: 0.00010, params: {'max_depth': 4, 'min_child_weight': 6}, mean: 0.93565, std: 0.00015, params: {'max_depth': 5, 'min_child_weight': 2}, mean: 0.93556, std: 0.00024, params: {'max_depth': 5, 'min_child_weight': 4}, mean: 0.93548, std: 0.00020, params: {'max_depth': 5, 'min_child_weight': 6}]




In [36]:
print(grid_search.best_params_, grid_search.best_score_)

{'max_depth': 5, 'min_child_weight': 2} 0.9356464203647268


In [37]:
clf = XGBClassifier(max_depth=5,
                    min_child_weight = 2,
                subsample=0.5,
                colsample_bytree=0.65,
                objective='reg:linear',
                n_estimators=999,
                learning_rate=0.1)

param_grid = {"gamma":list(np.linspace(0,0.5,num=5))}

grid_search = GridSearchCV(clf, param_grid=param_grid)
grid_search.fit(X_train, y_train)


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.65,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=2, missing=None, n_estimators=999, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gamma': [0.0, 0.125, 0.25, 0.375, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [38]:
print(grid_search.best_params_, grid_search.best_score_)

{'gamma': 0.0} 0.9356464203647268


In [39]:
clf = XGBClassifier(max_depth=5,
                    min_child_weight = 2,
                    gamma = 0,
                objective='reg:linear',
                n_estimators=999,
                learning_rate=0.1)

param_grid = {"subsample":[0.5, 0.6, 0.7],
             "colsample_bytree":[0.6, 0.7, 0.8]}

grid_search = GridSearchCV(clf, param_grid=param_grid)
grid_search.fit(X_train, y_train)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=2, missing=None, n_estimators=999, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'colsample_bytree': [0.6, 0.7, 0.8], 'subsample': [0.5, 0.6, 0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [40]:
print(grid_search.best_params_, grid_search.best_score_)

{'colsample_bytree': 0.8, 'subsample': 0.6} 0.9359788790925833


In [41]:
clf = XGBClassifier(max_depth=5,
                    min_child_weight = 2,
                    gamma = 0,
                    colsample_bytree = 0.8,
                    subsample = 0.6,
                objective='reg:linear',
                n_estimators=999,
                learning_rate=0.1)

param_grid = {"reg_alpha": [0.01, 0.1]}

grid_search = GridSearchCV(clf, param_grid=param_grid)
grid_search.fit(X_train, y_train)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=2, missing=None, n_estimators=999, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.6),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'reg_alpha': [0.01, 0.1]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [42]:
print(grid_search.best_params_, grid_search.best_score_)

{'reg_alpha': 0.1} 0.9361874806473167


In [46]:
clf = XGBClassifier(max_depth=5,
                    min_child_weight = 2,
                    gamma = 0,
                    colsample_bytree = 0.8,
                    subsample = 0.6,
                    reg_alpha = 0.1,
                objective='reg:linear',
                n_estimators=999,
                learning_rate=0.1)


clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=2, missing=None, n_estimators=999, nthread=-1,
       objective='reg:linear', reg_alpha=0.1, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.6)

In [47]:
xgb_tuned_y_pred = clf.predict(X_test)

  if diff:


In [48]:
results_report(y_test, xgb_tuned_y_pred)

true negative: 54.78 %
false positive: 3.97 %
false negative: 2.43 %
true positive: 38.82 %


Accuracy: 93.6 %


Classification report:


                precision    recall  f1-score   support

not translated       0.96      0.93      0.94    420573
    translated       0.91      0.94      0.92    295307

   avg / total       0.94      0.94      0.94    715880



### Evaluation

In [49]:
data.shape

(1022685, 10)

In [50]:
not_translated = data[data['is_translated'] == 0].copy()
translated = data[data['is_translated'] == 1].copy()

In [51]:
not_translated.shape, translated.shape

((600252, 10), (422433, 10))

Such was the initial fraction of not_translated to translated.

In [67]:
y_test.reshape(-1).shape

(715880,)

In [69]:
testing = np.append(X_test, y_test[:, None], axis = 1)

In [75]:
testing.shape

(715880, 9)

In [89]:
not_translated_test = testing[np.where(testing[:,-1] == 0), :][0][:,:-1]

In [90]:
not_translated_test.shape

(420573, 8)

In [93]:
clf.predict_proba(not_translated_test)

array([[ 9.7321486e-01,  2.6785135e-02],
       [ 1.0175729e+00, -1.7572939e-02],
       [ 9.9949604e-01,  5.0395727e-04],
       ...,
       [ 9.9847233e-01,  1.5276372e-03],
       [ 9.9949604e-01,  5.0395727e-04],
       [ 9.9949604e-01,  5.0395727e-04]], dtype=float32)