In [None]:
import numpy as np
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics

!pip install catboost
from catboost import CatBoostClassifier


from google.colab import drive 

# Mount Google Drive at /content/drive; every data, model and submission path
# used in this notebook lives under that mount point.
drive.mount('/content/drive')

In [3]:
# Load the training table; it carries the 'target' column that is later split
# off into y.
raw_data = pd.read_csv('/content/drive/MyDrive/hacks-ai/orenburg/data/train_dataset_train.csv')

In [4]:
def digitize_strings(data, column_names):
    """Replace the values of the given columns with integer codes, in place.

    For every column the distinct values are ordered by their string
    representation and numbered from 1, so the same set of distinct values
    always yields the same codes. Sorting avoids depending on ``set``
    iteration order, which for strings can change between interpreter
    sessions because of hash randomization.

    Codes are written back by row position, so the frame may carry any index.
    Each column is encoded independently; two frames only receive matching
    codes when their columns hold the same set of distinct values.

    Args:
        data: pandas DataFrame to modify in place.
        column_names: iterable of column labels to encode.

    Returns:
        None. Every listed column of ``data`` is replaced by its integer codes.
    """
    for column_name in column_names:
        # Materialize once so that de-duplication and lookup see the very same
        # objects (matters for NaN, which only matches itself by identity).
        raw_values = list(data[column_name])

        # dict.fromkeys de-duplicates while keeping first-appearance order;
        # the stable sort then keeps ties (equal str()) deterministic as well.
        distinct_values = sorted(dict.fromkeys(raw_values), key=str)
        values_codes = {value: code for code, value in enumerate(distinct_values, start=1)}

        # Assigning a plain list is positional, unlike data.at[idx, ...],
        # which interprets idx as an index label.
        data[column_name] = [values_codes[value] for value in raw_values]


def _date_to_day_number(value):
    """Turn one 'a/b/c' string into ``c * 365 + b * 30 + a``; return 0 otherwise.

    The weights suggest a day/month/year layout (presumably; confirm against
    the data). Non-string cells, such as missing values, and strings that do
    not split into exactly three '/'-separated fields give 0.
    """
    if not isinstance(value, str):
        return 0
    fields = value.split('/')
    if len(fields) != 3:
        return 0
    return int(fields[2]) * 365 + int(fields[1]) * 30 + int(fields[0])


def digitize_dates(data, column_names):
    """Replace '/'-separated date strings with approximate day counts, in place.

    Every cell of the listed columns is converted with ``_date_to_day_number``.
    Values are written back by row position, so the frame may carry any index.
    Missing (non-string) cells become 0 instead of raising ``AttributeError``.

    Args:
        data: pandas DataFrame to modify in place.
        column_names: iterable of column labels holding date strings.

    Returns:
        None. Every listed column of ``data`` is replaced by integers.

    Raises:
        ValueError: if a three-field value contains a non-integer field.
    """
    for column_name in column_names:
        # List assignment is positional; data.at[idx, ...] would treat the
        # enumerate() position as an index label.
        data[column_name] = [_date_to_day_number(value) for value in data[column_name]]


def calc_accuracy(y_pred, y_target):
    """Return the share of positions at which prediction and target agree.

    Both arguments are indexed with 0..len(y_pred)-1. The result is the
    number of matching positions divided by ``len(y_pred)``, so an empty
    ``y_pred`` raises ``ZeroDivisionError``.
    """
    total = len(y_pred)
    hits = sum(1 for position in range(total) if y_pred[position] == y_target[position])
    return hits / total


In [5]:
# Columns removed from the raw frames before they are used as features.
dropped_features = [
    'spent_time_to_complete_hw',
    'failed_hw',
    'month_id',
    'avg_quiz_result',
    'city',
    'hw_leader',
]

# Columns handed to digitize_strings.
string_features = [
    'promo',
    'country',
    'communication_type',
    'ABC',
    'os',
    'browser',
    'platform',
]

# Columns handed to digitize_dates.
date_features = [
    'carts_created_at',
]

In [6]:
# Feature matrix: the raw frame without the discarded columns and the target.
X = raw_data.drop(columns=[*dropped_features, 'target'])

# Both helpers rewrite the listed columns of X in place.
digitize_strings(X, string_features)
digitize_dates(X, date_features)

# Labels come straight from the raw frame.
y = raw_data['target']

In [7]:
# Hold out 30% of the rows for validation; the fixed random_state keeps the
# shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [10]:
# Training configuration gathered in one mapping so it can be inspected or
# tweaked before the classifier is built.
catboost_params = dict(
    iterations=17000,
    learning_rate=0.8,
    task_type='GPU',
    devices='0:1',
)
model = CatBoostClassifier(**catboost_params)

In [None]:
# Fit on the training split. Snapshotting (save_snapshot=True,
# snapshot_interval=10) is left switched off.
model.fit(X_train, y_train)

# Predictions for both splits, to compare fit quality against generalization.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("Train accuracy: ", calc_accuracy(y_pred_train, y_train.values))
print("Valid accuracy:", calc_accuracy(y_pred_test, y_test.values))

# Weighted blend of macro-averaged recall (0.2) and precision (0.8) on the
# validation split.
macro_recall = metrics.recall_score(y_test.values, y_pred_test, average='macro')
macro_precision = metrics.precision_score(y_test.values, y_pred_test, average='macro')
print(0.2 * macro_recall + 0.8 * macro_precision)


In [None]:
# Persist the fitted model to Drive as a .cbm file.
model.save_model('/content/drive/MyDrive/hacks-ai/orenburg/models/classifier_v13_catboost.cbm')

In [None]:
# Load the table to predict on; later cells read its 'id' column and drop the
# same discarded columns as for training.
raw_test_data = pd.read_csv('/content/drive/MyDrive/hacks-ai/orenburg/data/test_dataset_test.csv')

In [None]:
X_solution = raw_test_data.drop(columns=dropped_features)

# digitize_strings numbers the values of whatever frame it is given, so running
# it on the test frame would assign codes independently of the training frame
# and the model would see codes with a different meaning. Instead, recover the
# value -> code pairs used for training (raw_data holds the original strings,
# X the codes, row for row) and apply them here.
for column_name in string_features:
    train_codes = dict(zip(raw_data[column_name], X[column_name]))
    # Training codes start at 1, so 0 marks a value never seen in training.
    X_solution[column_name] = [train_codes.get(value, 0) for value in raw_test_data[column_name]]

digitize_dates(X_solution, date_features)

In [None]:
# Predict a label for every row of the submission feature matrix.
y_solution = model.predict(X_solution)

In [None]:
# Submission frame: the 'id' column of the test table plus the predicted target.
solution = raw_test_data.filter(items=['id']).assign(target=y_solution)

# Written as integers, without the frame index.
submission_path = '/content/drive/MyDrive/hacks-ai/orenburg/solutions/solution_v13_catboost_9458-8416.csv'
solution.astype(int).to_csv(submission_path, index=False)