In [2]:
from google.colab import drive
import pandas as pd
from catboost import CatBoostClassifier
from google.colab import files

In [3]:
# Make Google Drive visible inside the Colab runtime.
drive.mount('/content/drive')

# Locations of the input CSV files, both stored in the same Drive folder.
data_dir = '/content/drive/My Drive/machinelearning'
train_path = f'{data_dir}/train.csv'
test_path = f'{data_dir}/test.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load the data: read a leading slice of the training file, then subsample it.
sample_fraction = 0.1        # share of the loaded training rows kept for fitting
max_train_rows = 5_000_000   # only the first rows of train.csv are read
train = pd.read_csv(train_path, nrows=max_train_rows)
test = pd.read_csv(test_path)

# In the recorded run the ids were parsed as float64 (they print as ~1.37e+19).
# float64 cannot represent ~20-digit integers exactly, so ids formatted from
# that column are rounded and no longer match test.csv. Read the test ids a
# second time as plain text and use those for the submission.
# `test['id']` itself is left untouched so later cells that read it keep working.
test_ids = pd.read_csv(test_path, usecols=['id'], dtype={'id': str})['id']

# Random subsample of the loaded training rows (seeded, so it is repeatable).
train_sample = train.sample(frac=sample_fraction, random_state=1)

print(train_sample.head())
print(test.head())

# Replace the raw 'hour' column (parsed with format %y%m%d%H) by two features:
# 'day' (day of month, appended as a new column) and 'hour' (hour of day).
for frame in (train_sample, test):
    timestamp = pd.to_datetime(frame['hour'], format='%y%m%d%H')
    frame['day'] = timestamp.dt.day
    frame['hour'] = timestamp.dt.hour

# Feature matrix and target.
X = train_sample.drop(['id', 'click'], axis=1)
y = train_sample['click']

X_test = test.drop(['id'], axis=1)

# Columns with object/category dtype are passed to CatBoost as categorical.
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

model = CatBoostClassifier(iterations=1000, depth=10, learning_rate=0.1, loss_function='Logloss', verbose=200)

model.fit(X, y, cat_features=cat_features)

# Column 1 of predict_proba is used as the predicted click probability.
predictions = model.predict_proba(X_test)[:, 1]

# Write the submission with the exact (string) ids, in test.csv row order.
submission = pd.DataFrame({'id': test_ids, 'click': predictions})
submission.to_csv('submission.csv', index=False)

files.download('submission.csv')

                   id  click      hour    C1  banner_pos   site_id  \
1903699  1.371458e+19      0  14102109  1005           0  1fbe01fe   
1139078  6.578865e+18      0  14102105  1005           0  85f751fd   
1466517  1.247935e+19      1  14102107  1005           0  1fbe01fe   
1723903  1.640883e+19      1  14102108  1005           1  e151e245   
4876944  1.778986e+19      0  14102205  1002           0  9e8e8d09   

        site_domain site_category    app_id app_domain  ... device_type  \
1903699    f3845767      28905ebd  ecad2386   7801e8d9  ...           1   
1139078    c4e18dd6      50e219e0  5fc17a6a   2347f47a  ...           1   
1466517    f3845767      28905ebd  ecad2386   7801e8d9  ...           1   
1723903    7e091613      f028772b  ecad2386   7801e8d9  ...           1   
4876944    16a36ef3      50e219e0  ecad2386   7801e8d9  ...           0   

        device_conn_type    C14  C15  C16   C17  C18  C19     C20  C21  
1903699                0  21725  320   50  2502    0   

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
# Créer le fichier de soumission
submission = pd.DataFrame({'id': test['id'], 'click': predictions})
submission['id'] = submission['id'].apply(lambda x: f'{x:.0f}')
print(submission.head(10))
submission.to_csv('submission.csv', index=False)

                     id     click
0  10000174058809264128  0.126911
1  10000182526920855552  0.258380
2  10000554139829213184  0.147155
3  10001094637809799168  0.045183
4  10001377041558671360  0.206341
5  10001521204153354240  0.236325
6  10001911056707022848  0.100234
7  10001982898844213248  0.042194
8  10002000217531287552  0.024202
9  10002107385290586112  0.133548


In [7]:
# Send the generated submission file to the browser as a download.
submission_file = 'submission.csv'
files.download(submission_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>