In [81]:
# https://platform.olimpiada-ai.ro/problems/64

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [82]:
# Locations of the two competition splits on the Kaggle input mount.
train_csv_path = "/kaggle/input/chirper-melon-husk/train.csv"
test_csv_path = "/kaggle/input/chirper-melon-husk/test.csv"

# Read both splits in the same way: train first, then test.
train, test = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

# Cell output: (rows, columns) of each split.
(train.shape, test.shape)

((28765, 3), (3197, 2))

In [83]:
# Peek at the first three training rows to see the available columns.
train.head(n=3)

Unnamed: 0,id,chirp,label
0,25758,@user bihday greg t,0
1,12137,looks like le monsieur bob approves of the new...,0
2,29855,"#cavycorners, hooded_cavy #f4f rss: hooded_cavy",0


In [84]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# The raw text column is the only model input; `label` is what we predict.
features = ['chirp']
target_col = 'label'

X = train[features]
y = train[target_col]

# Hold out 10% of the training rows for validation. Stratifying on the label
# keeps the class proportions equal in both parts, and the fixed seed makes
# the split repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Same feature columns for the unlabeled test split.
X_test = test[features]

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, vocabulary capped at 30000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)

# Learn the vocabulary and IDF weights from the training split only ...
X_train_tfidf = vectorizer.fit_transform(X_train['chirp'])

# ... then reuse that fitted vectorizer, unchanged, for validation and test.
X_valid_tfidf, X_test_tfidf = (
    vectorizer.transform(split['chirp']) for split in (X_valid, X_test)
)

In [86]:
from sklearn.linear_model import LogisticRegression

# Plain logistic regression with library-default hyperparameters.
model = LogisticRegression()

# `fit` is the last expression so the fitted estimator is shown as cell output.
model.fit(X=X_train_tfidf, y=y_train)

In [87]:
from sklearn.metrics import roc_auc_score

# Class probabilities for the held-out rows; column 1 is used as the score.
# NOTE(review): column 1 is label 1 only if model.classes_ is [0, 1] — confirm.
valid_proba = model.predict_proba(X_valid_tfidf)
y_pred = valid_proba[:, 1]

# ROC AUC of those scores against the held-out labels.
score = roc_auc_score(y_true=y_valid, y_score=y_pred)

print(f'Score: {score:.5f}')

Score: 0.95315


In [88]:
# Build the long-format submission: three rows per test datapoint, in the
# order subtask 1, 2, 3, keeping the datapoints in `test` row order.
#   subtask 1 -> number of characters in the chirp
#   subtask 2 -> number of '#' characters in the chirp
#   subtask 3 -> model score (column 1 of predict_proba)

# Scores come back as a positional array aligned with the rows of X_test_tfidf.
y_pred = model.predict_proba(X_test_tfidf)[:, 1]

n_rows = len(test)
# Everything below pairs values by position, so the lengths must agree.
assert len(y_pred) == n_rows, "prediction count does not match number of test rows"

# Vectorised text statistics (replaces the per-row iterrows loop).
chirp_text = test['chirp']
chirp_len = chirp_text.str.len().to_numpy()
hashtag_count = chirp_text.str.count('#').to_numpy()

# Interleave per datapoint: [len_0, hash_0, pred_0, len_1, hash_1, pred_1, ...].
# Stacking as columns and flattening row-major yields exactly that order, and
# pairs predictions with rows by position rather than by DataFrame index label
# (the old `y_pred[i]` was only correct for a default 0..n-1 index).
sids = np.tile([1, 2, 3], n_rows)
dpids = np.repeat(test['id'].to_numpy(), 3)
answers = np.column_stack([chirp_len, hashtag_count, y_pred]).ravel()

subm = pd.DataFrame({
    'subtaskID': sids,
    'datapointID': dpids,
    'answer': answers
})

subm.to_csv("submission.csv", index=False)
subm.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,12413,38.0
1,2,12413,2.0
2,3,12413,0.004042
3,1,9312,55.0
4,2,9312,2.0
