In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import os


In [None]:

# Resolve the base directory that contains the data/ folder.
# `__file__` is only defined when this code runs as a script or module; a
# notebook kernel typically does not define it, and referencing it raises
# NameError. Fall back to the current working directory in that case.
try:
    current_dir = os.path.dirname(__file__)  # directory containing the script
except NameError:
    current_dir = os.getcwd()

# Paths to the input files
train_csv_path = os.path.join(current_dir, 'data', 'jigsaw-toxic-comment-train.csv')
test_csv_path = os.path.join(current_dir, 'data', 'test.csv')

# Load the training data
train_data = pd.read_csv(train_csv_path)


In [None]:

# Data preparation: coerce the text column to str and the label column to int
for column, dtype in (('comment_text', str), ('toxic', int)):
    train_data[column] = train_data[column].astype(dtype)

# Split into training and validation sets (20% held out, fixed seed)
features = train_data['comment_text']
targets = train_data['toxic']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Turn the text into numeric features with TF-IDF (unigrams and bigrams,
# vocabulary capped at 20000 terms)
tfidf_params = {'max_features': 20000, 'ngram_range': (1, 2)}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit the vocabulary on the training texts only, then reuse it for validation
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)


In [None]:

# Train a simple logistic-regression model (fit() returns the fitted estimator)
model = LogisticRegression(max_iter=1000).fit(X_train, train_labels)

# Evaluate on the validation split using the second predict_proba column
val_proba = model.predict_proba(X_val)
val_preds = val_proba[:, 1]
roc_auc = roc_auc_score(y_true=val_labels, y_score=val_preds)
print(f"Validation ROC-AUC: {roc_auc:.4f}")



In [None]:
# Load the test data and coerce its text column to str
test_data = pd.read_csv(test_csv_path)
test_contents = test_data['content'].astype(str)
test_data['content'] = test_contents

# Convert the test text into numeric features with the already-fitted vectorizer
X_test = vectorizer.transform(test_contents)

# Predict on the test data, keeping the second predict_proba column
test_proba = model.predict_proba(X_test)
test_preds = test_proba[:, 1]



In [None]:
# Build the output file: one row per test id with its predicted score
submission_path = os.path.join(current_dir, 'submission.csv')
submission = test_data[['id']].assign(toxic=test_preds)

# Write without the DataFrame index so only the id and toxic columns are saved
submission.to_csv(submission_path, index=False)

print(f"Submission file saved as {submission_path}")
