In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0192/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0192/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0956/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0956/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0266/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0266/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0435/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0435/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_1054/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_1054/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0664/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0664/fil

In [2]:
# Read the training labels table (columns shown in the preview: id, real_text_id).
train_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv"
)

# Preview the first five rows as the cell's rich output.
train_df.head(5)

Unnamed: 0,id,real_text_id
0,0,1
1,1,2
2,2,1
3,3,2
4,4,2


In [3]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split # Import for splitting data
from sklearn.metrics import accuracy_score # Import for checking accuracy

# --- Steps 1 & 2: Load Test Data and Training Text ---
# Train and test article folders both hold the same pair of files, so a single
# helper reads them instead of repeating the open/read logic in each loop.
def _read_article_pair(article_dir):
    """Return the contents of ``file_1.txt`` and ``file_2.txt`` in ``article_dir``.

    Args:
        article_dir: Path of a single article folder.

    Returns:
        A ``(file_1_text, file_2_text)`` tuple of strings.

    Raises:
        OSError: If either file is missing or cannot be opened.
        UnicodeDecodeError: If either file cannot be decoded as UTF-8.
    """
    texts = []
    for file_name in ("file_1.txt", "file_2.txt"):
        # Pin the encoding: a bare open() decodes with the locale's default,
        # which differs between machines.
        # NOTE(review): assumes the dataset files are UTF-8 -- confirm.
        with open(os.path.join(article_dir, file_name), encoding="utf-8") as handle:
            texts.append(handle.read())
    return texts[0], texts[1]


# --- Step 1: Load Test Data ---
test_base_path = "/kaggle/input/fake-or-real-the-impostor-hunt/data/test"
test_data = []

print("Loading test data...")
# Sorted so the rows of test_df follow folder-name order.
for article_folder in sorted(os.listdir(test_base_path)):
    article_path = os.path.join(test_base_path, article_folder)

    # Anything that is not a directory is skipped.
    if not os.path.isdir(article_path):
        continue

    file1, file2 = _read_article_pair(article_path)
    test_data.append({
        "id": int(article_folder.split('_')[1]),  # e.g. "article_0192" -> 192
        "file_1": file1,
        "file_2": file2,
    })

test_df = pd.DataFrame(test_data)
print("Test data loaded successfully.")


# --- Step 2: Load and Prepare Training Text ---
train_base_path = "/kaggle/input/fake-or-real-the-impostor-hunt/data/train"
train_texts_1 = []
train_texts_2 = []

print("Loading training texts...")
# Iterate the id column directly (same row order as train_df). Unlike
# iterrows(), this cannot upcast the id to float when other columns are
# non-integer, which would break the zero-padded ":04d" folder name.
for train_id in train_df['id']:
    first_text, second_text = _read_article_pair(
        os.path.join(train_base_path, f"article_{int(train_id):04d}")
    )
    train_texts_1.append(first_text)
    train_texts_2.append(second_text)


# --- Step 3: Feature Engineering (The "Difference" Method) ---
# Each article is represented by TF-IDF(file_1) minus TF-IDF(file_2), so the
# classifier is given the difference between the two candidate texts.
print("Creating features...")
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)

# The vectorizer is fitted on every available text: both training files
# and both test files.
all_texts = [
    *train_texts_1,
    *train_texts_2,
    *test_df['file_1'].tolist(),
    *test_df['file_2'].tolist(),
]
vectorizer.fit(all_texts)

X_train_1, X_train_2 = (
    vectorizer.transform(texts) for texts in (train_texts_1, train_texts_2)
)
X_test_1, X_test_2 = (
    vectorizer.transform(test_df[column]) for column in ('file_1', 'file_2')
)

y = train_df['real_text_id']
X = X_train_1 - X_train_2  # feature matrix for the full training set
X_test = X_test_1 - X_test_2
print("Features created successfully.")


# --- Step 4: Validate Model Performance ---
# Hold out part of the labelled data and score a model that never saw it,
# to get an estimate of accuracy before training on everything.
# --------------------------------------------------------------------------
print("Validating model...")

# 80% train / 20% validation.
# - stratify=y keeps the label proportions the same in both parts.
# - random_state=42 makes the split repeatable.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# A throwaway model fitted ONLY on the training part of the split
# (fit() returns the estimator itself, so the calls can be chained).
validation_model = LogisticRegression(
    solver='liblinear', C=0.1, random_state=42
).fit(X_train_split, y_train_split)

# Predict the held-out rows and compare the predictions to the true labels.
val_preds = validation_model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=val_preds)
print(f"✅ Validation Accuracy: {accuracy:.4f}")


# --- Step 5: Final Model Training & Submission ---
# Refit the same model configuration on the full training set (X, y), then
# predict the test set and write the predictions out as a CSV.
# ------------------------------------------------------------------------------------
print("Training final model on all data...")
final_model = LogisticRegression(
    solver='liblinear', C=0.1, random_state=42
).fit(X, y)  # full X and y, not the validation split
print("Model trained successfully.")

print("Making final predictions and saving submission file...")
test_predictions = final_model.predict(X_test)

# Pair each test article id with its predicted real_text_id.
submission_df = test_df[['id']].assign(real_text_id=test_predictions)
submission_df.to_csv('submission.csv', index=False)

print("\n✅ Submission.csv created successfully!")
submission_df.head()

Loading test data...
Test data loaded successfully.
Loading training texts...
Creating features...
Features created successfully.
Validating model...
✅ Validation Accuracy: 0.7368
Training final model on all data...
Model trained successfully.
Making final predictions and saving submission file...

✅ Submission.csv created successfully!


Unnamed: 0,id,real_text_id
0,0,2
1,1,2
2,2,1
3,3,1
4,4,2
