<a href="https://colab.research.google.com/github/ananyaa-04/fake-news/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q kaggle

In [None]:
# Upload kaggle.json (the Kaggle API credentials file).
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns a {file name: file bytes} dict, and echoing it writes
# the API key into the notebook's saved output.
uploaded_files = files.upload()

# Show only the names of what was uploaded, never the contents.
print("Uploaded:", list(uploaded_files))

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Setup kaggle API
# Create ~/.kaggle if needed (-p: no error when it already exists) and copy the
# uploaded kaggle.json from the working directory into it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: the credentials file is readable and writable by its owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI.
# NOTE(review): the saved output of this cell ends in ^C, i.e. the download was
# interrupted. The unzip step that follows needs the completed
# ai-vs-human-generated-dataset.zip in the working directory, so let this cell
# run to completion before continuing.
!kaggle datasets download -d alessandrasala79/ai-vs-human-generated-dataset

Dataset URL: https://www.kaggle.com/datasets/alessandrasala79/ai-vs-human-generated-dataset
License(s): apache-2.0
^C


In [None]:
import zipfile
import os

# Unzip the downloaded archive into "ai_human_dataset", the directory that the
# later cells read from (pd.read_csv("ai_human_dataset/...") and the
# "/content/ai_human_dataset/" image path prefix). The previous target,
# "ai_vs_human", did not match those paths.
with zipfile.ZipFile("ai-vs-human-generated-dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("ai_human_dataset")

FileNotFoundError: [Errno 2] No such file or directory: 'ai-vs-human-generated-dataset.zip'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [None]:
# Read the two CSV tables from the extracted dataset directory, in order:
# first the training table, then the test table.
train_df, test_df = map(
    pd.read_csv,
    ("ai_human_dataset/train.csv", "ai_human_dataset/test.csv"),
)

In [None]:
# Stack both tables into a single frame with a fresh 0..n-1 index, then shuffle
# the rows with a fixed random_state so the order is reproducible.
# NOTE(review): presumably test.csv carries the same columns as train.csv
# (in particular 'label'); rows from a table lacking a column get NaN there.
df = shuffle(
    pd.concat([train_df, test_df], ignore_index=True),
    random_state=42,
)

In [None]:
# Remove rows without a label BEFORE casting to str. astype(str) can turn NaN
# into the literal string 'nan', which dropna()/notna() no longer detect and
# which would otherwise enter the stratified split as a class of its own.
df = df.dropna(subset=['label']).copy()
df['label'] = df['label'].astype(str)

In [None]:
def _with_dataset_root(relative_name):
    """Return *relative_name* prefixed with the dataset directory under /content."""
    return f"/content/ai_human_dataset/{relative_name}"


# Rewrite every file_name entry into a path under the extracted dataset folder.
df['file_name'] = df['file_name'].apply(_with_dataset_root)

In [None]:
# Stage 1: hold out 20% of the rows, stratified on the label column.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Stage 2: cut the held-out part in half -> validation and test sets,
# again stratified on the label column.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

# Report the size of each split.
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

In [None]:
# Drop rows where the label is NaN in every split (previously only the training
# split was filtered). The explicit .copy() makes each split an independent
# frame, so the column assignments below do not write to a slice of the
# pre-split data.
# NOTE(review): dropna only removes real NaN values; a label that is already
# the string 'nan' is not caught here.
train_df = train_df.dropna(subset=['label']).copy()
val_df = val_df.dropna(subset=['label']).copy()
test_df = test_df.dropna(subset=['label']).copy()

# Convert labels to string format
train_df['label'] = train_df['label'].astype(str)
val_df['label'] = val_df['label'].astype(str)
test_df['label'] = test_df['label'].astype(str)

In [None]:
# Keep only the rows whose label is not NA, as an independent copy per split.
train_df, val_df, test_df = (
    split.loc[split['label'].notna()].copy()
    for split in (train_df, val_df, test_df)
)

In [None]:
def _numeric_labels_only(frame):
    """Return a copy of *frame* restricted to rows with a numeric label.

    Labels that pd.to_numeric cannot parse (coerced to NaN) are dropped; the
    remaining labels are normalised through float -> int -> str, so for example
    '1.0' becomes '1'.
    """
    parsed = pd.to_numeric(frame['label'], errors='coerce')
    kept = frame[parsed.notnull()].copy()
    kept['label'] = kept['label'].astype(float).astype(int).astype(str)
    return kept


# 1 + 2. Drop non-numeric labels and rewrite the rest as integer strings.
train_df = _numeric_labels_only(train_df)
val_df = _numeric_labels_only(val_df)
test_df = _numeric_labels_only(test_df)

# 3. Confirm cleanup: list the distinct labels left in each split.
for caption, split in (
    ("Train labels:", train_df),
    ("Val labels:", val_df),
    ("Test labels:", test_df),
):
    print(caption, split['label'].unique())

In [None]:
# Show the distinct label values present in each split.
for caption, split in (
    ("Train labels:", train_df),
    ("Validation labels:", val_df),
    ("Test labels:", test_df),
):
    print(caption, split['label'].unique())

In [None]:
!pip install tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training-time augmentation: random flips, rotations, zoom, shear and shifts.
# No rescale=1./255 here on purpose: the Keras EfficientNet models include
# their own input rescaling and expect raw pixel values in the [0, 255] range.
# Dividing by 255 first would feed the network inputs on the wrong scale.
train_datagen = ImageDataGenerator(
    horizontal_flip=True,
    rotation_range=20,
    zoom_range=0.2,
    shear_range=0.15,
    width_shift_range=0.1,
    height_shift_range=0.1
)

# Validation and test images are passed through unchanged (no augmentation,
# no rescaling, for the same reason as above).
val_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

# Each generator reads image paths from 'file_name' and labels from 'label',
# resizes images to 224x224 and yields batches of 32 with binary labels.
train_gen = train_datagen.flow_from_dataframe(
    train_df, x_col='file_name', y_col='label',
    target_size=(224, 224), batch_size=32,
    class_mode='binary'
)

val_gen = val_datagen.flow_from_dataframe(
    val_df, x_col='file_name', y_col='label',
    target_size=(224, 224), batch_size=32,
    class_mode='binary'
)

# shuffle=False keeps the test batches in dataframe order.
test_gen = test_datagen.flow_from_dataframe(
    test_df, x_col='file_name', y_col='label',
    target_size=(224, 224), batch_size=32,
    class_mode='binary', shuffle=False
)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense
from tensorflow.keras.optimizers import Adam

In [None]:
# EfficientNetB0 backbone without its classification top, initialised with
# ImageNet weights, for 224x224 RGB inputs.
base_model = EfficientNetB0(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

# Mark the whole backbone as trainable.
base_model.trainable = True

In [None]:
# Freeze every backbone layer except the last ten.
# NOTE(review): the ten layers left trainable may include BatchNormalization
# layers - confirm that updating them during fine-tuning is intended.
frozen_layers = base_model.layers[:-10]
for layer in frozen_layers:
    layer.trainable = False

# Add custom top: global average pooling -> dropout (rate 0.5) -> a single
# sigmoid unit, computed in float32.
pooling = GlobalAveragePooling2D()
x = pooling(base_model.output)
dropout = Dropout(0.5)
x = dropout(x)
classifier = Dense(1, activation='sigmoid', dtype='float32')
output = classifier(x)

# Full model: backbone input through to the sigmoid output.
model = Model(inputs=base_model.input, outputs=output)

In [None]:
# Configure training: Adam with a learning rate of 1e-5, binary cross-entropy
# loss, and accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=Adam(learning_rate=1e-5),
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop after 3 epochs without improvement of the monitored quantity (left at
# the callback's default) and restore the best weights seen.
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

# Halve the learning rate after 2 epochs without val_loss improvement.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)

callbacks = [early_stopping, lr_on_plateau]

# Train for at most 10 epochs, validating on val_gen after each epoch.
history = model.fit(
    train_gen,
    epochs=10,
    validation_data=val_gen,
    callbacks=callbacks,
)

In [None]:
# Evaluate on the test generator; the result unpacks into the loss followed by
# the accuracy metric configured at compile time.
test_metrics = model.evaluate(test_gen)
loss, acc = test_metrics
print(f"Test Accuracy: {acc:.4f}, Test Loss: {loss:.4f}")