In [None]:
import kagglehub

# Kaggle dataset handle in "<owner>/<dataset>" form; no version is pinned,
# so the latest published version is fetched.
dataset_handle = "alessandrasala79/ai-vs-human-generated-dataset"

# Download the dataset and keep its local directory in `path`, which the
# feature-extraction cell below reads.
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/alessandrasala79/ai-vs-human-generated-dataset?dataset_version_number=4...


100%|██████████| 9.76G/9.76G [01:40<00:00, 104MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/alessandrasala79/ai-vs-human-generated-dataset/versions/4


In [None]:
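# Alternative for Kaggle notebooks, where the dataset can be attached as an
# input instead of being downloaded (uncomment to override `path`):
# path = "/kaggle/input/ai-vs-human-generated-dataset"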

In [None]:
import os
import numpy as np
from numpy import linalg as LA
import h5py
import pandas as pd
from tqdm import tqdm
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input


class extract_features:
    """Extract L2-normalised ResNet50 embeddings for a folder of labelled images.

    The backbone is built once in ``__init__`` (ImageNet weights, no
    classification head, global average pooling) and reused for every image.
    """

    # Spatial size every image is resized to; matches the model's input_shape.
    TARGET_SIZE = (224, 224)

    def __init__(self):
        self.model = ResNet50(weights='imagenet',
                              input_shape=self.TARGET_SIZE + (3,),
                              pooling='avg',
                              include_top=False)

    def image2features(self, img_path):
        """Return the L2-normalised feature vector of a single image.

        Args:
            img_path: Path of the image file to load.

        Returns:
            The first row of the model output divided by its Euclidean norm.
            If that norm is zero the vector is returned unchanged, so the
            result never contains NaNs produced by a division by zero.
        """
        img = image.load_img(img_path, target_size=self.TARGET_SIZE)
        img = image.img_to_array(img)
        img = np.expand_dims(img, axis=0)  # add the leading batch axis
        img = preprocess_input(img)
        feature = self.model.predict(img, verbose=0)[0]
        norm = LA.norm(feature)
        if norm == 0:
            return feature
        return feature / norm

    def extract_and_save_features(self, image_path, label_path, h5f,
                                  filename_prefix='train_data/'):
        """Extract features for every labelled image in a directory and store them.

        Args:
            image_path: Directory containing the image files.
            label_path: CSV file with ``file_name`` and ``label`` columns.
            h5f: Open, writable HDF5 file-like object. It receives the
                ``features`` and ``labels`` datasets and is ALWAYS closed by
                this method, whether it succeeds or fails.
            filename_prefix: Text prepended to each directory entry to build
                the key looked up in the ``file_name`` column. The default
                matches the original hard-coded ``'train_data/'``.

        Returns:
            ``(feature_array, label_array)`` as NumPy arrays whose rows are
            aligned and ordered by sorted file name. Directory entries that
            have no row in the CSV are skipped and counted in a printed
            notice.

        Notes:
            A failure while writing to ``h5f`` is reported on stdout and does
            not discard the arrays, which are still returned. Errors raised
            while reading the CSV or processing an image propagate to the
            caller.
        """
        try:
            labels_file = pd.read_csv(label_path)

            # Build the name -> label lookup once instead of scanning the whole
            # column for every image. setdefault keeps the FIRST occurrence of a
            # duplicated file name, like the original first-match lookup.
            label_by_name = {}
            for name, label in zip(labels_file['file_name'].values,
                                   labels_file['label'].values):
                label_by_name.setdefault(name, label)

            feature_array = []
            label_array = []
            skipped = 0

            # Sorting makes the row order reproducible; os.listdir order is arbitrary.
            for file_name in tqdm(sorted(os.listdir(image_path))):
                key = f'{filename_prefix}{file_name}'
                if key not in label_by_name:
                    skipped += 1
                    continue
                feature_array.append(
                    self.image2features(os.path.join(image_path, file_name)))
                label_array.append(label_by_name[key])

            if skipped:
                print(f"Skipped {skipped} file(s) without a label in {label_path}")

            feature_array = np.array(feature_array)
            label_array = np.array(label_array)

            print("Writing Features")
            try:
                h5f.create_dataset('features', data=feature_array)
                h5f.create_dataset('labels', data=label_array)
            except Exception as exc:
                # Best effort: keep the in-memory arrays even if the write fails,
                # but say what went wrong instead of hiding it.
                print(f"An exception occurred while writing features: {exc!r}")
        finally:
            # Release the handle on every path so a failed run does not leave
            # the file open.
            try:
                h5f.close()
            except Exception as exc:
                print(f"An exception occurred while closing the feature file: {exc!r}")
        return feature_array, label_array



if __name__ == "__main__":
    # Output directory for the extracted features (relative to the working directory).
    save_location = "AI-vs.-Human-Generated-Images-Kaggle-Competition"
    os.makedirs(save_location, exist_ok=True)

    # `path` is the dataset directory produced by the download cell above.
    image_path = f"{path}/train_data"
    label_path = f"{path}/train.csv"

    # NOTE: the variable name is historical; the extractor wraps ResNet50.
    # Build the model before opening the output file so a failure here does
    # not leave an open (and already truncated) HDF5 file behind.
    efficientNet_feature_extractor = extract_features()

    h5f = h5py.File(os.path.join(save_location, 'ResNet50_features.h5'), 'w')
    try:
        feature_array, label_array = efficientNet_feature_extractor.extract_and_save_features(image_path, label_path, h5f)
    finally:
        # extract_and_save_features closes the handle after a successful write;
        # this covers the case where it raises first. An h5py File is falsy
        # once closed, so the handle is never closed twice.
        if h5f:
            h5f.close()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


 96%|█████████▌| 76869/79950 [1:47:43<03:59, 12.86it/s]

In [None]:
# Inspect the dimensions of the extracted feature matrix (one row per processed image).
feature_array.shape

In [None]:

import numpy as np
import h5py
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd

# Load the features and labels written by the extraction cell. The relative
# path mirrors the one used when the file was created, so the cell works from
# the same working directory on any platform rather than only under /content.
FEATURES_FILE = "AI-vs.-Human-Generated-Images-Kaggle-Competition/ResNet50_features.h5"
with h5py.File(FEATURES_FILE, 'r') as h5file:
    # Slice with [:] to copy the datasets into memory before the file is closed.
    features = h5file['features'][:]
    labels = h5file['labels'][:]

# Reduce the feature vectors to three principal components for plotting.
pca = PCA(n_components=3)
features_pca = pca.fit_transform(features)

# 3-D scatter of the projected features, coloured by label.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(features_pca[:, 0], features_pca[:, 1], features_pca[:, 2],
                     c=labels, alpha=0.5)
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
ax.set_title("ResNet50 features: first three principal components")
ax.legend(*scatter.legend_elements(), title="label")
plt.show()

In [None]:
# Re-draw the PCA scatter (uses `features_pca` and `labels` from the previous
# cell) and save it as an SVG. Axis labels, a title and a legend are added so
# the saved figure can be read on its own.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(features_pca[:, 0], features_pca[:, 1], features_pca[:, 2],
                     c=labels, alpha=0.5)
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
ax.set_title("ResNet50 features: first three principal components")
ax.legend(*scatter.legend_elements(), title="label")
# Save through the figure object rather than pyplot's implicit current figure.
fig.savefig('feature.svg')

In [None]:
# from IPython.display import FileLink
# FileLink('/content/AI-vs.-Human-Generated-Images-Kaggle-Competition/vgg16_features.h5')

In [None]:
# from pydrive2.auth import GoogleAuth
# from pydrive2.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

In [None]:
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

In [None]:
# from google.colab import auth
# auth.authenticate_user()
# from googleapiclient.discovery import build
# drive_service = build('drive', 'v3')

In [None]:
# !zip -r features.zip /content/AI-vs.-Human-Generated-Images-Kaggle-Competition/vgg16_features.h5

  adding: kaggle/working/AI-vs.-Human-Generated-Images-Kaggle-Competition/efficientNetB7_features.h5 (deflated 8%)


In [None]:
# from IPython.display import FileLink
# FileLink('/kaggle/working/features.zip')

In [None]:
# from googleapiclient.http import MediaFileUpload

# file_metadata = {
#   'name': 'Vgg16_Features.zip',
#   'mimeType': '*/*'
# }
# media = MediaFileUpload('/kaggle/working/features.zip',
#                         mimetype='*/*',
#                         resumable=True)
# created = drive_service.files().create(body=file_metadata,
#                                        media_body=media,
#                                        fields='id').execute()
# print('File ID: {}'.format(created.get('id')))

File ID: 1yHlBWjHzsHC4CGUK90LGnZM7wUeAUhXq


In [None]:
# from googleapiclient.http import MediaFileUpload

# file_metadata = {
#   'name': 'Vgg16_Features.h5',
#   'mimeType': '*/*'
# }
# media = MediaFileUpload('/kaggle/working/AI-vs.-Human-Generated-Images-Kaggle-Competition/vgg16_features.h5',
#                         mimetype='*/*',
#                         resumable=True)
# created = drive_service.files().create(body=file_metadata,
#                                        media_body=media,
#                                        fields='id').execute()
# print('File ID: {}'.format(created.get('id')))

File ID: 1b086EwHZrTVb6_LLtFBSKfkXYlfj1RXZ
