In [1]:
!pip install torch transformers pandas
!pip install Keras-Preprocessing

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import confusion_matrix
import os

In [3]:
# Cap each native math-library thread pool at one thread; equivalent to
# running `export <VAR>=1` in the shell for every variable listed below.
# NOTE(review): this cell runs after `import torch` in an earlier cell;
# libraries that read these variables only at load time may not pick them
# up -- confirm, or move this cell above the imports.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
        ),
        "1",
    )
)

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
# Mount Google Drive at /content/drive; the model and CSV paths used in the
# cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
# Directory on the mounted Drive from which both the classifier and its
# tokenizer are restored.
MODEL_DIR = '/content/drive/My Drive/Dataset/NLP/my_model/'

# Load the tokenizer and the sequence-classification model from the same
# directory, then move the model to the selected device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model = model.to(device)

In [9]:
# Input CSV locations on the mounted Drive. Only the submissions file is
# read by the cells below; the other paths are left disabled for reference.
# file_path_05 = '/content/drive/My Drive/Dataset/NLP/05_List_11_dedup.csv'
# file_path_08 = '/content/drive/My Drive/Dataset/NLP/08_Dict_22_dedup.csv'
file_path_human = '/content/drive/My Drive/Dataset/NLP/submissions.csv'
# file_path_ai = '/content/drive/My Drive/Dataset/NLP/generated_submissions.csv'

In [10]:
# List the dataset folder to confirm the Drive mount works and the expected
# files are present.
os.listdir('/content/drive/My Drive/Dataset/NLP')

['05_List_11_dedup.csv',
 '08_Dict_22_dedup.csv',
 'generated_submissions.csv',
 'submissions.csv',
 'my_model',
 'SnifferTrain.ipynb',
 'SnifferCall.ipynb']

In [11]:
# Load the submissions CSV into a DataFrame; the loads for the other
# datasets are left disabled.
# df_05 = pd.read_csv(file_path_05)
# df_08 = pd.read_csv(file_path_08)
df_human = pd.read_csv(file_path_human)
# df_ai = pd.read_csv(file_path_ai)

In [12]:
# Count how many rows carry each value of the 'type' column.
df_human['type'].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
human,4047
unknown,1718


In [13]:
# Keep only the rows whose 'type' is 'unknown'; these are the submissions
# scored by the model in the cells below.
# .copy() makes the result an independent frame, so the column assignments
# in later cells do not raise pandas' SettingWithCopyWarning (they would
# otherwise be writing into a slice of the originally loaded frame).
df_human = df_human.loc[df_human['type'] == 'unknown'].copy()
df_human

Unnamed: 0,id,author,problem_id,code,submission_id,score,time,type
5,1804043,6630031121,05_List_11,"a=input()\nx=""0123456789""\nk=[0,1,2,3,4,5,6,7,...",1804043,100.0,over 1 year ago,unknown
6,2015946,6730088021,05_List_11,d = input()\ncounts = [0]*10\nfor c in d :\n ...,2015946,100.0,7 months ago,unknown
17,1971879,6731313421,05_List_11,strg = input()\nused = []\nfor i in range(10):...,1971879,100.0,7 months ago,unknown
23,1916726,6630135821,05_List_11,"x=input()\ny=""""\ni=0\nwhile i<10:\n if str(...",1916726,100.0,about 1 year ago,unknown
29,2017300,6730013021,05_List_11,"n = [""0"",""1"",""2"",""3"",""4"",""5"",""6"",""7"",""8"",""9""]\...",2017300,100.0,7 months ago,unknown
...,...,...,...,...,...,...,...,...
5712,1915994,6630337221,08_Dict_22,n=int(input())\nprice={}\ntotal=0\nfor i in ra...,1915994,100.0,about 1 year ago,unknown
5713,1918828,6632026421,08_Dict_22,N = int(input())\nicecream = {}\nfor i in rang...,1918828,100.0,about 1 year ago,unknown
5731,1845730,6632218621,08_Dict_22,ice={}\nfor i in range(int(input())):\n x=i...,1845730,100.0,over 1 year ago,unknown
5734,1825827,6632015521,08_Dict_22,num1 = int(input())\nn1 = {}\nfor i in range(n...,1825827,100.0,over 1 year ago,unknown


In [18]:
from tqdm import tqdm

In [24]:
# Classify every submission one at a time and collect the predicted class ids.

# from_pretrained() already returns the model in evaluation mode; this is
# made explicit so dropout stays disabled even if the mode was changed
# elsewhere in the session.
model.eval()

predictions = []

for code in tqdm(df_human['code'], desc="Predicting"):
    # Tokenize through the tokenizer's __call__; encode_plus is deprecated.
    inputs = tokenizer(
        code,
        return_tensors='pt',        # return PyTorch tensors
        truncation=True,            # cut inputs longer than max_length
        padding='max_length',
        max_length=512
    ).to(device)

    with torch.no_grad():  # no gradients needed for inference; saves memory
        logits = model(**inputs).logits

    # The batch holds a single sequence, so argmax over dim 1 yields exactly
    # one class id, which .item() converts to a Python int.
    predictions.append(torch.argmax(logits, dim=1).item())

# Attach the predictions as the 'predicted' column. assign() returns a new
# frame, which avoids SettingWithCopyWarning when df_human is a filtered
# slice of another frame.
df_human = df_human.assign(predicted=predictions)

Predicting: 100%|██████████| 1718/1718 [00:50<00:00, 33.80it/s]


In [25]:
# Human-readable names for the class ids stored in 'predicted'.
# NOTE(review): the id -> name mapping is taken as written here; confirm it
# matches the label encoding used when the model was trained.
label_map = {0: 'human', 1: 'generated'}

# assign() builds a new frame instead of writing into df_human in place,
# so no SettingWithCopyWarning is raised if df_human is a filtered slice.
df_human = df_human.assign(predicted_label=df_human['predicted'].map(label_map))

In [26]:
# Per-problem count of predictions for each class.
# Only class ids 0 and 1 have names; fail loudly on anything else rather
# than silently dropping it from the table.
assert set(df_human['predicted'].unique()) <= {0, 1}, "unexpected class id in 'predicted'"

summary = (
    df_human.groupby('problem_id')['predicted']
    .value_counts()
    .unstack(fill_value=0)
    # Guarantee both class columns exist, in a fixed order, even when one
    # class is never predicted. Assigning a two-name list to .columns would
    # raise a length-mismatch ValueError in that case.
    .reindex(columns=[0, 1], fill_value=0)
    # Rename by class id rather than by column position.
    .rename(columns={0: 'human', 1: 'generated'})
    # Drop the 'predicted' header on the columns axis, matching the plain
    # column labels the table had before.
    .rename_axis(columns=None)
)

In [27]:
# Print the per-problem prediction counts as plain text.
print(summary)

            human  generated
problem_id                  
05_List_11   1295          9
08_Dict_22    411          3
