In [None]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (train_test_split)
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
from sklearn.metrics import precision_score
import warnings
warnings.filterwarnings('ignore')

  from tqdm.autonotebook import tqdm, trange


English Model

In [None]:
# Load the human dataset
engdata = pd.read_csv("data/Training_Essay_Data.csv")

In [4]:
engdata

Unnamed: 0,text,generated
0,Car-free cities have become a subject of incre...,1
1,"Car Free Cities Car-free cities, a concept ga...",1
2,A Sustainable Urban Future Car-free cities ...,1
3,Pioneering Sustainable Urban Living In an e...,1
4,The Path to Sustainable Urban Living In an ...,1
...,...,...
29140,There has been a fuss about the Elector Colleg...,0
29141,Limiting car usage has many advantages. Such a...,0
29142,There's a new trend that has been developing f...,0
29143,As we all know cars are a big part of our soci...,0


In [5]:
# Remove missing values
engdata.dropna(inplace=True)

In [6]:
engdata["text"] = engdata["text"].apply(lambda x: x.lower())

In [7]:
label_counts = engdata["generated"].value_counts()
print(label_counts)

generated
0    17508
1    11637
Name: count, dtype: int64


In [8]:
# Separate the majority and minority classes
minority_class = engdata[engdata['generated'] == 1]  # Replace with your minority label
majority_class = engdata[engdata['generated'] == 0]  # Replace with your majority label

In [9]:
# Undersample the majority class to match the size of the minority class
undersampled_majority = majority_class.sample(len(minority_class), random_state=42)

In [10]:
# Combine the minority class with the undersampled majority class
balanced_eng_data = pd.concat([minority_class, undersampled_majority])

In [11]:
# Shuffle the dataset
balanced_eng_data = balanced_eng_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [12]:
print(balanced_eng_data['generated'].value_counts())

generated
0    11637
1    11637
Name: count, dtype: int64


In [13]:
english_vectorizer = TfidfVectorizer(max_features=1000) # Limit features to avoid dimensionality issues
english_features = english_vectorizer.fit_transform(balanced_eng_data["text"])

In [14]:
#Train test split

X_train, X_test, y_train, y_test = train_test_split(english_features, balanced_eng_data["generated"], test_size=0.2, random_state=42)

In [15]:
dummy = DummyClassifier(random_state=123, strategy="most_frequent")

# Fit the dummy model on the training data
dummy.fit(X_train, y_train)

y_dummy_pred = dummy.predict(X_test)

y_dummy_pred_proba = dummy.predict_proba(X_test)[:,1] # Probabilities for the positive class

In [None]:
#Using precision because the cost of false positives (e.g., classifying human-written text as AI-generated) is high.
dummy_precision = precision_score(y_test, y_dummy_pred, average='weighted')
print(f'Precision: {dummy_precision}')

#UndefinedMetricWarning is expected because it only predicts the majority class (e.g., "Human") and never predicts the minority class (e.g., "AI"). As a result, for the minority class ("AI"), precision is undefined because it never predicted it.

Precision: 0.24433960457380982


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
model = RandomForestClassifier(n_estimators=500, max_features='sqrt',  n_jobs=-1, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)

y_pred_proba = model.predict_proba(X_test)[:,1] # Probabilities for the positive class


precision = precision_score(y_test, y_pred, average='weighted')
print(f'Precision: {precision}')

Precision: 0.9867496353134677


In [19]:
def test_model_on_input(model, vectorizer):
    # Get user input for English and Chinese text
    user_input = input("Enter text: ")

    # Preprocess inputs
    vector = vectorizer.transform([user_input])  # Vectorize input

    # Predict using the trained model
    prediction = model.predict(vector)

    # Display the result
    label = "Human" if prediction[0] == 0 else "AI"
    print(f"The entered text is classified as: {label}")

In [20]:
# test_model_on_input(model, english_vectorizer)

**Chinese Model**

In [None]:
human_chindata = pd.read_csv('data/cmn.txt', sep="\t", names=["English", "Chinese", "CC"])

In [22]:
human_chindata

Unnamed: 0,English,Chinese,CC
0,Hi.,嗨。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Hi.,你好。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Run.,你用跑的。,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
3,Wait!,等等！,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
4,Wait!,等一下！,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
...,...,...,...
24355,"Tom didn't know how to translate the word ""com...",汤姆不知如何翻译“计算机”一词，因为同他谈话的人从未见过一台。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
24356,"Even now, I occasionally think I'd like to see...",即使是现在，我偶尔还是想见到你。不是今天的你，而是我记忆中曾经的你。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
24357,It's very easy to sound natural in your own na...,你很容易把母语说得通顺流畅，却很容易把非母语说得不自然。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
24358,"I got fired from the company, but since I have...",虽然我被公司解雇了，但是我还有点存款，所以目前不用担心生计问题。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


In [23]:
human_chindata = human_chindata["Chinese"]

In [24]:
sampled_human_chindata = human_chindata.sample(n=100, random_state=42)

In [25]:
sampled_human_chin_df= pd.DataFrame(sampled_human_chindata)

In [26]:
sampled_human_chin_df['generated'] = 0

In [27]:
sampled_human_chin_df

Unnamed: 0,Chinese,generated
2009,沒有水。,0
10935,我一直信守我的諾言。,0
21496,他們總是早起，即使是在周日。,0
15805,堅持下來，你就能成功。,0
9578,聖誕節很快就到了，不是嗎？,0
...,...,...
18036,我們的辦公室之間有扇門。,0
3350,他一个人生活。,0
22464,汤姆没有愿意帮助他的朋友。,0
23790,在冬天，很多老人在冰上滑倒。,0


In [None]:
ai_chin_df = pd.read_csv('data/manual_chinese_ai.csv', names=["Chinese"])

In [29]:
ai_chin_df['generated'] = 1

In [30]:
ai_chin_df

Unnamed: 0,Chinese,generated
0,早上六点钟，闹钟响了，开始了新的一天。我起床后会先喝一杯温水，然后去阳台感受清晨的阳光。早餐...,1
1,今天的天气非常晴朗，天空一片蔚蓝。我决定去附近的公园跑步，享受清新的空气和鸟鸣声。公园里有很...,1
2,工作日的早晨总是忙碌的。赶着洗漱、换衣服、整理书包，然后匆匆出门。地铁站挤满了人，每个人都面...,1
3,中午和同事们一起去附近的小餐馆吃饭。今天点了牛肉面和小菜，大家聊了聊最近的项目进展和生活琐事...,1
4,下午的时间似乎总是特别漫长。办公桌上堆满了文件，我一边听着轻音乐，一边完成最后的报告。偶尔还...,1
...,...,...
95,随着城市化进程的加快，宠物饲养逐渐成为都市生活的重要组成部分。然而，宠物与人类共享居住空间，...,1
96,宠物文化在不同国家展现出鲜明的特色。在日本，猫咖啡馆和狗公园随处可见，这些场所不仅为宠物提供...,1
97,养狗的乐趣真的太多了。比如说，每次出门遛狗，它总能找到各种新奇的地方去探险。有一次它竟然把头...,1
98,我家的鸟特别聪明，它会学人说话。有一次我不小心打翻了水杯，它居然学着说：“你又犯错了！”逗得...,1


In [31]:
# Combine the datasets (vertical stack)
combined_chin_df = pd.concat([ai_chin_df, sampled_human_chin_df], ignore_index=True)

# Shuffle the combined dataset
shuffled_combined_chin_df = combined_chin_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [32]:
shuffled_combined_chin_df

Unnamed: 0,Chinese,generated
0,随着城市化进程的加快，宠物饲养逐渐成为都市生活的重要组成部分。然而，宠物与人类共享居住空间，...,1
1,今天我去了花市，买了一盆绿色植物放在办公桌上。它让整个房间显得更加生机勃勃，也让我在工作时感...,1
2,政治的本质在于权力的分配与运行，这是一个社会稳定与发展的关键要素。历史表明，一个公平、公正的...,1
3,为什么您一个人？,0
4,今天晚饭吃什么。,0
...,...,...
195,战争还没结束。,0
196,下午的阳光特别温暖，我坐在窗边喝了一杯咖啡。手边是一张明信片，上面写着来自朋友的问候。这种简...,1
197,仓鼠每次喝水的时候都会把头扬得特别高，像是在表演，超级可爱。,1
198,他很容易觉得累。,0


In [33]:
shuffled_combined_chin_df["Chinese"] = shuffled_combined_chin_df["Chinese"].apply(lambda x: " ".join(jieba.cut(x)))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\white\AppData\Local\Temp\jieba.cache
Loading model cost 0.602 seconds.
Prefix dict has been built successfully.


In [34]:
chinese_vectorizer = TfidfVectorizer(max_features=1000)
chinese_features = chinese_vectorizer.fit_transform(shuffled_combined_chin_df["Chinese"])

In [35]:
print(shuffled_combined_chin_df['generated'].value_counts())


generated
1    100
0    100
Name: count, dtype: int64


In [36]:
C_X_train, C_X_test, C_y_train, C_y_test = train_test_split(chinese_features, shuffled_combined_chin_df["generated"], test_size=0.2, random_state=42)

In [37]:
dummyC = DummyClassifier(random_state=123)

# Fit the dummy model on the training data
dummyC.fit(C_X_train, C_y_train)

C_y_dummy_pred = dummyC.predict(C_X_test)

In [38]:
C_dummy_precision = precision_score(C_y_test, C_y_dummy_pred, average='weighted')
print(f'Precision: {C_dummy_precision}')

Precision: 0.20249999999999999


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
C_model = RandomForestClassifier(n_estimators=500, max_features='sqrt',  n_jobs=-1, random_state=42, class_weight='balanced')
C_model.fit(C_X_train, C_y_train)

In [40]:
C_y_pred = C_model.predict(C_X_test)


c_precision = precision_score(C_y_test, C_y_pred, average='weighted')
print(f'Precision: {c_precision}')

Precision: 0.8392857142857142


In [41]:
#test_model_on_input(C_model, chinese_vectorizer)

Combined Model

In [42]:
sampled_eng_data = balanced_eng_data.sample(n=200, random_state=42)

In [43]:
sampled_eng_data

Unnamed: 0,text,generated
18669,"thomas jefferson once wrote,"" determine never ...",0
18670,baseball is a beloved sport that has been play...,1
167,using technology to read the emotional express...,0
8373,most people are familiar with the concept of t...,1
4275,the facial action coding sytem is not valuable...,0
...,...,...
22950,can a computer system percieve emotions of hum...,0
8087,"in recent years, many states have passed laws ...",0
16738,texting and driving has been the cause of so m...,0
10163,"phones and driving\n\nhmmm, should teenagers d...",0


In [44]:
shuffled_combined_chin_df['text'] = shuffled_combined_chin_df['Chinese']

In [45]:
combined_df = pd.concat([sampled_eng_data, shuffled_combined_chin_df], ignore_index=True)

In [46]:
combined_df

Unnamed: 0,text,generated,Chinese
0,"thomas jefferson once wrote,"" determine never ...",0,
1,baseball is a beloved sport that has been play...,1,
2,using technology to read the emotional express...,0,
3,most people are familiar with the concept of t...,1,
4,the facial action coding sytem is not valuable...,0,
...,...,...,...
395,战争 还 没 结束 。,0,战争 还 没 结束 。
396,下午 的 阳光 特别 温暖 ， 我 坐在 窗边 喝 了 一杯 咖啡 。 手边 是 一张 明信...,1,下午 的 阳光 特别 温暖 ， 我 坐在 窗边 喝 了 一杯 咖啡 。 手边 是 一张 明信...
397,仓鼠 每次 喝水 的 时候 都 会 把头 扬得 特别 高 ， 像是 在 表演 ， 超级 可爱 。,1,仓鼠 每次 喝水 的 时候 都 会 把头 扬得 特别 高 ， 像是 在 表演 ， 超级 可爱 。
398,他 很 容易 觉得 累 。,0,他 很 容易 觉得 累 。


In [47]:
vectorizer = TfidfVectorizer(max_features=1000)

features = vectorizer.fit_transform(combined_df['text'])

In [48]:
X_train, X_test, y_train, y_test = train_test_split(features, combined_df["generated"], test_size=0.2, random_state=42)

In [49]:
model = RandomForestClassifier(n_estimators=500, max_features='sqrt',  n_jobs=-1, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

In [50]:
y_pred = model.predict(X_test)

c_precision = precision_score(y_test, y_pred, average='weighted')
print(f'Precision: {c_precision}')

Precision: 0.8765664160401002


In [51]:
#test_model_on_input(model, vectorizer)

***Qualitative Analysis Sample Selection***

In [52]:
# Separate AI and human samples
ai_samples = combined_df[combined_df['generated'] == 1]
human_samples = combined_df[combined_df['generated'] == 0]

In [53]:
# Randomly sample 5 from each
ai_sampled = ai_samples.sample(n=5, random_state=421)
human_sampled = human_samples.sample(n=5, random_state=421)

In [54]:
# Combine and shuffle
combined_sampled = pd.concat([ai_sampled, human_sampled])
shuffled_sampled = combined_sampled.sample(frac=1, random_state=421).reset_index(drop=True)

In [55]:
# Save text samples for analysis
shuffled_sampled[['text']].to_csv('qualitative_analysis_samples.csv', index=False)

In [56]:
# Save true labels for validation
shuffled_sampled[['generated']].to_csv('qualitative_analysis_labels.csv', index=False)

In [57]:
pd.set_option('display.max_rows', 100)  # Adjust the number of rows shown
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)

In [58]:
print(shuffled_sampled['text'].to_string())

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

This was then copied to word to perform the qualitative analysis

In [59]:
#This code was ran after the predictions were written
print(shuffled_sampled['generated'])

0    0
1    1
2    1
3    0
4    1
5    1
6    1
7    0
8    0
9    0
Name: generated, dtype: int64
