In [22]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from sklearn_model import testSKLearnModel
from tokenize_tweets import tokenize_tweets_dataset


In [23]:
# Project root. Defaults to the original developer location; set the
# AI_DETECTOR_ROOT environment variable to run the notebook on another machine.
project_root = os.environ.get("AI_DETECTOR_ROOT", "/Users/alalousis/PycharmProjects/AI_detector")
datasets_folder = os.path.join(project_root, "datasets")
results_folder = os.path.join(project_root, "results")
real_tweets_filename = "real_dataset.csv"
ai_tweets_filename = "ai_dataset_10000.csv"


def load_tweets(filename, text_column_index, label):
    """Load the tweet-text column of one CSV and tag every row with its author type.

    Parameters
    ----------
    filename : str
        CSV file name, resolved relative to ``datasets_folder``.
    text_column_index : int
        0-based position of the column to keep (the only column that is read).
    label : str
        Value written to the new ``created_by`` column for every row.

    Returns
    -------
    pandas.DataFrame
        The selected column plus a constant ``created_by`` column.
    """
    tweets_df = pd.read_csv(
        os.path.join(datasets_folder, filename),
        encoding="UTF-8",
        header=0,
        usecols=[text_column_index],
    )
    tweets_df["created_by"] = label
    return tweets_df


# Human-written tweets: only column 9 of the raw export is kept.
real_tweets_df = load_tweets(real_tweets_filename, 9, "human")
real_tweets_df.info()

# AI-generated tweets: only column 1 is kept.
ai_tweets_df = load_tweets(ai_tweets_filename, 1, "bot")
ai_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238646 entries, 0 to 238645
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   text        238646 non-null  object
 1   created_by  238646 non-null  object
dtypes: object(2)
memory usage: 3.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        10000 non-null  object
 1   created_by  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [24]:
# Working set: the first 8,000 human tweets followed by the first 2,000 AI
# tweets (a 4:1 human/bot ratio), renumbered with a fresh 0..N-1 index.
main_dataset = pd.concat(
    (real_tweets_df.head(8000), ai_tweets_df.head(2000)),
    ignore_index=True,
)

In [25]:
# Preview the concatenated frame (human rows first, bot rows last).
main_dataset

Unnamed: 0,text,created_by
0,US says seizes tanker used to evade North Kore...,human
1,Is America the greatest country? Part 1 drops ...,human
2,O’Reilly predicts THIS is What DOOMS Biden’s r...,human
3,Corruption in Politics\n#blog #politics \nhttp...,human
4,@kylegriffin1 Are you getting all the info on ...,human
...,...,...
9995,"""The future of our country belongs to its chil...",bot
9996,"""It's time for real change! As your leader, I ...",bot
9997,"""Let's put people over politics! As your repre...",bot
9998,"""Protecting our nation's infrastructure means ...",bot


In [26]:
# Shuffle all rows so the human and bot blocks from the concatenation are
# interleaved, then renumber the index. The fixed random_state makes the
# shuffle, and every split derived from it, identical on each
# Restart & Run All.
# NOTE: re-running this cell on its own shuffles the already-shuffled frame.
main_dataset = main_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [27]:
# Preview the shuffled frame; human and bot rows should now be mixed.
main_dataset

Unnamed: 0,text,created_by
0,US Government Subsidies Boost the Expected Pro...,human
1,Talking Headways Podcast: No More Transit Hung...,human
2,https://t.co/UcoFsOGkwb🏴‍☠️ NHS app offers Cov...,human
3,"""Enough is enough! Our communities deserve bet...",bot
4,"""Time for a change! Let's prioritize education...",bot
...,...,...
9995,"We can split of amicably, or some of these idi...",human
9996,"""We need a new era of leadership that puts peo...",bot
9997,Fewer infections mean fewer chances to mutate....,human
9998,Via @RawStory: DC cop reveals the FOP cop unio...,human


In [28]:
# Stratified 70 / 15 / 15 split: hold out 30% of the rows first, then cut that
# hold-out in half to get the validation and test sets. Stratifying on
# `created_by` keeps the human/bot ratio the same in every split.
# `train_test_split` is imported in the notebook's top import cell.
train_dataset, temp_dataset = train_test_split(
    main_dataset,
    test_size=0.3,
    stratify=main_dataset["created_by"],
    random_state=42,
)
val_dataset, test_dataset = train_test_split(
    temp_dataset,
    test_size=0.5,
    stratify=temp_dataset["created_by"],
    random_state=42,
)

In [29]:
# Give each split a clean 0..N-1 index; the old (shuffled) index is discarded.
train_dataset, val_dataset, test_dataset = (
    split.reset_index(drop=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [30]:
# Render the three splits one after another: train, validation, test.
display(train_dataset, val_dataset, test_dataset)

Unnamed: 0,text,created_by
0,"""Time for change! As your representative, I'll...",bot
1,Such comments cannot be tolerated \nPersonal a...,human
2,#News #Politics: Austin on a strategic mission...,human
3,"Neil Chatterjee, a departing member of #FERC, ...",human
4,"""Today, we need a government that puts people ...",bot
...,...,...
6995,If the U.S. does not get on board with doing s...,human
6996,so here is reality.\n#ClimateEmergency #Climat...,human
6997,Basavaraj Bommai is the second non-RSS BJP CM ...,human
6998,75 percent of new Covid cases in Singapore are...,human


Unnamed: 0,text,created_by
0,@HillaryWarnedUs I hope knot 🪢\n\nBut corporat...,human
1,"""Investing in our nation's future means invest...",bot
2,"""Climate change is not just an environmental i...",bot
3,Robin DiAngelo Says Comedy Gives Whites ‘An Ex...,human
4,APC reaffirms July 31 for nationwide ward cong...,human
...,...,...
1495,"""Let's put people over politics! Our community...",bot
1496,"Mike Gonzalez: Critical race theory, Team Bide...",human
1497,Motorola Edge 20 Pro gets portrayed in officia...,human
1498,Germany imposes stricter entry rules for arriv...,human


Unnamed: 0,text,created_by
0,Congress Fears U.S. Intelligence Leaks in Saud...,human
1,If you are called Bhakt or Liberandu even once...,human
2,"""A couple of weeks ago, KP Oli revived the par...",human
3,CRT represents Revolt of the Occupational Ruli...,human
4,The China Model Will Never Work in Iran \n\nFo...,human
...,...,...
1495,#Leftists are out to rename all sorts things o...,human
1496,A Divided Peru Inaugurates a New President \n\...,human
1497,Amy Stillman/Bloomberg -‘Shameful' Moody's dec...,human
1498,Mercari! Vintage Yiddish Birthday Card https:/...,human


In [31]:
# Run the project tokenizer over each split, in train / val / test order.
# The returned frames carry a `text_tokenized` column that later cells read.
train_dataset_tokenized, val_dataset_tokenized, test_dataset_tokenized = map(
    tokenize_tweets_dataset,
    (train_dataset, val_dataset, test_dataset),
)

In [32]:
# Render the tokenized splits one after another: train, validation, test.
display(train_dataset_tokenized, val_dataset_tokenized, test_dataset_tokenized)

Unnamed: 0,text,created_by,text_tokenized
0,"""Time for change! As your representative, I'll...",bot,"["", time, for, change, !, as, your, representa..."
1,Such comments cannot be tolerated \nPersonal a...,human,"[such, comments, cannot, be, tolerated, person..."
2,#News #Politics: Austin on a strategic mission...,human,"[:, austin, on, a, strategic, mission, in, se,..."
3,"Neil Chatterjee, a departing member of #FERC, ...",human,"[neil, chatterjee, ,, a, departing, member, of..."
4,"""Today, we need a government that puts people ...",bot,"["", today, ,, we, need, a, government, that, p..."
...,...,...,...
6995,If the U.S. does not get on board with doing s...,human,"[if, the, u, ., s, ., does, not, get, on, boar..."
6996,so here is reality.\n#ClimateEmergency #Climat...,human,"[so, here, is, reality, ., __user_mention__, (..."
6997,Basavaraj Bommai is the second non-RSS BJP CM ...,human,"[basavaraj, bommai, is, the, second, non-rss, ..."
6998,75 percent of new Covid cases in Singapore are...,human,"[75, percent, of, new, covid, cases, in, singa..."


Unnamed: 0,text,created_by,text_tokenized
0,@HillaryWarnedUs I hope knot 🪢\n\nBut corporat...,human,"[__user_mention__, i, hope, knot, 🪢, but, corp..."
1,"""Investing in our nation's future means invest...",bot,"["", investing, in, our, nation's, future, mean..."
2,"""Climate change is not just an environmental i...",bot,"["", climate, change, is, not, just, an, enviro..."
3,Robin DiAngelo Says Comedy Gives Whites ‘An Ex...,human,"[robin, diangelo, says, comedy, gives, whites,..."
4,APC reaffirms July 31 for nationwide ward cong...,human,"[apc, reaffirms, july, 31, for, nationwide, wa..."
...,...,...,...
1495,"""Let's put people over politics! Our community...",bot,"["", let's, put, people, over, politics, !, our..."
1496,"Mike Gonzalez: Critical race theory, Team Bide...",human,"[mike, gonzalez, :, critical, race, theory, ,,..."
1497,Motorola Edge 20 Pro gets portrayed in officia...,human,"[motorola, edge, 20, pro, gets, portrayed, in,..."
1498,Germany imposes stricter entry rules for arriv...,human,"[germany, imposes, stricter, entry, rules, for..."


Unnamed: 0,text,created_by,text_tokenized
0,Congress Fears U.S. Intelligence Leaks in Saud...,human,"[congress, fears, u, ., s, ., intelligence, le..."
1,If you are called Bhakt or Liberandu even once...,human,"[if, you, are, called, bhakt, or, liberandu, e..."
2,"""A couple of weeks ago, KP Oli revived the par...",human,"["", a, couple, of, weeks, ago, ,, kp, oli, rev..."
3,CRT represents Revolt of the Occupational Ruli...,human,"[crt, represents, revolt, of, the, occupationa..."
4,The China Model Will Never Work in Iran \n\nFo...,human,"[the, china, model, will, never, work, in, ira..."
...,...,...,...
1495,#Leftists are out to rename all sorts things o...,human,"[are, out, to, rename, all, sorts, things, or,..."
1496,A Divided Peru Inaugurates a New President \n\...,human,"[a, divided, peru, inaugurates, a, new, presid..."
1497,Amy Stillman/Bloomberg -‘Shameful' Moody's dec...,human,"[amy, stillman, /, bloomberg, -, ‘, shameful, ..."
1498,Mercari! Vintage Yiddish Birthday Card https:/...,human,"[mercari, !, vintage, yiddish, birthday, card,..."


In [33]:
# Label encoding shared by every split, plus its inverse for decoding predictions.
dictLabels = {"human": 0, "bot": 1}
dictLabelsReverse = {code: name for name, code in dictLabels.items()}

# Documents: each token list is re-joined into one space-separated string,
# which is the input format the vectorizer expects.
X_train = [" ".join(tokens) for tokens in train_dataset_tokenized["text_tokenized"]]
X_val = [" ".join(tokens) for tokens in val_dataset_tokenized["text_tokenized"]]
X_test = [" ".join(tokens) for tokens in test_dataset_tokenized["text_tokenized"]]

# Targets: integer codes; an unknown `created_by` value raises KeyError.
y_train = train_dataset_tokenized["created_by"].apply(dictLabels.__getitem__)
y_val = val_dataset_tokenized["created_by"].apply(dictLabels.__getitem__)
y_test = test_dataset_tokenized["created_by"].apply(dictLabels.__getitem__)

In [34]:
# Unigram TF-IDF with float32 weights, capped at 25,000 vocabulary entries.
# No custom tokenizer/analyzer is passed, so the vectorizer applies its own
# default tokenization to the space-joined strings.
vect = TfidfVectorizer(
    dtype=np.float32,
    max_features=25000,
    ngram_range=(1, 1),
)

# The vocabulary and IDF weights are learned from the training documents only;
# validation and test documents are projected onto that fitted vocabulary.
train_features = vect.fit_transform(X_train)
val_features, test_features = (vect.transform(docs) for docs in (X_val, X_test))

In [35]:
# Print the sparse TF-IDF row of the first training document
# (stored coordinates and their weights).
print(train_features[0])

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 23 stored elements and shape (1, 12914)>
  Coords	Values
  (0, 11579)	0.1549300253391266
  (0, 4900)	0.20889954268932343
  (0, 2414)	0.1870126873254776
  (0, 1249)	0.18590430915355682
  (0, 12795)	0.19758766889572144
  (0, 9727)	0.30679216980934143
  (0, 7010)	0.2698703110218048
  (0, 4749)	0.26119378209114075
  (0, 828)	0.21364359557628632
  (0, 5557)	0.1906144618988037
  (0, 9311)	0.21345254778862
  (0, 4110)	0.1800186038017273
  (0, 1040)	0.11653444170951843
  (0, 11052)	0.31789669394493103
  (0, 4090)	0.19913633167743683
  (0, 11466)	0.15765005350112915
  (0, 1654)	0.30093079805374146
  (0, 938)	0.14110144972801208
  (0, 6887)	0.13424521684646606
  (0, 2078)	0.18418070673942566
  (0, 2010)	0.1888391077518463
  (0, 5078)	0.14771471917629242
  (0, 11605)	0.17211146652698517


In [36]:
# Show the summary repr (dtype, stored elements, shape) of the validation matrix.
val_features

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 26148 stored elements and shape (1500, 12914)>

In [37]:
# Show the summary repr (dtype, stored elements, shape) of the test matrix.
test_features

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 26388 stored elements and shape (1500, 12914)>

In [38]:
# Sanity check: list the vocabulary terms of the first training document,
# highest TF-IDF weight first. Terms absent from the document have weight 0.
first_vector_tfidfvectorizer = train_features[0]

dfVec = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(),
    index=vect.get_feature_names_out(),
    columns=["tfidf"],
)
dfVec.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
strong,0.317897
representative,0.306792
benefits,0.300931
ll,0.269870
fight,0.261194
...,...
entitlement,0.000000
entrance,0.000000
entrepreneur,0.000000
entrepreneurs,0.000000


In [39]:
# Convert the label Series to plain Python lists for the evaluation helper.
train_labels, val_labels, test_labels = (
    labels.tolist() for labels in (y_train, y_val, y_test)
)

In [40]:

# Test SVC.
# Only a single linear-kernel candidate (C=1) is searched. A wider RBF grid
# (gamma in {1e-3, 1e-4}, C in {1, 10, 100, 1000}) is currently disabled.
tuned_parameters = [
    {"kernel": ["linear"], "C": [1]},
]
model = SVC()

# NOTE(review): a near-perfect score on this task is surprising; check for
# dataset artifacts that separate the two sources before trusting it.
y_pred_svc, best_params_svc = testSKLearnModel(
    model,
    tuned_parameters,
    train_features,
    val_features,
    test_features,
    train_labels,
    val_labels,
    test_labels,
)

Model tested: <class 'sklearn.svm._classes.SVC'>
Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}

Grid scores on development set:

0.998 (+/-0.000) for {'C': 1, 'kernel': 'linear'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0      1.000     1.000     1.000      1200
           1      1.000     1.000     1.000       300

    accuracy                          1.000      1500
   macro avg      1.000     1.000     1.000      1500
weighted avg      1.000     1.000     1.000      1500


y_pred: [0 0 0 ... 0 0 0]
y_true: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 

In [41]:
# Show the ground-truth (string) labels of the test split.
test_dataset["created_by"]

0       human
1       human
2       human
3       human
4       human
        ...  
1495    human
1496    human
1497    human
1498    human
1499    human
Name: created_by, Length: 1500, dtype: object

In [42]:
# Persist the SVC test-set predictions next to the ground-truth labels.
# `pd` and `os` come from the notebook's top import cell.

# Decode the integer predictions back to "human" / "bot".
predictionLabels = [dictLabelsReverse[t] for t in y_pred_svc]
dfResults = pd.DataFrame(predictionLabels, columns=["prediction"])

# Predictions are positional (row i of test_features), so attach the true
# labels by position rather than by index alignment. A length mismatch then
# raises instead of silently producing NaN rows.
dfResults["created_by"] = test_dataset["created_by"].to_numpy()
# dfResults["class_type"] = dfTestDataset[["class_type"]]

# Make sure the output directory exists before writing.
os.makedirs(results_folder, exist_ok=True)
# NOTE: the file is tab-separated despite its .csv extension; the name and
# separator are kept as-is for anything that reads this file later.
file_name = os.path.join(results_folder, "svc_bow.csv")
dfResults.to_csv(file_name, sep='\t', encoding='utf-8')