In [None]:
!pip install ijson tqdm


Collecting ijson
  Downloading ijson-3.4.0.post0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (23 kB)
Downloading ijson-3.4.0.post0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (149 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/149.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.0/149.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.4.0.post0


In [None]:
from itertools import islice

import ijson
import pandas as pd
from tqdm import tqdm

# Upper bound on parsed users so the in-memory row list stays small (safe in Colab).
MAX_USERS = 100_000

rows = []

# ijson parses bytes: a text-mode handle forces it to re-encode every chunk and
# triggers a deprecation warning, so the file is opened in binary mode.
with open("/user.json", "rb") as f:
    # Lazily yields each element of the top-level JSON array; the file is never
    # loaded into memory as a whole.
    users = ijson.items(f, "item")

    # islice stops after MAX_USERS records without consuming the rest of the
    # stream; `total` lets tqdm render a real progress bar.
    for user in tqdm(islice(users, MAX_USERS), total=MAX_USERS, desc="Reading users"):
        # `or {}` covers both a missing key and an explicit JSON null.
        metrics = user.get("public_metrics") or {}

        rows.append({
            "id": user.get("id"),
            # Missing or null counters fall back to 0.
            "followers_count": metrics.get("followers_count") or 0,
            "following_count": metrics.get("following_count") or 0,
            # The source field is called tweet_count; it is exposed as statuses_count.
            "statuses_count": metrics.get("tweet_count") or 0,
            # bool() first so a null `verified` becomes 0 instead of raising in int().
            "verified": int(bool(user.get("verified"))),
        })

df_users = pd.DataFrame(rows)
df_users.head()



Reading users: 99999it [00:01, 55015.33it/s]


Unnamed: 0,id,followers_count,following_count,statuses_count,verified
0,u1217628182611927040,7316,215,3098,0
1,u2664730894,123,1090,1823,0
2,u1266703520205549568,3,62,66,0
3,u1089159225148882949,350,577,237,0
4,u36741729,240,297,3713,0


In [None]:
# Write the per-user feature table to the working directory.
# index=False keeps the positional row index out of the file.
df_users.to_csv(path_or_buf="twibot22_behavioral_colab.csv", index=False)
print("Saved twibot22_behavioral_colab.csv")


Saved twibot22_behavioral_colab.csv


In [None]:
import pandas as pd

labels = pd.read_csv("/label.csv")

# Series.map turns every value that is not a key of the dict into NaN without
# complaint, which would leak NaN targets into training. Fail loudly instead.
label_codes = labels["label"].map({"bot": 1, "human": 0})
unmapped = label_codes.isna()
if unmapped.any():
    unexpected = sorted(labels.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(f"Unexpected label values in /label.csv: {unexpected}")
# With no NaN left the column can be a proper integer target (1 = bot, 0 = human).
labels["label"] = label_codes.astype("int64")

# Inner join keeps only users that have a label. validate="many_to_one" raises
# if an id occurs more than once in the label file, which would otherwise
# silently duplicate user rows.
df = df_users.merge(labels, on="id", how="inner", validate="many_to_one")

print("Final dataset shape:", df.shape)
df.head()


Final dataset shape: (100000, 6)


Unnamed: 0,id,followers_count,following_count,statuses_count,verified,label
0,u1217628182611927040,7316,215,3098,0,0
1,u2664730894,123,1090,1823,0,0
2,u1266703520205549568,3,62,66,0,0
3,u1089159225148882949,350,577,237,0,0
4,u36741729,240,297,3713,0,1


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Feature matrix: the four behavioural columns; target: the 0/1 label column.
X = df[["followers_count", "following_count", "statuses_count", "verified"]]
y = df["label"]

# The classes are heavily imbalanced (the recorded test split has 18686 samples
# of class 0 vs 1314 of class 1), so the split is stratified on y to keep the
# same class ratio in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# n_jobs=-1 builds the trees on all available cores; random_state keeps the
# forest reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# NOTE(review): judge this report by the class-1 row, not by accuracy — with
# this imbalance a model that almost never predicts class 1 still scores ~0.93.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      0.99      0.96     18686
           1       0.32      0.07      0.12      1314

    accuracy                           0.93     20000
   macro avg       0.63      0.53      0.54     20000
weighted avg       0.90      0.93      0.91     20000



In [None]:
import pandas as pd

# One hand-written profile, laid out with the same four columns (in the same
# order) that the model was fitted on.
custom_user = pd.DataFrame({
    "followers_count": [10],
    "following_count": [2000],
    "statuses_count": [5000],
    "verified": [0],
})

# predict returns one class per input row; there is exactly one row here.
pred = model.predict(custom_user)

if pred[0] == 1:
    print("Prediction:", "BOT")
else:
    print("Prediction:", "HUMAN")


Prediction: HUMAN
