# Import libraries

In [1]:
import os
import datetime
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
import pickle
import datetime

# Run script to select our tweet features and write to the tweet_features file

Note that the features we are using are
- tweet id
- user id
- user account age
- favourites count
- hashtag count
- text length

In [2]:
import datetime
import json
import os

from tool.TwitterOriginal import Tweet

start_date = "20201117"
end_date = "20210521"
start_date_datetime = datetime.datetime.strptime(start_date, "%Y%m%d")
end_date_datetime = datetime.datetime.strptime(end_date, "%Y%m%d")
proc_date = start_date_datetime
duration = 300  # t

data_check_list = os.listdir("Data/")
data_check_dic = {i: 1 for i in data_check_list}

for _ in range(duration):
    # process the data in this date
    proc_date_str = proc_date.strftime("%Y-%m-%d")

    input_data_folder_path = "Data/" + proc_date_str + "/"
    output_data_folder_path = "Tmp/" + proc_date_str + "/"

    if not proc_date_str in data_check_dic.keys():
        proc_date = proc_date + datetime.timedelta(days=1)
        if proc_date == end_date_datetime:
            break
        continue

    if not os.path.exists(output_data_folder_path):
        os.makedirs(output_data_folder_path)

    output_data_file = output_data_folder_path + "tweet_feature"
    with open(output_data_file, "w", encoding="utf-8") as file_out:

        for filename in os.listdir(input_data_folder_path):
            input_data_path = input_data_folder_path + filename

            with open(
                input_data_path, "r", encoding="utf-8", errors="ignore"
            ) as file_in:

                for line in file_in:
                    try:
                        tweet = json.loads(line)
                        tweet_obj = Tweet(tweet)
                    except:
                        print(tweet)
                        continue

                    if not tweet_obj.is_en():
                        continue

                    tweet_id = tweet_obj.get_id()
                    user_id = tweet_obj.user.id_str
                    feature_list = tweet_obj.get_tweet_features()
                    file_out.write(tweet_id + "\t" + user_id + "\t")

                    for f in feature_list:
                        file_out.write(str(f))
                        file_out.write("\t")
                    file_out.write("\n")
                    file_out.flush()

    proc_date = proc_date + datetime.timedelta(days=1)
    if proc_date == end_date_datetime:
        break


# Create our training and testing data sets

In [3]:
start_date = "20201117"
end_date = "20210520"
start_date_datetime = datetime.datetime.strptime(start_date, "%Y%m%d")
end_date_datetime = datetime.datetime.strptime(end_date, "%Y%m%d")
proc_date = start_date_datetime
duration = 300  # t

data_check_list = os.listdir("Data/")
data_check_dic = {i: 1 for i in data_check_list}


dataset_X = []
dataset_y = []

for _ in range(duration):
    # process the data in this date
    proc_date_str = proc_date.strftime("%Y-%m-%d")

    input_data_tmp_path = "Tmp/" + proc_date_str + "/"
    input_data_label_path = "Label/" + proc_date_str + "/"
    output_data_folder_path = "Label/All/"
    output_model_folder_path = "Model/"

    if not proc_date_str in data_check_dic.keys():
        proc_date = proc_date + datetime.timedelta(days=1)
        if proc_date == end_date_datetime:
            break
        continue

    if not os.path.exists(output_data_folder_path):
        os.makedirs(output_data_folder_path)

    if not os.path.exists(output_model_folder_path):
        os.makedirs(output_model_folder_path)

    input_data_feature = input_data_tmp_path + "tweet_feature"
    input_data_label = input_data_label_path + "labeled_tweets.txt"

    label_dic = {}

    with open(
        input_data_label, "r", encoding="utf-8", errors="ignore"
    ) as file_label_in:
        for label_line in file_label_in:
            label_line_split = label_line.strip().split("\t")
            label_tweet_id = label_line_split[0]
            label_dic[label_tweet_id] = int(label_line_split[-1])

    with open(
        input_data_feature, "r", encoding="utf-8", errors="ignore"
    ) as file_feature_in:
        for feature_line in file_feature_in:
            feature_line_split = feature_line.strip().split("\t")
            feature_tweet_id = feature_line_split[0]
            if feature_tweet_id in label_dic.keys():
                dataset_X.append([float(i) for i in feature_line_split[2:-1]])
                dataset_y.append(label_dic[feature_tweet_id])
    #     print(label_dic)

    proc_date = proc_date + datetime.timedelta(days=1)
    if proc_date == end_date_datetime:
        break

# SVM

In [28]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [29]:
# Split data
dataset_X = np.array(dataset_X)
dataset_y = np.array(dataset_y)

scaler = MinMaxScaler()
dataset_X = scaler.fit_transform(dataset_X)

data_train, data_test, labels_train, labels_test = train_test_split(
    dataset_X, dataset_y, test_size=0.20, random_state=42
)

In [30]:
# Initialize the SVM classifier
model = SVC()

# Fit the classifier to your data
model.fit(data_train, labels_train)

y_pred = model.predict(data_test)
print(y_pred)

# Calculate and print the training accuracy
train_accuracy = model.score(data_train, labels_train)
print("Training accuracy:", train_accuracy)

# Calculate test accuracy
test_accuracy = model.score(data_test, labels_test)
print("Test accuracy:", test_accuracy)

# Save the trained SVM model
save_model_path = output_model_folder_path + "svm_model.p"
pickle.dump(model, open(save_model_path, "wb"))

# Save the dataset
save_data_path = output_data_folder_path + "dataset.p"
pickle.dump([data_train, labels_train], open(save_data_path, "wb"))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Training accuracy: 0.7169811320754716
Test accuracy: 0.7634408602150538


In [7]:
output_data_folder_path = "Label/All/"
output_model_folder_path = "Model/"

load_model_path = output_model_folder_path + "svm_model.p"
loaded_model = pickle.load(open(load_model_path, "rb"))

load_data_path = output_data_folder_path + "dataset.p"
dataset_X, dataset_y = pickle.load(open(load_data_path, "rb"))

result = loaded_model.score(data_test, labels_test)

print(result)

0.7948717948717948


In [8]:
print("Test Precision Score:", precision_score(labels_test, y_pred, average=None, zero_division=1))
print("Test Recall Score:", recall_score(labels_test, y_pred, average=None))

Test Precision Score: [0.79487179 1.        ]
Test Recall Score: [1. 0.]


In [9]:
print(classification_report(labels_test, y_pred, target_names=['not spam', 'spam'], zero_division=np.nan))

              precision    recall  f1-score   support

    not spam       0.79      1.00      0.89        93
        spam        nan      0.00      0.00        24

    accuracy                           0.79       117
   macro avg       0.79      0.50      0.44       117
weighted avg       0.79      0.79      0.70       117



# Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [11]:
clf = LogisticRegression(random_state=42).fit(data_train, labels_train)
y_pred = clf.predict(data_test)

In [12]:
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]


In [13]:
print(accuracy_score(labels_test, y_pred))

0.7948717948717948


In [14]:
# Calculate and print the training accuracy
train_accuracy = clf.score(data_train, labels_train)
print("Training accuracy:", train_accuracy)

# Calculate test accuracy
test_accuracy = clf.score(data_test, labels_test)
print("Test accuracy:", test_accuracy)

Training accuracy: 0.7241379310344828
Test accuracy: 0.7948717948717948


In [15]:
print(classification_report(labels_test, y_pred, target_names=['not spam', 'spam'], zero_division=np.nan))

              precision    recall  f1-score   support

    not spam       0.79      1.00      0.89        93
        spam        nan      0.00      0.00        24

    accuracy                           0.79       117
   macro avg       0.79      0.50      0.44       117
weighted avg       0.79      0.79      0.70       117



# Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rfc = RandomForestClassifier(max_depth=5, random_state=0)
rfc.fit(data_train, labels_train)
y_pred = rfc.predict(data_test)

In [18]:
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]


In [19]:
print(accuracy_score(labels_test, y_pred))

0.7948717948717948


In [20]:
# Calculate and print the training accuracy
train_accuracy = rfc.score(data_train, labels_train)
print("Training accuracy:", train_accuracy)

# Calculate test accuracy
test_accuracy = rfc.score(data_test, labels_test)
print("Test accuracy:", test_accuracy)

Training accuracy: 0.7607758620689655
Test accuracy: 0.7948717948717948


In [21]:
print(classification_report(labels_test, y_pred, target_names=['not spam', 'spam'], zero_division=np.nan))

              precision    recall  f1-score   support

    not spam       0.79      1.00      0.89        93
        spam        nan      0.00      0.00        24

    accuracy                           0.79       117
   macro avg       0.79      0.50      0.44       117
weighted avg       0.79      0.79      0.70       117



# Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(data_train, labels_train)
y_pred = dtc.predict(data_test)

In [24]:
print(y_pred)

[1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0]


In [25]:
print(accuracy_score(labels_test, y_pred))

0.6581196581196581


In [26]:
# Calculate and print the training accuracy
train_accuracy = dtc.score(data_train, labels_train)
print("Training accuracy:", train_accuracy)

# Calculate test accuracy
test_accuracy = dtc.score(data_test, labels_test)
print("Test accuracy:", test_accuracy)

Training accuracy: 1.0
Test accuracy: 0.6581196581196581


In [27]:
print(classification_report(labels_test, y_pred, target_names=['not spam', 'spam'], zero_division=np.nan))

              precision    recall  f1-score   support

    not spam       0.80      0.76      0.78        93
        spam       0.21      0.25      0.23        24

    accuracy                           0.66       117
   macro avg       0.51      0.51      0.51       117
weighted avg       0.68      0.66      0.67       117

