In [1]:
# Libraries
import datetime as dt
import glob
import re
from pathlib import Path

import numpy as np
import pandas as pd
from tensorflow.keras import models, preprocessing #, layers, callbacks

In [2]:
# Collect the per-day CSV exports of tweets gathered with the Twitter API.
# glob returns matches in an arbitrary, filesystem-dependent order; sorting makes
# the row order of the combined frame (and any positional lookup further down)
# reproducible. The file names carry an ISO date, so sorted order is chronological.
path = 'data/clean_tweets/'
files_path = path + '*.csv'
files = sorted(glob.glob(files_path))

In [3]:
# Display the list of CSV paths matched by the glob pattern
files

['data/clean_tweets\\clean_tweets_2021-11-21.csv',
 'data/clean_tweets\\clean_tweets_2021-11-22.csv',
 'data/clean_tweets\\clean_tweets_2021-11-23.csv',
 'data/clean_tweets\\clean_tweets_2021-11-24.csv',
 'data/clean_tweets\\clean_tweets_2021-11-25.csv',
 'data/clean_tweets\\clean_tweets_2021-11-26.csv',
 'data/clean_tweets\\clean_tweets_2021-11-27.csv',
 'data/clean_tweets\\clean_tweets_2021-11-28.csv',
 'data/clean_tweets\\clean_tweets_2021-11-29.csv',
 'data/clean_tweets\\clean_tweets_2021-11-30.csv']

In [4]:
# Read every daily CSV once and stack them into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and it
# re-copied the accumulated frame on each iteration; a single pd.concat over the
# list of frames is the supported replacement. ignore_index=True yields the same
# fresh 0..n-1 index that reset_index(drop=True) produced before.
if not files:
    # Fail with an explicit message instead of a KeyError on "tweet_id" below.
    raise FileNotFoundError(f"No CSV files matched {files_path!r}")

daily_frames = [pd.read_csv(file) for file in files]
tweets_api = pd.concat(daily_frames, ignore_index = True)

# tweet ids: from int to str
tweets_api["tweet_id"] = tweets_api["tweet_id"].astype(str)

print(tweets_api.shape)
tweets_api.head(2)

(8068, 7)


Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet
0,2021-11-21 20:17:03+00:00,Symptomatic COVID-19 testing should be free an...,en,1466139300404375576,False,,"['free', 'covid', 'drug', 'safe', 'ontarians',..."
1,2021-11-21 20:17:03+00:00,https://t.co/e4JoW7aQGx\nVaccines failed to st...,en,1466139299993301003,False,,"['people', '4', '60+', '89', '3', '16', 'covid..."


In [5]:
# Turn the stringified token lists stored in `clean_tweet` (e.g. "['free', 'covid']")
# back into Python lists of tokens.
def _to_token_list(raw):
    """Convert one `clean_tweet` cell into a list of token strings.

    Strips the surrounding square brackets, splits on commas and removes all
    single and double quote characters from every piece.

    A value that is already a list is returned unchanged, so re-running this
    cell does not fail on the lists produced by a previous run.

    NOTE(review): splitting on "," keeps the blank that follows each comma, so
    every token after the first carries a leading space (" covid"). This is
    left untouched so the token strings stay exactly as before; confirm it
    matches the preprocessing used when the model was trained.
    """
    if isinstance(raw, list):
        return raw
    inner = raw.lstrip("[").rstrip("]")
    return [piece.replace("'", "").replace('"', "") for piece in inner.split(",")]

tweets_api["clean_tweet"] = tweets_api["clean_tweet"].apply(_to_token_list)
tweets_api.head(2)

Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet
0,2021-11-21 20:17:03+00:00,Symptomatic COVID-19 testing should be free an...,en,1466139300404375576,False,,"[free, covid, drug, safe, ontarians, 19, ..."
1,2021-11-21 20:17:03+00:00,https://t.co/e4JoW7aQGx\nVaccines failed to st...,en,1466139299993301003,False,,"[people, 4, 60+, 89, 3, 16, covid, 7, ..."


In [6]:
# Build the tokenizer that maps tokens to integer ids.
# num_words = 500000 appears to match the vocabulary size of the model's embedding
# layer (5,000,000 parameters at 10 dimensions per token).
# The filter characters are written as a raw string: in a normal string literal
# `\]` is an invalid escape sequence, which recent Python versions warn about.
# The runtime value (backslash followed by `]`) is unchanged.
#
# NOTE(review): the tokenizer is fitted here on the tweets being scored. The
# word -> id mapping therefore comes from this data, not from the data the saved
# model was trained on. Unless both happen to produce the same vocabulary order,
# the ids fed to the embedding layer will not mean what they meant in training.
# Presumably the tokenizer fitted at training time should be persisted and
# loaded here instead - confirm against the training notebook.
tokenizer = preprocessing.text.Tokenizer(
    num_words = 500000,
    filters = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower = True,
)

# Update the internal vocabulary from the tweets.
# Each entry of `clean_tweet` is a list, so every element of that list is
# taken to be one token.
tokenizer.fit_on_texts(tweets_api['clean_tweet'].values)

In [7]:
# Encode every tweet's token list as integer ids, then pad each sequence
# to 392 entries - the sequence length shown for the model's embedding layer.
X = preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(tweets_api["clean_tweet"].values),
    maxlen = 392,
)

In [8]:
# Restore the previously trained Keras model from its HDF5 file
model_file = "trained_model/tf_keras_model.h5"
keras_model = models.load_model(model_file)

In [9]:
# Print the layers, output shapes and parameter counts of the loaded model
keras_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 392, 10)           5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 392, 10)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 5,044,501
Trainable params: 5,044,501
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Score every padded sequence with the model, then round the scores
# to the nearest integer to obtain a 0/1 label per tweet
y_pred_scores = keras_model.predict(X)
y_pred = y_pred_scores.round().astype(int)

In [11]:
# Attach the model output to the tweets: the rounded label and the raw score
for column, values in (("toxic", y_pred), ("pred_scores", y_pred_scores)):
    tweets_api[column] = values

print(tweets_api.shape)
tweets_api.head(2)

(8068, 9)


Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet,toxic,pred_scores
0,2021-11-21 20:17:03+00:00,Symptomatic COVID-19 testing should be free an...,en,1466139300404375576,False,,"[free, covid, drug, safe, ontarians, 19, ...",0,0.01837
1,2021-11-21 20:17:03+00:00,https://t.co/e4JoW7aQGx\nVaccines failed to st...,en,1466139299993301003,False,,"[people, 4, 60+, 89, 3, 16, covid, 7, ...",0,0.485414


In [12]:
# Show only the tweets the model labelled as toxic
is_toxic = tweets_api["toxic"] == 1
tweets_api[is_toxic]

Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet,toxic,pred_scores
4,2021-11-21 20:17:03+00:00,@maria11terra Porque não? Lar e COVID?,pt,1466139299259330569,False,replied_to,"[lar, porque, nao, ?, e, covid]",1,0.931032
8,2021-11-21 20:17:02+00:00,@larrymagid Follow-up on email: the ultimate e...,en,1466139298386923531,False,,"[dten, device, ready, new, study, demo, ...",1,0.874902
9,2021-11-21 20:17:02+00:00,Omarion Covid strand just hit the US. First ca...,en,1466139298239901701,False,,"[hit, first, case, omarion, cali, covid, ...",1,0.994585
10,2021-11-21 20:17:02+00:00,@RafiPriv I got COVID a year ago and mum was h...,en,1466139297921355782,False,replied_to,"[ago, covid, heal, got, strong, go, year...",1,0.763937
12,2021-11-21 20:17:02+00:00,"@Furchur1 Teď už jen přežít 🐉, získat peníz...",cs,1466139296524640256,False,replied_to,"[penize, info, ted, dan, mezd, 2021, ned...",1,0.690856
...,...,...,...,...,...,...,...,...,...
8021,2021-11-30 21:53:31+00:00,@MedsPatentPool @WHO Proud of India's role in ...,en,1466163575873949700,False,replied_to,"[s, similar, helped, !, indian, pandemic,...",1,0.581741
8033,2021-11-30 21:53:28+00:00,First U.S. omicron patient was fully vaccinate...,en,1466163566663421952,False,,"[official, say, first, omicron, mild, cov...",1,0.599784
8042,2021-11-30 21:53:27+00:00,@MichaelPSenger Covid has brought out people's...,en,1466163560615133193,False,replied_to,"[s, people, true, covid, self, brought]",1,0.897344
8060,2021-11-30 21:53:24+00:00,@dgodin1234 You're free to get and spread covi...,en,1466163549353521159,False,replied_to,"[violence, shot, get, free, covid, gun, ...",1,0.656391


In [13]:
# Spot check of one flagged tweet: its full text does seem toxic
print(tweets_api["text"].iloc[8060])

@dgodin1234 You're free to get and spread covid and get shot in gun violence...🤷‍♀️


In [14]:
# Spot check: print the full text of the tweet at position 8066
print(tweets_api["text"].iloc[8066])

Covid documentaries and movies will come, and I hate that xD


In [15]:
# Persist the tweets together with their predictions.
# to_csv does not create missing folders, so make sure the output directory
# exists first (it may be absent on a fresh checkout).
preds_file = Path("trained_model/preds/tweets_preds.csv")
preds_file.parent.mkdir(parents = True, exist_ok = True)
tweets_api.to_csv(preds_file, index = False)