In [1]:
# Libraries
import datetime as dt
import glob
import re
from pathlib import Path

import numpy as np
import pandas as pd
from tensorflow.keras import models, preprocessing #, layers, callbacks

In [2]:
# Collect every .csv file under `path`.
# These files hold the tweets collected with the Twitter API.
path = 'data/clean_tweets/'
files_path = path + '*.csv'
# glob returns matches in arbitrary, OS-dependent order. Sorting makes the row
# order of the concatenated frame reproducible, which matters because later
# cells look rows up by hard-coded position.
files = sorted(glob.glob(files_path))

In [3]:
# Show the CSV paths that glob matched.
files

['data/clean_tweets\\clean_tweets_2021-11-21.csv',
 'data/clean_tweets\\clean_tweets_2021-11-22.csv',
 'data/clean_tweets\\clean_tweets_2021-11-23.csv',
 'data/clean_tweets\\clean_tweets_2021-11-24.csv',
 'data/clean_tweets\\clean_tweets_2021-11-25.csv',
 'data/clean_tweets\\clean_tweets_2021-11-26.csv',
 'data/clean_tweets\\clean_tweets_2021-11-27.csv',
 'data/clean_tweets\\clean_tweets_2021-11-28.csv',
 'data/clean_tweets\\clean_tweets_2021-11-29.csv',
 'data/clean_tweets\\clean_tweets_2021-11-30.csv',
 'data/clean_tweets\\clean_tweets_2021-12-01.csv']

In [4]:
# Load every daily CSV and stack them into one dataframe.
# Fail early with a clear message instead of a KeyError on "tweet_id" further down.
if not files:
    raise FileNotFoundError("No clean_tweets CSV files were found to load.")

# Read tweet_id as text from the start: the ids are 19-digit integers, so any
# detour through a float column (e.g. when a file has a missing id) would
# silently corrupt them.
daily_frames = [pd.read_csv(file, dtype = {"tweet_id": str}) for file in files]

# One concat instead of DataFrame.append in a loop (append was removed in
# pandas 2.0 and re-copies the accumulated frame on every iteration).
# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True).
tweets_api = pd.concat(daily_frames, ignore_index = True)

# Keep the column uniformly str, as before.
tweets_api["tweet_id"] = tweets_api["tweet_id"].astype(str)

print(tweets_api.shape)
tweets_api.head(2)

(8830, 7)


Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet
0,2021-11-21 20:17:03+00:00,Symptomatic COVID-19 testing should be free an...,en,1466139300404375576,False,,"['free', 'covid', 'drug', 'safe', 'ontarians',..."
1,2021-11-21 20:17:03+00:00,https://t.co/e4JoW7aQGx\nVaccines failed to st...,en,1466139299993301003,False,,"['people', '4', '60+', '89', '3', '16', 'covid..."


In [5]:
# Clean column
# Each entry is a stringified list such as "['free', 'covid', ...]"; turn it
# back into a Python list of tokens by removing [], splitting on commas and
# dropping both kinds of quotes.
# NOTE(review): splitting on "," keeps the space that follows each comma attached
# to the next token and would also split a token that itself contains a comma —
# confirm the training pipeline parsed this column the same way before changing it.
def _parse_token_list(raw):
    """Strip the outer brackets, split on commas and remove ' and " from each piece."""
    inner = raw.lstrip("[").rstrip("]")
    return [piece.replace("'", "").replace('"', "") for piece in inner.split(",")]

tweets_api["clean_tweet"] = tweets_api["clean_tweet"].apply(_parse_token_list)
tweets_api.head(2)

Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet
0,2021-11-21 20:17:03+00:00,Symptomatic COVID-19 testing should be free an...,en,1466139300404375576,False,,"[free, covid, drug, safe, ontarians, 19, ..."
1,2021-11-21 20:17:03+00:00,https://t.co/e4JoW7aQGx\nVaccines failed to st...,en,1466139299993301003,False,,"[people, 4, 60+, 89, 3, 16, covid, 7, ..."


In [6]:
# Create the tokenizer object.
# The filter characters are written as a raw string: the original literal contained
# `\]`, which is not a valid escape sequence and triggers a warning on modern
# Python. The runtime value is unchanged (the backslash is still part of the filter set).
tokenizer = preprocessing.text.Tokenizer(num_words = 500000,
                                         filters = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                                         lower = True)

# Update the internal vocabulary from the tweets.
# Each entry of clean_tweet is already a list, so every list element is taken as one token.
# NOTE(review): the vocabulary is fitted here on the tweets being scored, while the
# model is loaded from disk later in the notebook. The word -> id mapping built here
# presumably needs to match the one used during training — confirm, and load the
# training tokenizer instead if it was saved.
tokenizer.fit_on_texts(tweets_api['clean_tweet'].values)

In [7]:
# Encode each token list as a sequence of integer ids with the fitted tokenizer,
# then bring every sequence to length 392 (the sequence length shown in the
# summary of the model loaded further down).
token_id_sequences = tokenizer.texts_to_sequences(tweets_api["clean_tweet"].values)
X = preprocessing.sequence.pad_sequences(token_id_sequences, maxlen = 392)

In [8]:
# Load the model that was previously trained.
# The path is relative, so it is resolved against the notebook's working directory.
keras_model = models.load_model("trained_model/tf_keras_model.h5")

In [9]:
# Print the layers, output shapes and parameter counts of the loaded model.
keras_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 392, 10)           5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 392, 10)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 5,044,501
Trainable params: 5,044,501
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Predict
# Keep the raw model outputs, then derive integer labels by rounding each
# score to the nearest integer.
y_pred_scores = keras_model.predict(X)
y_pred = y_pred_scores.round().astype(int)

In [11]:
# Attach the model results as two new columns: the rounded label ("toxic")
# and the raw score ("pred_scores").
tweets_api = tweets_api.assign(toxic = y_pred, pred_scores = y_pred_scores)
print(tweets_api.shape)
tweets_api.head(2)

(8830, 9)


Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet,toxic,pred_scores
0,2021-11-21 20:17:03+00:00,Symptomatic COVID-19 testing should be free an...,en,1466139300404375576,False,,"[free, covid, drug, safe, ontarians, 19, ...",0,0.201645
1,2021-11-21 20:17:03+00:00,https://t.co/e4JoW7aQGx\nVaccines failed to st...,en,1466139299993301003,False,,"[people, 4, 60+, 89, 3, 16, covid, 7, ...",0,0.006692


In [12]:
# find toxic tweets: keep only the rows whose predicted label equals 1
tweets_api.loc[tweets_api["toxic"].eq(1)]

Unnamed: 0,created_at,text,lang,tweet_id,possibly_sensitive,type,clean_tweet,toxic,pred_scores
3,2021-11-21 20:17:03+00:00,An Israeli doctor says he believes he caught t...,en,1466139299263533060,False,,"[people, say, belief, doctor, london, var...",1,0.895560
6,2021-11-21 20:17:03+00:00,meu deus eu preciso fazer processos de emenda ...,pt,1466139299137695747,False,,"[de, emenda, meu, pra, e, fazer, prefeit...",1,0.998193
9,2021-11-21 20:17:02+00:00,Omarion Covid strand just hit the US. First ca...,en,1466139298239901701,False,,"[hit, first, case, omarion, cali, covid, ...",1,0.673810
11,2021-11-21 20:17:02+00:00,"Camas mal ocupadas, espero que pronto queden l...",es,1466139297569021961,False,,"[camas, queden, que, madrid, una, miembro...",1,0.630690
23,2021-11-21 20:17:01+00:00,@thearsenalhelen @jazz_sian COVID itself has a...,en,1466139291013160963,False,replied_to,"[cardiac, likelihood, treatment, sian, rec...",1,0.931584
...,...,...,...,...,...,...,...,...,...
8798,2021-12-01 06:16:43+00:00,◤全球大流行◢在抗议现场，还有理发师为未接种疫苗的抗议者义剪……\nhttps://t.co...,zh,1465927824208891905,False,,"[Xin, Zhou, Yi, Fei, Guan, Zhong, Zai, ...",1,0.943636
8808,2021-12-01 06:16:38+00:00,La justice namuroise constate l'illégalité du ...,fr,1465927800959946753,False,replied_to,"[la, lillegalite, en, appel, ticket, covi...",1,0.887801
8810,2021-12-01 06:16:37+00:00,Schweizer Spitäler müssen dringliche Krebsoper...,de,1465927797893963778,False,,"[patienten, ein, !, wovon, leute, ungeimp...",1,0.976746
8811,2021-12-01 06:16:37+00:00,@HealthMinistryM Our children are wanting to c...,en,1465927797466025986,False,,"[child, take, wanting, please, restricted,...",1,0.903671


In [13]:
# Seems toxic
# Print the raw text of the tweet at row position 8060.
print(tweets_api["text"].iloc[8060])

@dgodin1234 You're free to get and spread covid and get shot in gun violence...🤷‍♀️


In [14]:
# Print the raw text of the tweet at row position 8066.
print(tweets_api["text"].iloc[8066])

Covid documentaries and movies will come, and I hate that xD


In [15]:
# Save the tweets together with their predictions.
# to_csv does not create missing folders, so make sure the target directory
# exists first; otherwise the write fails on a fresh checkout.
preds_path = Path("trained_model/preds/tweets_preds.csv")
preds_path.parent.mkdir(parents = True, exist_ok = True)
tweets_api.to_csv(preds_path, index = False)