In [1]:
import numpy as np
import pandas as pd
from textblob import TextBlob
import glob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [4]:
all_files = glob.glob("./dataset_cleaning/*.pkl")
all_files.sort()
test_filename= "./dataset_cleaning/dataset4Cleaned.pkl"
test2_filename= "./dataset_cleaning/dataset7Cleaned.pkl"

li = []

for filename in all_files:
    
    if(filename != test_filename and filename != test2_filename):

        df = pd.read_pickle(filename)
        li.append(df)

train = pd.concat(li, axis=0, ignore_index=True)
test = pd.read_pickle(test_filename)

print("Number of train features: {} \nNumber of test features: {}".format(len(train), len(test)))

Number of train features: 57348 
Number of test features: 7929


In [5]:
test.columns

Index(['id', 'Informativeness', 'text', 'positive_score', 'negative_score',
       'emotional_devergence_score'],
      dtype='object')

In [6]:
authors=pd.read_csv('./dataset_cleaning/tj/parsed/tweet_metadata.csv')
users=pd.read_csv('./dataset_cleaning/tj/parsed/twitter_user.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
train=pd.merge(train, authors, on='id')
test=pd.merge(test, authors, on='id')
train.drop(train.columns.difference(['id', 'Informativeness', 'text', 'author_id', 'tweet_type','positive_score', 'negative_score',
       'emotional_devergence_score' ]), 1, inplace=True)
test.drop(test.columns.difference(['id', 'Informativeness', 'text', 'author_id', 'tweet_type','positive_score', 'negative_score',
       'emotional_devergence_score' ]), 1, inplace=True)

In [8]:
users.drop(columns=['created_at', 'lang', 'name', 'screen_name', 'location','access'], inplace=True)
users.columns=['author_id', 'has_description', 'bio_has_url', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'statuses_count', 'protected',
       'verified', 'default_profile', 'default_profile_image']

In [9]:
train=pd.merge(train, users, on='author_id')
test=pd.merge(test, users, on='author_id')

In [10]:
train.head()

Unnamed: 0,id,Informativeness,text,positive_score,negative_score,emotional_devergence_score,author_id,tweet_type,has_description,bio_has_url,followers_count,friends_count,favourites_count,listed_count,statuses_count,protected,verified,default_profile,default_profile_image
0,211040709124440064,0,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...,1,-1,0.2,601864285,tweet,Traveling http://goo.gl/97yT2,,62,19,0,0.0,402,False,False,True,True
1,210864180218167296,0,Get in on the fun every Thursday with the @csi...,2,-1,0.3,245545247,tweet,Director of Marketing & Promotions for the Col...,http://www.csindy.com/colorado/Home,454,35,2,28.0,3327,False,False,False,False
2,211157222699433985,0,Welcome to our newest STUDENTathlete- Reagan B...,2,-1,0.3,487854217,tweet,,,1,1,0,0.0,422,False,False,True,True
3,211162553659830272,0,Denver Post: #Colorado governor signs bill cre...,1,-1,0.2,17602802,tweet,The Reporters Committee for Freedom of the Pre...,http://www.rcfp.org,1965,188,0,181.0,4392,False,False,False,False
4,211216962162933761,0,Pretty sure I'm going to live in Manitou Sprin...,3,-1,0.4,348495572,tweet,"free-spirited, ever thinking, world citizen",,227,203,23,2.0,3687,False,False,True,False


In [11]:

def text_processing(tweet):
    
    tweet= tweet.lower()
    
    #Removing hyperlinks from the tweet
    tweet_no_links=re.sub(r'http\S+', '', tweet)
    
    #Generating the list of words in the tweet (hashtags and other punctuations removed)
    def form_sentence(tweet):
        tweet_blob = TextBlob(tweet)
        return ' '.join(tweet_blob.words)
    new_tweet = form_sentence(tweet_no_links)
    
    #Removing stopwords and words with unusual symbols
    def no_user_alpha(tweet):
        tweet_list = [ele for ele in tweet.split() if ele != 'user']
        clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
        clean_s = ' '.join(clean_tokens)
        clean_mess = [word for word in clean_s.split() if word not in stopwords.words('english')]
        return clean_mess
    no_punc_tweet = no_user_alpha(new_tweet)
    
    #Normalizing the words in tweets 
    def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return " ".join(normalized_tweet)
    
    
    return normalization(no_punc_tweet)

In [12]:
def count_hashtags(text):
    return text.count('#')

In [13]:
def link_present(text):
    return int('http' in text) 

In [14]:
import emoji
def text_has_emoji(text):
    return any(map(text[:-50].__contains__, [':)',':(',':P',':c',':<','c:','<:',':L',':l','^_^','^.^','>_<','>.<','>_>','<_<','>.>','<.<','-.-','-_-','o_o','o.o','._.','owo','OwO',';_;','>:)',':]',':}','>:(','>:|','-.^','-_^','8)','B)','<3','xD',':3','x3','\o','\o/',';_;','OwO','uwu','O:)',':#',':*']))

In [15]:
def has_instructional_words(text):
    return any(map(text.__contains__, ["text", 'call', 'donate']))

In [16]:
def has_phone_number(text):
    return bool(re.search('\\d{3}-\\d{3}-\\d{4}', text))

In [17]:
def text_has_slang(text):
    for i in text[:-50].split(' '):
        for j in ['afaik','afk','asl','atm','atw','ayy','bae','bb','bbiab','bbl','bbs','bc','bf','bff','bork','brb','btw','cba','convo','cp','cya','cya','dank','dc','dem','dw','e2e','fml','FOMO','FTFY','ftl','ftw','fwiw','fyi','g2g','g4u','gf','gg','goml','gr8','gratz','gtfo','guiz','hbu','hru','ianadb','ianalb','ianap','idc','idgaf','idk','iirc','ik','ikr','ily','inb4','irl','jfc','jk','js','k','kappa','kek','kms','kthx','l8r','leet','lmao','lmk','lol','LPT','lrl','lrn2','m8','maga','mfw','mrw','nerf','ngl','nm','nmu','noob','nvm','ofc','omf','omg','omw','ooc','op','OP','orly','pepe','pleb','pleb','plz','pron','pwned','REEEEEE','rekt','rickrol','rip','rly','rms','rofl','rotflol','rtfm','rude','shank','smd','smh','soz','swag','tbf','tbh','tbt','TIFU','tf','tfw','thx','tide','TIL','tl;dr','tmw','tolo','topkek','ty','uwotm8','w00t','wb','wot','wtb','wtf','wtg','wts','wuu2','yarly','ymmv','yolo','yw']:
            if(i==j):
                return True
    return False

In [18]:
def is_RT(text):
    for i in text[:-50].split(' '):
        if(i == 'RT'):
            return True
    return False

In [19]:
def has_profanity(text):
    for i in text[:-50].split(' '):
        for j in ['acrotomophilia','anal','anilingus','anus','arsehole','ass','asshole','assmunch','auto erotic','autoerotic','babeland','baby batter','ball gag','ball gravy','ball kicking','ball licking','ball sack','ball sucking','bangbros','bareback','barely legal','barenaked','bastardo','bastinado','bbw','bdsm','beaver cleaver','beaver lips','bestiality','bi curious','big black','big breasts','big knockers','big tits','bimbos','birdlock','bitch','black cock','blonde action','blonde on blonde action','blow j','blow your l','blue waffle','blumpkin','bollocks','bondage','boner','boob','boobs','booty call','brown showers','brunette action','bukkake','bulldyke','bullet vibe','bung hole','bunghole','busty','butt','buttcheeks','butthole','camel toe','camgirl','camslut','camwhore','carpet muncher','carpetmuncher','chocolate rosebuds','circlejerk','cleveland steamer','clit','clitoris','clover clamps','clusterfuck','cock','cocks','coprolagnia','coprophilia','cornhole','cum','cumming','cunnilingus','cunt','darkie','date rape','daterape','deep throat','deepthroat','dick','dildo','dirty pillows','dirty sanchez','dog style','doggie style','doggiestyle','doggy style','doggystyle','dolcett','domination','dominatrix','dommes','donkey punch','double dong','double penetration','dp action','eat my ass','ecchi','ejaculation','erotic','erotism','escort','ethical slut','eunuch','faggot','fecal','felch','fellatio','feltch','female squirting','femdom','figging','fingering','fisting','foot fetish','footjob','frotting','fuck','fucking','fuck buttons','fudge packer','fudgepacker','futanari','g-spot','gang bang','gay sex','genitals','giant cock','girl on','girl on top','girls gone wild','goatcx','goatse','gokkun','golden shower','goo girl','goodpoop','goregasm','grope','group sex','guro','hand job','handjob','hard core','hardcore','hentai','homoerotic','honkey','hooker','hot chick','how to kill','how to murder','huge fat','humping','incest','intercourse','jack off','jail bait','jailbait','jerk off','jigaboo','jiggaboo','jiggerboo','jizz','juggs','kike','kinbaku','kinkster','kinky','knobbing','leather restraint','leather straight jacket','lemon party','lolita','lovemaking','make me come','male squirting','masturbate','menage a trois','milf','missionary position','motherfucker','mound of venus','mr hands','muff diver','muffdiving','nambla','nawashi','negro','neonazi','nig nog','nigga','nigger','nimphomania','nipple','nipples','nsfw images','nude','nudity','nympho','nymphomania','octopussy','omorashi','one cup two girls','one guy one jar','orgasm','orgy','paedophile','panties','panty','pedobear','pedophile','pegging','penis','phone sex','piece of shit','piss pig','pissing','pisspig','playboy','pleasure chest','pole smoker','ponyplay','poof','poop chute','poopchute','porn','porno','pornography','prince albert piercing','pthc','pubes','pussy','queaf','raghead','raging boner','rape','raping','rapist','rectum','reverse cowgirl','rimjob','rimming','rosy palm','rosy palm and her 5 sisters','rusty trombone','s&m','sadism','scat','schlong','scissoring','semen','sex','sexo','sexy','shaved beaver','shaved pussy','shemale','shibari','shit','shota','shrimping','slanteye','slut','smut','snatch','snowballing','sodomize','sodomy','spic','spooge','spread legs','strap on','strapon','strappado','strip club','style doggy','suck','sucks','suicide girls','sultry women','swastika','swinger','tainted love','taste my','tea bagging','threesome','throating','tied up','tight white','tit','tits','titties','titty','tongue in a','topless','tosser','towelhead','tranny','tribadism','tub girl','tubgirl','tushy','twat','twink','twinkie','two girls one cup','undressing','upskirt','urethra play','urophilia','vagina','venus mound','vibrator','violet blue','violet wand','vorarephilia','voyeur','vulva','wank','wet dream','wetback','white power','women rapping','wrapping men','wrinkled starfish','xx','xxx','yaoi','yellow showers','yiffy','zoophilia']:
            if(i==j):
                return True
    return False


In [20]:
def one_word_sentance(text):
    return int(len(text.split(" "))<=3)

In [21]:
train["hashtag_count"]=train['text'].apply(count_hashtags)
test["hashtag_count"]=test['text'].apply(count_hashtags)

In [22]:
train["hashtag_present"] = train["hashtag_count"].apply(lambda x: np.sign(x))
test["hashtag_present"] = test["hashtag_count"].apply(lambda x: np.sign(x))

In [23]:
train["link_present"]=train['text'].apply(link_present)
test["link_present"]=test['text'].apply(link_present)

In [24]:
train["emoji_present"] = train['text'].apply(text_has_emoji)
test["emoji_present"] = test['text'].apply(text_has_emoji)

In [25]:
train["instructinal_keyword"] = train['text'].apply(has_instructional_words)
test["instructinal_keyword"] = test['text'].apply(has_instructional_words)

In [26]:
train["contains_phone_number"] = train['text'].apply(has_phone_number)
test["contains_phone_number"] = test['text'].apply(has_phone_number)

In [27]:
train["slang_present"] = train['text'].apply(text_has_slang)
test["slang_present"] = test['text'].apply(text_has_slang)

In [28]:
train["is_RT"] = train['text'].apply(is_RT)
test["is_RT"] = test['text'].apply(is_RT)

In [29]:
train["has_profanity"] = train['text'].apply(has_profanity)
test["has_profanity"] = test['text'].apply(has_profanity)

In [30]:
train["one_word_sentance"] = train['text'].apply(one_word_sentance)
test["one_word_sentance"] = test['text'].apply(one_word_sentance)

In [31]:
# train = train.apply(get_sentiment_score, axis=1)
# test = test.apply(get_sentiment_score, axis=1)

In [32]:
# train["emotional_devergence_score"]=(train["positive_score"]-train["negative_score"])/10.0
# test["emotional_devergence_score"]=(test["positive_score"]-test["negative_score"])/10.0

In [33]:
test.head()

Unnamed: 0,id,Informativeness,text,positive_score,negative_score,emotional_devergence_score,author_id,tweet_type,has_description,bio_has_url,...,hashtag_count,hashtag_present,link_present,emoji_present,instructinal_keyword,contains_phone_number,slang_present,is_RT,has_profanity,one_word_sentance
0,1241490299215634434,1,Official death toll from #covid19 in the Unite...,3,-3,0.6,449864075,tweet,Irish & European. Push button interests: #Brex...,,...,1,1,0,False,False,False,False,False,False,0
1,1245916400981381130,1,"Dearest Mr. President @USER 1,169 coronavirus ...",3,-3,0.6,724994029281116160,tweet,pemerhati sosial dan bersahabat dengan kemanus...,,...,1,1,0,False,False,False,False,False,False,0
2,1241132432402849793,1,Latest Updates March 20 ⚠️5274 new cases and 3...,1,-3,0.4,1190410214954151936,tweet,Fuck every name that ever existed. Uses https...,,...,0,0,0,False,False,False,False,False,False,0
3,1241170177997357057,1,Latest Updates March 21 ⚠️5725 new cases and 5...,1,-3,0.4,1190410214954151936,tweet,Fuck every name that ever existed. Uses https...,,...,0,0,0,False,False,False,False,False,False,0
4,1241782965476212737,1,Latest Updates March 22 ➡️5560 new cases and 6...,3,-3,0.6,1190410214954151936,tweet,Fuck every name that ever existed. Uses https...,,...,0,0,0,False,False,False,False,False,False,0


In [34]:
['id',  'author_id', 'tweet_type','has_description', 'bio_has_url','protected', 'default_profile','default_profile_image']

['id',
 'author_id',
 'tweet_type',
 'has_description',
 'bio_has_url',
 'protected',
 'default_profile',
 'default_profile_image']

In [35]:
train= train.drop(columns=['id',  'author_id', 'tweet_type','has_description', 'bio_has_url','protected', 'default_profile','default_profile_image'])
test= test.drop(columns=['id',  'author_id', 'tweet_type','has_description', 'bio_has_url','protected', 'default_profile','default_profile_image'])

In [36]:
from tqdm import tqdm
tqdm.pandas()

train['text']=train['text'].progress_apply(text_processing)
test['text']=test['text'].progress_apply(text_processing)


  from pandas import Panel
100%|██████████| 46025/46025 [02:54<00:00, 263.57it/s]
100%|██████████| 6955/6955 [00:55<00:00, 125.75it/s]


In [37]:
pd.to_pickle(train, './word2vec_data/train.pkl')
pd.to_pickle(test, './word2vec_data/test.pkl')

In [38]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [39]:
texts=train.text
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                      lower=True)
tokenizer.fit_on_texts(texts)
sequences_train = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 39774 unique tokens.


In [40]:
X_train = pad_sequences(sequences_train)
y_train = np.asarray(train.Informativeness)
print('Shape of X train :', X_train.shape)
print('Shape of label train :', y_train.shape)

Shape of X train : (46025, 26)
Shape of label train : (46025,)


In [41]:
max(train.text.apply(lambda x: len(x.split(' '))))

26

In [42]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format('./word2vec_data/GoogleNews-vectors-negative300.bin', binary=True)

EMBEDDING_DIM=300
vocabulary_size=len(word_index)+1
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(word_vectors)

from keras.layers import Embedding
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

In [43]:
embedding_matrix.shape

(39775, 300)

In [50]:
train.shape

(46025, 21)

In [51]:
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
sequence_length = X_train.shape[1]

filter_sizes = [3,4,5]
num_filters = 150
drop = 0.5



inputs = Input(shape=(sequence_length,))

#CHANGE THIS!
meta_input = Input(shape=(19,))
embedding = embedding_layer(inputs)
reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)


conv_0 = Conv2D(num_filters, (filter_sizes[0], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_1 = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_2 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)

maxpool_0 = MaxPooling2D((sequence_length - filter_sizes[0] + 1, 1), strides=(1,1))(conv_0)
maxpool_1 = MaxPooling2D((sequence_length - filter_sizes[1] + 1, 1), strides=(1,1))(conv_1)
maxpool_2 = MaxPooling2D((sequence_length - filter_sizes[2] + 1, 1), strides=(1,1))(conv_2)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
flatten = Flatten()(merged_tensor)
reshape = Reshape((3*num_filters,))(flatten)
dropout = Dropout(drop)(flatten)
merged_tensor2 = concatenate([dropout, meta_input])

output = Dense(units=2, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(merged_tensor2)



model = Model([inputs , meta_input], output)

In [52]:
y_train = y_train.reshape(len(y_train), 1)

In [53]:
y_train_extra = np.abs(y_train-1)

In [55]:
X_train_meta = train.to_numpy()[:, 2:].astype('float32')

In [56]:
X_train_meta.shape

(46025, 19)

In [67]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['AUC'])
callbacks = [EarlyStopping(monitor='loss')]
model.fit([X_train, X_train_meta], np.append(y_train, y_train_extra, axis=1), batch_size=100, epochs=10, verbose=1,
         callbacks=callbacks)  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<tensorflow.python.keras.callbacks.History at 0x1b26940f90>

In [68]:
sequences_test=tokenizer.texts_to_sequences(test.text)
X_test = pad_sequences(sequences_test,maxlen=X_train.shape[1])
X_test_meta = test.to_numpy()[:, 2:].astype('float32')
y_pred=model.predict([X_test, X_test_meta])

In [75]:
y_pred_bool =(y_pred > 0.5).astype("int32")

In [76]:
y_true = np.asarray(test.Informativeness).reshape(len(test),1)

In [77]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
print("Acc: {}, f1: {}, roc: {}".format(accuracy_score(y_true, y_pred_bool[:,0]),f1_score(y_true, y_pred_bool[:,0]),roc_auc_score(y_true, y_pred_bool[:,0])))


Acc: 0.5529834651329979, f1: 0.5879390324718357, roc: 0.5595209238886529
