In [1]:
# NumPy, TensorFlow, os
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from sklearn import metrics
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
os.listdir()

['twitter_parsed_dataset.csv',
 'twitter_sentiment.ipynb',
 'twitter_racism_parsed_dataset.csv',
 'twitter_sexism_parsed_dataset.csv',
 '.ipynb_checkpoints',
 '.git']

In [3]:
### We aggregate all the data into one dataframe

parsed = pd.read_csv('twitter_parsed_dataset.csv')
racism = pd.read_csv('twitter_racism_parsed_dataset.csv')
sexism = pd.read_csv('twitter_sexism_parsed_dataset.csv')

twitter_data = pd.concat([parsed, racism, sexism]).dropna()
twitter_data.head()

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(twitter_data['Text'], twitter_data['oh_label'], test_size=0.20, random_state=42)

X_train.head()

9327     There is such a diff between reality &amp; wha...
14633    Katie's a fatty!! Model!!!! Hahahaha #MKR #kil...
4197     @Nibelsnarfabarf @srhbutts @GRIMACHU it is rea...
3534     @MaxOfS2D @StephenAtWar Origin is a flaming pi...
4500     No, you don't. @Shut_Up_Jeff: I thought of a r...
Name: Text, dtype: object

In [5]:
print(len(X_train), len(X_test))

36157 9040


In [6]:
y_train.head()

9327     0.0
14633    1.0
4197     0.0
3534     0.0
4500     1.0
Name: oh_label, dtype: float64

In [7]:
vec = TfidfVectorizer()

X_vectrain = vec.fit_transform(X_train)
X_vectest = vec.transform(X_test)

In [8]:
### Baseline accuracy, predicting all of one class.

1 - np.mean(y_test)

0.7609513274336284

In [9]:
### Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

clf = LogisticRegression(random_state=0).fit(X_vectrain, y_train)
pred = clf.predict(X_vectest)
print("f1_score = ", metrics.f1_score(y_test, pred, average="weighted"))
print("accuracy = ", metrics.accuracy_score(y_test, pred))

f1_score =  0.8811389720311346
accuracy =  0.8888274336283186


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
!pip install --upgrade pip
!pip install nltk
!pip install gensim



In [11]:
twitter_data['Text']

0        @halalflaws @biebervalue @greenlinerzjm I read...
1        @ShreyaBafna3 Now you idiots claim that people...
2        RT @Mooseoftorment Call me sexist, but when I ...
3        @g0ssipsquirrelx Wrong, ISIS follows the examp...
4                                   #mkr No No No No No No
                               ...                        
14876    @RaikonL @finaleve @mja333 WHY DO YOU HATE FRE...
14877    It is unconscionable that our regulatory bodie...
14878    @Dartanveerahmad @Janx53 @geehall1 We want ISI...
14879    #mkr  Unbelievable how low Kat &amp; Andre wil...
14880    RT @JamesMakienko: @omeisy @yemenrightsmon Peo...
Name: Text, Length: 45197, dtype: object

In [12]:
### Cleaning tweets

import re

def cleaning_tweets(tweet):
    # 1. Remove Twitter handles (@user)
    users = re.findall("@[\w]*", tweet) # tokenizing
    for user in users:
        tweet = re.sub(user, '', tweet)

    # 2. Remove, Punctuations, Numbers, and Special Characters (keep hashtags)
    tweet = re.sub("[^a-zA-Z#]", " ", tweet)

    # 3. Lowercase all
    tweet = tweet.lower()
    
    # 4. Splitting text into tokens
    tweet = tweet.split()
    
    return tweet


twitter_data['tokenized_tweets'] = twitter_data['Text'].apply(cleaning_tweets)
twitter_data.head()

Unnamed: 0,index,id,Text,Annotation,oh_label,tokenized_tweets
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0,"[i, read, them, in, context, no, change, in, m..."
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0,"[now, you, idiots, claim, that, people, who, t..."
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0,"[rt, call, me, sexist, but, when, i, go, to, a..."
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,"[wrong, isis, follows, the, example, of, moham..."
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0,"[#mkr, no, no, no, no, no, no]"


In [13]:
twitter_data['tokenized_tweets'].iloc[19]

['woo', 'can', 't', 'wait', 'to', 'see', 'what', 'happens', '#mkr']

In [14]:
twitter_data['cleaned_tweets'] = twitter_data['tokenized_tweets'].apply(lambda x: ' '.join(x))
twitter_data['num_tokens'] = twitter_data['tokenized_tweets'].apply(len)
twitter_data.head()

Unnamed: 0,index,id,Text,Annotation,oh_label,tokenized_tweets,cleaned_tweets,num_tokens
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0,"[i, read, them, in, context, no, change, in, m...",i read them in context no change in meaning th...,18
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0,"[now, you, idiots, claim, that, people, who, t...",now you idiots claim that people who tried to ...,22
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0,"[rt, call, me, sexist, but, when, i, go, to, a...",rt call me sexist but when i go to an auto pla...,19
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,"[wrong, isis, follows, the, example, of, moham...",wrong isis follows the example of mohammed and...,11
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0,"[#mkr, no, no, no, no, no, no]",#mkr no no no no no no,7


In [15]:
### Word2Vec
from gensim.models import Word2Vec

w2v = Word2Vec(twitter_data['tokenized_tweets'],
               size = 200,
               window = 5,
               min_count = 2,
               sg = 1,
               hs = 0,
               negative = 10,
               workers = 32,
               seed = 1)

w2v

<gensim.models.word2vec.Word2Vec at 0x7f55146bf940>

In [16]:
w2v.most_similar(positive='sexist')

  w2v.most_similar(positive='sexist')


[('swear', 0.7739920020103455),
 ('misandrist', 0.7709363698959351),
 ('females', 0.7590835690498352),
 ('im', 0.7566421031951904),
 ('basketball', 0.734123706817627),
 ('rappers', 0.7287617921829224),
 ('comedians', 0.7186075448989868),
 ('analysts', 0.7161632180213928),
 ('#sexist', 0.7120012044906616),
 ('announcers', 0.7095776796340942)]

In [17]:
w2v.most_similar(positive='racist')

  w2v.most_similar(positive='racist')


[('device', 0.7912424802780151),
 ('bigot', 0.7736752033233643),
 ('fashioned', 0.7657896280288696),
 ('rhetorical', 0.7646394371986389),
 ('unpopular', 0.7619739174842834),
 ('biggot', 0.7582675218582153),
 ('retarded', 0.7559173107147217),
 ('bigoted', 0.7542294859886169),
 ('wnba', 0.7538695931434631),
 ('monger', 0.7505632042884827)]

In [18]:
### Consider tweets that don't produce any tokens, delete those
twitter_data = twitter_data[twitter_data['num_tokens'] > 0 ]
twitter_data.head()

Unnamed: 0,index,id,Text,Annotation,oh_label,tokenized_tweets,cleaned_tweets,num_tokens
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0,"[i, read, them, in, context, no, change, in, m...",i read them in context no change in meaning th...,18
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0,"[now, you, idiots, claim, that, people, who, t...",now you idiots claim that people who tried to ...,22
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0,"[rt, call, me, sexist, but, when, i, go, to, a...",rt call me sexist but when i go to an auto pla...,19
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,"[wrong, isis, follows, the, example, of, moham...",wrong isis follows the example of mohammed and...,11
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0,"[#mkr, no, no, no, no, no, no]",#mkr no no no no no no,7


In [19]:
w2v[twitter_data['tokenized_tweets'].iloc[0]]

  w2v[twitter_data['tokenized_tweets'].iloc[0]]


array([[-0.08616858,  0.6018554 , -0.0041653 , ...,  0.02202413,
         0.10708536, -0.2065192 ],
       [ 0.08572198,  0.270992  ,  0.08413573, ..., -0.00950772,
         0.05047496, -0.44735977],
       [-0.11475495,  0.6492598 , -0.15602793, ...,  0.2821606 ,
        -0.41180852, -0.042546  ],
       ...,
       [-0.19178945,  0.8881018 , -0.23951456, ..., -0.18243316,
        -0.40385884, -0.81736225],
       [-0.36884183,  1.0916493 , -0.31159085, ..., -0.45013034,
        -0.5191194 , -0.75563383],
       [-0.16900884,  0.57097   , -0.29769507, ..., -0.01783078,
        -0.00949285, -0.24126376]], dtype=float32)

In [20]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += w2v[word].reshape((1, size))
            count += 1
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

wordvec_arrays = np.zeros((len(twitter_data['tokenized_tweets']), 200))

for i in range(len(twitter_data['tokenized_tweets'])):
    wordvec_arrays[i, :] = word_vector(twitter_data['tokenized_tweets'].iloc[i], 200)
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df

  vec += w2v[word].reshape((1, size))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.077116,0.529059,-0.032810,0.273359,0.017446,0.228200,-0.003853,0.238130,0.010053,0.231009,...,-0.054935,0.130297,-0.121414,-0.029483,0.006110,-0.022517,-0.156276,-0.139557,0.008158,-0.221789
1,-0.069066,0.358940,0.117288,0.029919,-0.165394,0.119998,-0.010704,0.127601,-0.050857,0.053011,...,-0.178487,0.208656,-0.174308,0.067932,-0.133147,-0.077760,-0.096673,-0.029129,0.038120,0.004285
2,-0.086911,0.418334,0.102821,0.141301,-0.121793,0.245582,-0.090362,0.096808,0.124401,0.156268,...,-0.052106,0.170952,-0.246308,0.074399,-0.147604,0.009140,-0.019963,0.086100,0.101775,-0.117584
3,0.033125,0.441088,0.051957,0.286847,-0.271401,0.176073,0.021853,0.229128,-0.110752,0.010660,...,-0.130451,0.245162,-0.096759,-0.019220,-0.007197,-0.150246,-0.185466,-0.078087,0.135357,0.038776
4,0.110687,0.646871,0.001008,-0.203437,0.182091,0.756588,0.411304,0.454717,-0.177291,0.114346,...,0.318886,0.521368,-0.100999,0.357696,-0.243786,-0.019430,-0.313556,-0.055227,0.111527,-0.253051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45133,-0.176219,0.523891,-0.085382,-0.035549,-0.076849,0.464065,-0.177175,0.231347,-0.112153,0.104145,...,0.027699,0.299437,-0.240835,0.082156,0.067988,-0.137045,-0.149745,0.178143,0.190889,-0.060944
45134,0.030734,0.407065,0.021193,0.055956,-0.191023,0.207260,-0.091704,0.081062,-0.022519,0.052431,...,-0.027022,0.209386,-0.128721,0.135864,-0.036755,-0.064434,-0.065790,-0.029154,0.131945,-0.089151
45135,-0.157921,0.416517,-0.034118,0.076544,-0.160337,0.219227,-0.062522,0.109040,0.011671,0.113055,...,-0.005307,0.202154,-0.080051,0.059469,-0.066018,-0.100507,-0.137580,-0.004005,0.122055,-0.033227
45136,-0.054480,0.432186,-0.050923,0.016114,-0.117701,0.233391,-0.011567,-0.080663,0.135087,0.036834,...,0.059621,0.297346,-0.147487,0.124441,-0.215757,-0.120879,-0.002591,-0.063305,0.048833,-0.192290


In [21]:
w2v_X_train, w2v_X_test, w2v_y_train, w2v_y_test = train_test_split(wordvec_df, twitter_data['oh_label'], test_size=0.20, random_state=42)


In [22]:
### Word2vec w/ Logistic Regression

clf_w2v = LogisticRegression(random_state=0).fit(w2v_X_train, w2v_y_train)
pred = clf_w2v.predict(w2v_X_test)
print("f1_score = ", metrics.f1_score(w2v_y_test, pred, average="weighted"))
print("accuracy = ", metrics.accuracy_score(w2v_y_test, pred))

f1_score =  0.8076017391535372
accuracy =  0.8243243243243243


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
### Word2vec w/ NN

# Define Neural Network
NN = Sequential()
NN.add(Dense(64,input_shape=(200,)))
NN.add(Dropout(0.2))
NN.add(Activation('relu'))
NN.add(Dense(64))
NN.add(Dropout(0.2))
NN.add(Activation('relu'))
NN.add(Dense(1))
NN.add(Activation('sigmoid'))
NN.summary()
NN.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                12864     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
activation (Activation)      (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 6

In [24]:
NN.fit(w2v_X_train, w2v_y_train, batch_size=34, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f55106591c0>

In [25]:
NN_pred = NN.predict_classes(w2v_X_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [26]:
print("f1_score = ", metrics.f1_score(w2v_y_test, NN_pred, average="weighted"))
print("accuracy = ", metrics.accuracy_score(w2v_y_test, NN_pred))

f1_score =  0.8450172316307162
accuracy =  0.8481391227292867


In [27]:
vocab_size = len(w2v.wv.vocab)
vocab_size

19645

In [145]:
### Word2vec w/ CNN

# CNN = Sequential()
# embedding_dim = 5
# CNN.add(layers.Input(shape=200,))
# CNN.add(layers.Embedding(vocab_size, embedding_dim, input_length=200))
# CNN.add(layers.Conv1D(128, 5, activation='relu'))
# CNN.add(layers.GlobalMaxPooling1D())
# CNN.add(layers.Dense(10, activation='relu'))
# CNN.add(layers.Dense(1, activation='sigmoid'))
# CNN.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['acc'])
# CNN.summary()

wordids = layers.Input(shape=(200,))
CNN = layers.Embedding(vocab_size,200 , 10, input_length=200)(wordids)
CNN = layers.Conv1D(filters=2, kernel_size=2, activation='relu')(CNN)
CNN = layers.GlobalMaxPooling1D()(CNN)
CNN = layers.Dropout(rate=0.7)(CNN)
CNN = layers.Dense(10, activation='relu')(CNN)
CNN = layers.Dense(2, activation='relu')(CNN)
CNN = layers.Dense(2, activation='relu')(CNN)
prediction = layers.Dense(2, activation='softmax')(CNN)

CNN_model = keras.Model(inputs=wordids, outputs=prediction)
CNN_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [146]:
CNN_model

<tensorflow.python.keras.engine.functional.Functional at 0x7f93f8b3e2b0>

In [147]:
w2v_X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
11832,0.066636,-0.055801,0.104545,0.055636,-0.036905,-0.076465,-0.16681,0.170999,0.043313,0.227367,...,0.343142,0.009389,0.058588,0.007882,0.230042,0.190709,0.15819,0.210799,-0.107453,0.380495
2053,0.058817,-0.033192,0.148476,-0.001522,-0.09115,-0.139191,-0.182401,0.186856,-0.156304,0.218057,...,0.1945,0.085747,-0.047964,0.023323,0.214617,0.227429,0.121006,0.339526,-0.002729,0.380185
12650,0.134369,-0.065956,0.039868,0.067017,0.038138,0.015747,-0.105362,0.171509,0.002566,0.050634,...,0.333299,-0.063083,0.188317,0.017313,0.343282,0.256595,0.021563,0.427071,-0.098763,0.207447
23214,0.070986,0.048851,0.101083,0.066231,-0.098585,-0.140182,-0.089963,0.147331,0.020894,0.250309,...,0.237544,-0.054575,0.202967,-0.067759,0.166367,0.162618,0.066444,0.179394,-0.262422,0.355086
19269,0.099716,-0.06655,0.196736,0.141789,-0.124994,-0.049255,-0.283008,0.175376,-0.149655,0.192325,...,0.165654,0.212479,0.015772,0.0448,0.164197,0.179401,0.115111,0.391958,-0.11206,0.402351


In [148]:
CNN_model.reset_states()
CNN_model.fit(w2v_X_train, w2v_y_train, epochs=10)

Epoch 1/10
 112/1129 [=>............................] - ETA: 8s - loss: 0.5834 - accuracy: 0.7631

InvalidArgumentError:  indices[9,63] = -1 is not in [0, 19645)
	 [[node functional_40/embedding_32/embedding_lookup (defined at <ipython-input-148-3b2b0ee89568>:2) ]] [Op:__inference_train_function_50915]

Errors may have originated from an input operation.
Input Source operations connected to node functional_40/embedding_32/embedding_lookup:
 functional_40/embedding_32/embedding_lookup/50621 (defined at /home/jupyter/anaconda3/lib/python3.8/contextlib.py:113)

Function call stack:
train_function
