In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the labelled tweet CSV and preview its first five rows.
data = pd.read_csv('Twitter/mLabel_tweets.csv')
data.head(n=5)

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,@cath__kath AstraZeneca is made with the kidne...,ingredients
1,1336808189677940736t,It begins. Please find safe alternatives to th...,side-effect
2,1329488407307956231t,"@PaolaQP1231 Well, I mean congratulations Covi...",side-effect
3,1364194604459900934t,@BorisJohnson for those of us that do not wish...,mandatory
4,1375938799247765515t,She has been trying to speak out: writing lett...,side-effect rushed


In [3]:
# Work on a deep copy so the frame held in `data` stays untouched.
df = data.copy(deep=True)
df.shape

(9921, 3)

### Preprocessing

In [4]:
# Drop the tweet ID column, which is not used as a feature.
# The frame is derived from `data` rather than re-assigned from itself:
# `df = df.drop('ID', axis=1)` raised KeyError when this cell was executed a
# second time, because 'ID' had already been removed from `df`.
df = data.drop(columns='ID')
df.head()

Unnamed: 0,tweet,labels
0,@cath__kath AstraZeneca is made with the kidne...,ingredients
1,It begins. Please find safe alternatives to th...,side-effect
2,"@PaolaQP1231 Well, I mean congratulations Covi...",side-effect
3,@BorisJohnson for those of us that do not wish...,mandatory
4,She has been trying to speak out: writing lett...,side-effect rushed


In [5]:
#Cleaning text
import re

def cleaning_text(text):
    """Strip tweet noise from ``text`` and return it lower-cased.

    The substitutions run in a fixed order, each deleting its matches:
    URLs, hashtags, @-mentions, every character that is not an ASCII
    letter, a digit or whitespace, and finally the remaining digits.
    Whitespace is left as-is, so the gaps where tokens were removed stay
    in the returned string.
    """
    removal_patterns = (
        r"https?://\S+",      # links
        r"#\S+",              # hashtags
        r"@\S+",              # mentions
        r"[^a-zA-Z0-9\s]",    # special characters
        r"\d",                # numbers
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)

    return text.lower()

In [6]:
# Overwrite the 'tweet' column with the output of cleaning_text for each row,
# then display the resulting frame.
df['tweet'] = df['tweet'].apply(cleaning_text)
df

Unnamed: 0,tweet,labels
0,astrazeneca is made with the kidney cells of ...,ingredients
1,it begins please find safe alternatives to thi...,side-effect
2,well i mean congratulations covid for being t...,side-effect
3,for those of us that do not wish a vaccine so...,mandatory
4,she has been trying to speak out writing lette...,side-effect rushed
...,...,...
9916,former pfizer chief scientific officer on expe...,side-effect
9917,not what the manufacturers are saying the man...,pharma
9918,thats a complete no for now on the oxfordastra...,none
9919,opinion vaccine side effects possible to have ...,side-effect


In [7]:
import neattext as nt
import neattext.functions as nfx 

In [8]:
# Noise scan: run neattext's TextFrame.noise_scan() on every tweet.
# The result is only displayed, not stored; each entry is a dict with keys
# such as 'text_noise' and 'text_length'.
df['tweet'].apply(lambda x:nt.TextFrame(x).noise_scan())

0       {'text_noise': 12.345679012345679, 'text_lengt...
1       {'text_noise': 5.072463768115942, 'text_length...
2       {'text_noise': 8.73015873015873, 'text_length'...
3       {'text_noise': 14.50980392156863, 'text_length...
4       {'text_noise': 7.003891050583658, 'text_length...
                              ...                        
9916    {'text_noise': 7.28476821192053, 'text_length'...
9917    {'text_noise': 8.98876404494382, 'text_length'...
9918    {'text_noise': 6.1946902654867255, 'text_lengt...
9919    {'text_noise': 8.661417322834646, 'text_length...
9920    {'text_noise': 9.574468085106384, 'text_length...
Name: tweet, Length: 9921, dtype: object

In [9]:
# Extract stopwords: list the stopwords neattext finds in each tweet.
# Inspection only -- the result is displayed but not assigned.
df['tweet'].apply(lambda x:nt.TextExtractor(x).extract_stopwords())

0          [is, made, with, the, of, a, back, in, the, s]
1               [it, please, to, this, about, after, via]
2       [well, i, for, being, the, first, ever, to, in...
3       [for, those, of, us, that, do, not, a, so, wil...
4       [she, has, been, to, out, to, to, been, everyt...
                              ...                        
9916    [former, on, i, have, no, that, we, are, in, t...
9917    [not, what, the, are, the, we, have, two, they...
9918                     [a, no, for, now, on, the, from]
9919    [side, to, have, in, it, always, it, to, be, t...
9920     [now, before, you, the, where, i, only, of, are]
Name: tweet, Length: 9921, dtype: object

In [10]:
# Remove stopwords (preview): show the tweets with stopwords stripped.
# The result is not assigned here, so `df` itself is unchanged.
df['tweet'].apply(nfx.remove_stopwords)

0            astrazeneca kidney cells little girl aborted
1       begins find safe alternatives vaccine uk issue...
2       mean congratulations covid thing eradicate inf...
3       wish vaccine given vaccine passport abroad int...
4       trying speak writing letters government speaki...
                              ...                        
9916    pfizer chief scientific officer experimental c...
9917    manufacturers saying manufacturers dont recomm...
9918    thats complete oxfordastrazeneca vaccine swiss...
9919    opinion vaccine effects possible penicillin iv...
9920    resign destroy country live adults double vacc...
Name: tweet, Length: 9921, dtype: object

In [11]:
# Stopword-free tweets, kept as the text corpus passed to the vectoriser.
corpus = df['tweet'].apply(nfx.remove_stopwords)

In [12]:
#Encoding
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# TF-IDF vectoriser created with its default settings (no arguments passed).
tfid = TfidfVectorizer()
tfid

In [14]:
# Build features: fit the TF-IDF vectoriser on `corpus` and transform it,
# then convert the result to a dense NumPy array with .toarray().
# NOTE(review): the dense copy can be large (rows x vocabulary size) --
# confirm it is actually needed downstream before keeping .toarray().
X = tfid.fit_transform(corpus).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
encoder = LabelEncoder()

In [17]:
y = encoder.fit_transform(df['labels'])
y

array([ 72, 209, 209, ..., 117, 209, 147])

In [18]:
#Splitting data
from sklearn.model_selection import train_test_split

In [19]:
# Train on 40% of the rows and hold out 20% for testing; the fixed
# random_state makes the shuffle reproducible.
# NOTE(review): train_size + test_size = 0.6, so the remaining 40% of the
# rows end up in neither split -- confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.4,
    test_size=0.2,
    random_state=0,
)

In [20]:
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance , ClassifierChain , LabelPowerset
from skmultilearn.adapt import MLkNN

In [21]:
# Model: scikit-multilearn BinaryRelevance wrapper around a multinomial
# naive Bayes base classifier.
binary = BinaryRelevance(MultinomialNB())

In [22]:
# Training: fit the wrapped classifier on the training split.
binary.fit(X_train , y_train)

In [28]:
# Predictions for the held-out rows. predict() returns an object exposing
# .toarray(), which is used here to display the result as a dense array.
BR_prediction = binary.predict(X_test)
BR_prediction.toarray()

array([[209],
       [209],
       [ 45],
       ...,
       [209],
       [209],
       [209]])

In [26]:
#Accuracy
from sklearn.metrics import accuracy_score

In [31]:
# Score the test-set predictions (as a dense array) against the true targets.
accuracy_score(y_test , BR_prediction.toarray())

0.3239294710327456