# Loading

In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


# Processing

In [4]:
df.tweet[0]

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [5]:
df.tweet = df.tweet.str.replace('@user', '')
df.tweet = df.tweet.replace('[^a-zA-Z0-9% ]', '', regex = True)
df.tweet = df.tweet.str.strip()

def tokenize(tweet):
    return ' '.join(word_tokenize(tweet))

df['tweet'] = df.tweet.apply(tokenize)

In [6]:
df.tweet[0]

'when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run'

In [7]:
df = df[['tweet', 'label']]

# Splitting

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x_train, x_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['label'],
    test_size = 0.28,
    random_state = 40
)

# Vectorizing

#### 1. Bag of Words

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
vectorizer_bog = CountVectorizer(ngram_range = (2, 2))

In [12]:
vectorizer_bog.fit(df['tweet'])

CountVectorizer(ngram_range=(2, 2))

In [13]:
train_bog = vectorizer_bog.transform(x_train)
test_bog = vectorizer_bog.transform(x_test)

In [14]:
vectorizer_bog.get_feature_names_out()

array(['0000001 polluting', '00027 photooftheday', '01 blog', ...,
       'zz plans', 'zzz just', 'zzzzzzzz be'], dtype=object)

#### 2. Term Frequency - Invert Document Frequency

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
vectorizer_tfidf = TfidfVectorizer(ngram_range = (2, 2))

In [17]:
vectorizer_tfidf.fit(df['tweet'])

TfidfVectorizer(ngram_range=(2, 2))

In [18]:
train_tfidf = vectorizer_tfidf.transform(x_train)
test_tfidf = vectorizer_tfidf.transform(x_test)

In [19]:
vectorizer_tfidf.get_feature_names_out()

array(['0000001 polluting', '00027 photooftheday', '01 blog', ...,
       'zz plans', 'zzz just', 'zzzzzzzz be'], dtype=object)

# Training

In [20]:
from sklearn.naive_bayes import MultinomialNB

#### 1. Bag of Words

In [21]:
model_bog = MultinomialNB()

In [22]:
model_bog.fit(train_bog, y_train)

MultinomialNB()

#### 2. TF-IDF

In [23]:
model_tfidf = MultinomialNB()

In [24]:
model_tfidf.fit(train_tfidf, y_train)

MultinomialNB()

# Testing

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
accuracy_score(y_test, model_bog.predict(test_bog))

0.7760893854748603

In [27]:
accuracy_score(y_test, model_tfidf.predict(test_tfidf))

0.9420111731843576

# Conclusion

TFIDF performs a lot better with 94% accuracy