## 1. Import Libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import re
import string

# Scikit-learn modules for machine learning and evaluation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# NLTK module for natural language processing
from nltk.corpus import stopwords

## 2. Collecting the data

In [2]:
# Load the tweets from the Excel workbook; the relative path is resolved
# against the notebook's working directory.
df = pd.read_excel('twitter.xlsx')

## 3. Data Preprocessing

In [3]:
# Display the dataset as loaded (columns: id, label, tweet)
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?Ã°ÂŸÂ˜ÂÃ°ÂŸÂ˜ÂÃ°ÂŸÂ...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,id,label
count,31962.0,31962.0
mean,15981.5,0.070146
std,9226.778988,0.255397
min,1.0,0.0
25%,7991.25,0.0
50%,15981.5,0.0
75%,23971.75,0.0
max,31962.0,1.0


### 3.1 Drop the 'id' column

In [5]:
# The 'id' column only numbers the rows (1..31962 in the summary above), so it
# carries no signal for the classifier. errors='ignore' keeps this cell
# re-runnable: without it, a second execution raises KeyError because 'id'
# has already been removed from df.
df = df.drop(columns=['id'], errors='ignore')

In [6]:
# Show the data again to confirm that the 'id' column is gone
df

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation
...,...,...
31957,0,ate @user isz that youuu?Ã°ÂŸÂ˜ÂÃ°ÂŸÂ˜ÂÃ°ÂŸÂ...
31958,0,to see nina turner on the airwaves trying to...
31959,0,listening to sad songs on a monday morning otw...
31960,1,"@user #sikh #temple vandalised in in #calgary,..."


### 3.2 Add the length of each tweet as a ‘length’ feature

In [7]:
# Character count of every tweet. The vectorised .str accessor avoids a
# Python-level call per row and yields NaN for a missing tweet instead of
# raising TypeError the way len() would.
df['length'] = df['tweet'].str.len()

In [8]:
# Display the dataset with the new 'length' column
df

Unnamed: 0,label,tweet,length
0,0,@user when a father is dysfunctional and is s...,102
1,0,@user @user thanks for #lyft credit i can't us...,122
2,0,bihday your majesty,21
3,0,#model i love u take with u all the time in ...,118
4,0,factsguide: society now #motivation,39
...,...,...,...
31957,0,ate @user isz that youuu?Ã°ÂŸÂ˜ÂÃ°ÂŸÂ˜ÂÃ°ÂŸÂ...,110
31958,0,to see nina turner on the airwaves trying to...,131
31959,0,listening to sad songs on a monday morning otw...,63
31960,1,"@user #sikh #temple vandalised in in #calgary,...",67


### 3.3 Define the ‘message_cleaning’ function

In [9]:
def message_cleaning(tweet, stop_words=None):
    """Strip punctuation and stop words from a single tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    stop_words : collection of str, optional
        Lower-cased words to drop. When omitted, NLTK's English stop-word
        list is used. That list is converted to a frozenset once and cached
        on the function, instead of being rebuilt for every word of every
        tweet.

    Returns
    -------
    str
        The surviving words joined by single spaces. Words keep their
        original casing; only the stop-word comparison is case-insensitive.
    """
    if stop_words is None:
        stop_words = getattr(message_cleaning, '_english_stop_words', None)
        if stop_words is None:
            # Build the lookup once; a set makes each membership test O(1).
            stop_words = frozenset(stopwords.words('english'))
            message_cleaning._english_stop_words = stop_words

    # Remove punctuation from the tweet (every character in string.punctuation)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords from the tweet
    return ' '.join(word for word in tweet.split() if word.lower() not in stop_words)

### 3.4 Apply the cleaning function

In [10]:
# Clean every tweet and store the result next to the raw text in 'cleaned_tweet'
df['cleaned_tweet'] = df['tweet'].map(message_cleaning)

### 3.5 Display the first tweet before and after cleaning

In [11]:
# Raw vs. cleaned text for the first row
df[['tweet', 'cleaned_tweet']].iloc[0]

tweet             @user when a father is dysfunctional and is s...
cleaned_tweet    user father dysfunctional selfish drags kids d...
Name: 0, dtype: object

## 4. Convert tweets to vectors

In [12]:
vectorizer = CountVectorizer()
# Learn the vocabulary from the cleaned tweets and convert them to vectorized form.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so words that occur only in the held-out rows still
# enter the vocabulary. Consider fitting on the training rows only.
X = vectorizer.fit_transform(df['cleaned_tweet'])
# Extract the target labels
y = df['label']  

### 4.1 Split the data

In [13]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 5. Train the Naive Bayes classifier

In [14]:
# Create a Multinomial Naive Bayes classifier with its default settings,
# then fit it on the training split.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

## 6. Evaluate the model

In [15]:
# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Print evaluation results: each heading is followed by the matching metric
# computed on the true vs. predicted test labels.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))

Accuracy: 0.9436884091975598
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      5937
           1       0.61      0.57      0.59       456

    accuracy                           0.94      6393
   macro avg       0.79      0.77      0.78      6393
weighted avg       0.94      0.94      0.94      6393

Confusion Matrix:
 [[5775  162]
 [ 198  258]]
