## Import Libraries

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import nltk
import string, re
import gc # garbage collector to manage RAM usage
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

%matplotlib inline

In [23]:
# Load the tweet dataset from 'train 2.csv' (relative path, resolved against the
# notebook's working directory) and preview the first rows.
tweets = pd.read_csv('train 2.csv')
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [24]:
# Capitalise the column names so they cannot clash with vocabulary words later on,
# and discard the 'id' column.
# The frame is rebuilt and rebound instead of mutated with inplace=True, and
# errors='ignore' keeps the cell re-runnable: on a second execution 'id' is already
# gone and the rename finds nothing to change, so the result is the same frame.
tweets = (
    tweets
    .rename(columns={'label': 'Label', 'tweet': 'Tweet'})
    .drop(columns='id', errors='ignore')
)

# Check distribution of class of tweets
tweets.Label.value_counts()

0    29720
1     2242
Name: Label, dtype: int64

In [25]:
# Stratified 80/20 split on the label, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    tweets['Tweet'],
    tweets['Label'],
    test_size=0.2,
    stratify=tweets['Label'],
    random_state=1,
)

# Stitch each (text, label) pair back into a single frame with a fresh 0..n-1 index
tweets_train, tweets_test = (
    pd.concat(pair, axis=1).reset_index(drop=True)
    for pair in ((X_train, y_train), (X_test, y_test))
)

print(f'the number of training data is {len(tweets_train)}')
print(f'the number of test data is {len(tweets_test)}')
tweets_train.head()

the number of training data is 25569
the number of test data is 6393


Unnamed: 0,Tweet,Label
0,this is chess #mlittle801 #breezeslc #fitness ...,0
1,@user @user @user @user @user i bet thats wh...,0
2,happiness is @user so find your happiness tod...,0
3,@user &lt;3 &lt;3 listen to my most beautiful ...,0
4,free @user !! #jesuismilo !!,0


## Data Preprocessing

In [26]:
# Lazily filled cache so the tokenizer, stop-token set and lemmatizer are built once
# instead of on every tweet.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return the shared tokenizer, stop-token set and lemmatizer, building them on first use."""
    if not _CLEANING_RESOURCES:
        # Build everything first and publish in one step, so a failure part-way through
        # (e.g. a missing corpus) never leaves a half-filled cache behind.
        resources = {
            'tokenizer': TweetTokenizer(strip_handles=True),  # drops @username handles
            # A set makes the per-token membership test constant-time
            'stops': set(stopwords.words("english")) | set(string.punctuation),
            'lemmatizer': WordNetLemmatizer(),
        }
        _CLEANING_RESOURCES.update(resources)
    return _CLEANING_RESOURCES


def clean_tweet(tweet):
    """
    Turn a raw tweet into a list of cleaned, lemmatized, lower-case tokens.

    Steps, applied to every token in order:
      1. tokenize with TweetTokenizer, which strips @username handles;
      2. strip the '#' symbol (the hashtag's word itself is kept) and lower-case;
      3. drop English stopwords and single punctuation characters;
      4. blank out anything starting at 'http' or 'www' (website links);
      5. drop tokens that are not purely alphanumeric (this also removes the
         emptied link tokens and multi-character punctuation);
      6. lemmatize, then drop tokens of length 1.

    Parameters
    ----------
    tweet : str
        Raw tweet text. A missing value (None or NaN) yields an empty list.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Missing tweets (None / NaN) have no tokens; NaN is the only float with x != x
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return []

    resources = _get_cleaning_resources()
    stops = resources['stops']
    lemmatizer = resources['lemmatizer']

    tokens_final = []
    for token in resources['tokenizer'].tokenize(tweet):
        token = re.sub(r'#', '', token).lower()
        if token in stops:
            continue
        token = re.sub(r'http\S+', '', token)
        token = re.sub(r'www\S+', '', token)
        token = re.sub(r'\s\s+', '', token)
        if not token.isalnum():  # also rejects the empty string left by a removed link
            continue
        token = lemmatizer.lemmatize(token)
        if len(token) > 1:
            tokens_final.append(token)

    return tokens_final

In [29]:
# clean_tweet needs these NLTK corpora (stop-word list, WordNet and its multilingual
# extension). Fetch them here so this cell also works on a fresh kernel/environment
# instead of relying on a download cell that was executed out of order.
for corpus_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(corpus_name, quiet=True)

# Replace each raw training tweet with its list of cleaned, lemmatized tokens
tweets_train['Tweet'] = tweets_train['Tweet'].apply(clean_tweet)
tweets_train.head()

Unnamed: 0,Tweet,Label
0,"[chess, mlittle801, breezeslc, fitness, nofilt...",0
1,"[bet, thats, left, feeding, think, actually, b...",0
2,"[happiness, find, happiness, today, happiness,...",0
3,"[listen, beautiful, best, friend, singing, ama...",0
4,"[free, jesuismilo]",0


In [28]:
# Download the 'omw-1.4' NLTK data package.
# NOTE(review): the execution counters show this cell ran as In [28], i.e. before the
# cleaning cell (In [29]) that is displayed above it. Presumably the lemmatizer needs
# this data — consider moving the download ahead of the first clean_tweet call so
# Restart & Run All works in order.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yamato0615/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True