In [535]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [536]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

In [537]:
#loading the dataset and in read_csv, encoding has been used to decode the file as there are some characters that are compatible with the provided encoding
df = pd.read_csv("/Users/User/Downloads/Sentiment_Data.csv", encoding="ISO-8859-1")

In [538]:
df.head()

Unnamed: 0,Tweet,Sentiment
0,@_angelica_toy Happy Anniversary!!!....The Day...,Mild_Pos
1,@McfarlaneGlenda Happy Anniversary!!!....The D...,Mild_Pos
2,@thevivafrei @JustinTrudeau Happy Anniversary!...,Mild_Pos
3,@NChartierET Happy Anniversary!!!....The Day t...,Mild_Pos
4,@tabithapeters05 Happy Anniversary!!!....The D...,Mild_Pos


In [539]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451332 entries, 0 to 451331
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Tweet      451331 non-null  object
 1   Sentiment  451332 non-null  object
dtypes: object(2)
memory usage: 6.9+ MB


## Checking Missing Values

In [540]:
# Identify the empty rows
empty_rows = df[df['Tweet'].isnull()]
print("Empty Rows based on the 'Tweet' column:")
print(empty_rows)

Empty Rows based on the 'Tweet' column:
      Tweet Sentiment
75986   NaN   Neutral


#### The above row doesn't have any tweet and just having a value on Sentiment which doesn't provide us any useful information that why the sentiment is consisdered as Neutral. So, it's better to remove this one.

In [541]:
#dropping the row
df.drop(75986,axis=0,inplace=True)

In [542]:
df.shape

(451331, 2)

In [543]:
#### While exploring the data, in few rows we encountered with "#NAME?" which is not useful in our sentiment analysis, so we are dropping these rows.

In [544]:
# Count the number of rows with "#NAME?" in the "Tweet" column
num_name_rows=(df['Tweet']=='#NAME?').sum()
print("Number of Rows with '#NAME?':", num_name_rows)

Number of Rows with '#NAME?': 5


In [545]:
name_rows=df[df['Tweet'] == '#NAME?']
name_rows

Unnamed: 0,Tweet,Sentiment
1029,#NAME?,Mild_Pos
34287,#NAME?,Strong_Pos
284517,#NAME?,Strong_Pos
318448,#NAME?,Strong_Pos
407118,#NAME?,Strong_Pos


In [546]:
# Remove rows with "#NAME?" in the "Tweet" column inplace
df.drop(df[df['Tweet'] == '#NAME?'].index, inplace=True)

In [547]:
df.shape

(451326, 2)

## Checking Duplicates

In [548]:
# Count the number of duplicate rows
num_duplicate_rows = df.duplicated().sum()
print("Number of Duplicate Rows:", num_duplicate_rows)

Number of Duplicate Rows: 33


In [549]:
#checking duplicates
duplicate_rows=df[df.duplicated()]
print(duplicate_rows)

                                                    Tweet   Sentiment
4374    @unacceptfringe @thevivafrei @ShadoeDavis @Tru...    Mild_Pos
9471    @thelifeofalvo @usher933 @Baba_Ganoushy @Huddy...  Strong_Pos
14756   @TinfoilhatNick @WarcampaignYT @rkpall @PanSoy...  Strong_Pos
18440   @kylegriffin1 You all were behind vaccine pass...  Strong_Pos
29341   @WeAreCanProud If you support the demonstratio...  Strong_Pos
48723   @NChartierET Deering &amp; Braun haven't filed...  Strong_Pos
58390   @Brent_Secord @JasonLavigneMP @NadineGNess @rr...  Strong_Pos
61472   @acoyne Two-thirds of Canadians support use of...  Strong_Pos
61513   @brianlilley Two-thirds of Canadians support u...  Strong_Pos
70312   @nationalpost Two-thirds of Canadians support ...  Strong_Pos
73873   @globeandmail Two-thirds of Canadians support ...  Strong_Pos
74580   @Qualifyfor @Derricktgoat ðââï¸The Win...  Strong_Pos
77729   @MattWalshBlog Quoting #MattWalsh's own words ...    Mild_Pos
78596   @CTVNews The

#### After doing the research on keeping the duplicates or removing them, we would consider to remove them as we want to ensure more robust and unbaised model.

In [550]:
# Removing duplicate rows
df = df.drop_duplicates()

In [551]:
# Check the distribution of the sentiment column
print("Sentiment distribution:")
print(df['Sentiment'].value_counts())

Sentiment distribution:
Sentiment
Strong_Pos    233673
Neutral        77012
Mild_Pos       63999
Strong_Neg     42555
Mild_Neg       34054
Name: count, dtype: int64


## Removing patterns, mentions, url's, special characters, numbers and punctuations

In [552]:
# removes pattern in the input text
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt

In [553]:
#removing mentions
df['clean_tweet'] = df['Tweet'].apply(lambda x: remove_pattern(x, "@[\w]*"))

In [554]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
df = pd.DataFrame(df)

# Apply the function to remove URLs
df['clean_tweet'] = df['clean_tweet'].apply(remove_urls)

In [555]:
#Removing special characters, numbers and punctuations
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [556]:
import wordninja
# Function to clean tweet and split hashtags
def clean_tweet(tweet):
    # Split the tweet into words
    words = tweet.split()
    
    # Initialize a list to hold the cleaned words
    cleaned_words = []
    
    # Iterate through each word
    for word in words:
        # If the word is a hashtag
        if word.startswith('#'):
            # Remove the hash symbol and split the hashtag into words
            split_words = wordninja.split(word[1:])
            # Add the split words to the cleaned words list
            cleaned_words.extend(split_words)
        else:
            # Add the word as is to the cleaned words list
            cleaned_words.append(word)
    
    return ' '.join(cleaned_words)

# Apply the clean_tweet function to the 'clean_tweet' column
df['clean_tweet'] = df['clean_tweet'].apply(clean_tweet)

## Converting into lowercase

In [557]:
#convert to lowercase
df['clean_tweet'] = df['clean_tweet'].str.lower()

## Removing retweets

In [558]:
import pandas as pd

# Assuming df is your DataFrame containing the 'clean_tweet' column

# Remove 'rt' from the sentences
df['clean_tweet'] = df['clean_tweet'].str.replace(r'\brt\b', '', regex=True)

# Display the updated DataFrame
print(df)


                                                    Tweet   Sentiment  \
0       @_angelica_toy Happy Anniversary!!!....The Day...    Mild_Pos   
1       @McfarlaneGlenda Happy Anniversary!!!....The D...    Mild_Pos   
2       @thevivafrei @JustinTrudeau Happy Anniversary!...    Mild_Pos   
3       @NChartierET Happy Anniversary!!!....The Day t...    Mild_Pos   
4       @tabithapeters05 Happy Anniversary!!!....The D...    Mild_Pos   
...                                                   ...         ...   
451327  Gaza; Peace n' Freedom - Viva Palestina convoy...  Strong_Pos   
451328  Face of Defense: Soldier Finds Freedom in U.S....  Strong_Pos   
451329  Face of Defense: Soldier Finds Freedom in U.S....  Strong_Pos   
451330  Gaza; Peace n' Freedom - "Israel stops aid con...  Strong_Pos   
451331             @convoy 83 yes! get on freedom server!  Strong_Pos   

                                              clean_tweet  
0       happy anniversary the day the freedumb died in...  
1  

## Checking for repeated words

In [559]:
# Function to normalize repeated characters
def normalize_repeated_characters(text):
    return re.sub(r'^(?!.*(.)\1).{4,10}$', text)


## Replacing Slang Words

In [560]:
slang_dict = {
    'brb': 'be right back',
    'btw': 'by the way',
    'idk': "I don't know",
    'lol': 'laugh out loud',
    'omg': 'oh my god',
    'ttyl': 'talk to you later',
    'u': 'you',
    'ur': 'you are',
    'lmk': 'let me know',
    'smh': 'shaking my head',
    'tbh': 'to be honest',
    'rofl': 'rolling on the floor laughing',
    'wtf': 'what the f***',
    'bff': 'best friends forever',
    'fyi': 'for your information',
    'jk': 'just kidding',
    'np': 'no problem',
    'omw': 'on my way',
    'rn': 'right now',
    'thx': 'thanks',
    'afaik': 'as far as I know',
    'b4': 'before',
    'cya': 'see you',
    'gr8': 'great',
    'msg': 'message',
    'nvm': 'never mind',
    'plz': 'please',
    'sry': 'sorry',
    'w/': 'with',
    'w/o': 'without',
    'yolo': 'you only live once',
    'wya': 'where you at',
    'fomo': 'fear of missing out',
    'ikr': 'I know, right?',
    'imho': 'in my humble opinion',
    'irl': 'in real life',
    'lit': 'exciting or excellent',
    'hmu': 'hit me up',
    'bae': 'before anyone else',
    'n': 'and',
    'r': 'are',
    'b': 'with',
    'z': 'the',
    'th': 'the',
    'int': 'international',
    'tha': 'that',
    'ppl': 'people',
    'cdn': 'canadians',
    'jan': 'january'
}
# Function to replace slang words using the slang dictionary
def replace_slang(text, slang_dict):
    words = text.split()
    replaced_words = [slang_dict[word.lower()] if word.lower() in slang_dict else word for word in words]
    return " ".join(replaced_words)

# Apply replace_slang function to a DataFrame column
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: replace_slang(x, slang_dict))


## Mapping Contractions to Expanded Forms

In [561]:
# Dictionary mapping contractions to their expanded forms
contractions_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

# Function to expand contractions in a given text using the contractions dictionary
def expand_contractions(text, contractions_dict):
    # Regular expression pattern to find contractions in text
    pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')

    # Function to expand a matched contraction using the dictionary
    def expand_match(contraction):
        match = contraction.group(0)
        expanded = contractions_dict.get(match)
        if not expanded:
            expanded = contractions_dict.get(match.lower())
        return expanded

    # Apply the contraction expansion function to the text
    expanded_text = pattern.sub(expand_match, text)
    return expanded_text

# Apply contraction expansion to all cells of the DataFrame
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: expand_contractions(x, contractions_dict))

## Handling negatations

In [562]:
# Define a function to remove _neg suffix from each word in a string
def remove_neg_suffix(text):
    return ' '.join([word[:-4] if word.endswith('_neg') else word for word in text.split()])

# Apply the function to the 'clean_tweet' column
df['clean_tweet'] = df['clean_tweet'].apply(remove_neg_suffix)


#### Checking duplicates again as after cleaning some tweets are retweeted

In [563]:
# Count the number of duplicate rows
num_duplicate_rows = df.duplicated().sum()
print("Number of Duplicate Rows:", num_duplicate_rows)

Number of Duplicate Rows: 0


In [564]:
#checking duplicates
duplicate_rows=df[df.duplicated()]
print(duplicate_rows)

Empty DataFrame
Columns: [Tweet, Sentiment, clean_tweet]
Index: []


In [565]:
# Removing duplicate rows
df = df.drop_duplicates()

In [566]:
df.shape

(451293, 3)

#### Now we are checking the duplicates together for 'clean_tweet' and 'Sentiment' columns

In [567]:
# Count the number of duplicate rows based on 'clean_tweet' and 'Sentiment' columns
num_duplicate_rows = df.duplicated(subset=['clean_tweet', 'Sentiment']).sum()

# Display the number of duplicate rows
print("Number of duplicate rows:", num_duplicate_rows)


Number of duplicate rows: 77404


In [568]:
# Filter the DataFrame to show only the duplicate rows based on 'clean_tweet' and 'Sentiment' columns
duplicate_rows = df[df.duplicated(subset=['clean_tweet', 'Sentiment'], keep=False)]

# Display the duplicate rows
print(duplicate_rows)


                                                    Tweet   Sentiment  \
0       @_angelica_toy Happy Anniversary!!!....The Day...    Mild_Pos   
1       @McfarlaneGlenda Happy Anniversary!!!....The D...    Mild_Pos   
2       @thevivafrei @JustinTrudeau Happy Anniversary!...    Mild_Pos   
3       @NChartierET Happy Anniversary!!!....The Day t...    Mild_Pos   
4       @tabithapeters05 Happy Anniversary!!!....The D...    Mild_Pos   
...                                                   ...         ...   
451292  Outrageous :humanitarian aid convoy denied ent...  Strong_Neg   
451295  RT: @netraKL RT @juanajaafar: you can count on...  Strong_Pos   
451296  RT @juanajaafar: you can count on Israel and i...  Strong_Pos   
451307  .@rebeccay #GFM is Gaza Freedom March, a convo...  Strong_Pos   
451308  @rebeccay #GFM is Gaza Freedom March, a convoy...  Strong_Pos   

                                              clean_tweet  
0       happy anniversary the day the freedumb died in...  
1  

In [569]:
# Remove duplicate rows based on 'clean_tweet' and 'Sentiment' columns
df = df.drop_duplicates(subset=['clean_tweet', 'Sentiment'], keep='first')

# Display the cleaned DataFrame
print(df)


                                                    Tweet   Sentiment  \
0       @_angelica_toy Happy Anniversary!!!....The Day...    Mild_Pos   
8       Freedom Convoy as InkBlot Test https://t.co/au...  Strong_Pos   
13      @mark_slapinski Well itâs pretty easy to see...  Strong_Pos   
23      @JustinTrudeau You Belong In Jail.\n#VaccineMa...     Neutral   
25      #FreeDumbConvoy #FreedomConvoy #Freedumbers #f...     Neutral   
...                                                   ...         ...   
451327  Gaza; Peace n' Freedom - Viva Palestina convoy...  Strong_Pos   
451328  Face of Defense: Soldier Finds Freedom in U.S....  Strong_Pos   
451329  Face of Defense: Soldier Finds Freedom in U.S....  Strong_Pos   
451330  Gaza; Peace n' Freedom - "Israel stops aid con...  Strong_Pos   
451331             @convoy 83 yes! get on freedom server!  Strong_Pos   

                                              clean_tweet  
0       happy anniversary the day the freedumb died in...  
8  

In [570]:
df.shape

(373889, 3)

## Tokenization

In [571]:
df['clean_tweet'] = df['clean_tweet'].str.split()

## Removing stopwords

In [572]:
#removing stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = set(stopwords.words('english'))
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: [word for word in x if word not in stop])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [573]:
df.head()

Unnamed: 0,Tweet,Sentiment,clean_tweet
0,@_angelica_toy Happy Anniversary!!!....The Day...,Mild_Pos,"[happy, anniversary, day, freedumb, died, tune..."
8,Freedom Convoy as InkBlot Test https://t.co/au...,Strong_Pos,"[freedom, convoy, inkblot, test]"
13,@mark_slapinski Well itâs pretty easy to see...,Strong_Pos,"[well, pretty, easy, see, agenda, pierre, rema..."
23,@JustinTrudeau You Belong In Jail.\n#VaccineMa...,Neutral,"[belong, jail, vaccine, mandates, crimes, huma..."
25,#FreeDumbConvoy #FreedomConvoy #Freedumbers #f...,Neutral,"[free, dumb, convoy, freedom, convoy, free, du..."


## Lemmatization

In [574]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download WordNet data
nltk.download('wordnet')

# Initialize WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to lemmatize a list of words
def lemmatize_words(words):
    return [lemmatizer.lemmatize(word) for word in words]

# Apply lemmatization to the 'clean_tweet' column
df['clean_tweet'] = df['clean_tweet'].apply(lemmatize_words)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [577]:
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: ' '.join(x))

## Removing whitespaces

In [578]:
# Function to clean white spaces in text
def clean_whitespace(text):
    if isinstance(text, str):  # Check if the value is a string
        # Remove leading and trailing spaces
        cleaned_text = text.strip()
        # Replace multiple spaces with a single space using regex
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text
    else:
        return text  # Return non-string values as is

# Apply whitespace cleaning function to all text columns in the DataFrame
for col in df.columns:
    if df[col].dtype == 'object':  # Check if the column contains text data
        df[col] = df[col].apply(clean_whitespace)