In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


*Libraries and packages for Text-preprocessing.*

In [31]:
import pandas as pd
import nltk          
import string 
import re 

from nltk.tokenize import TweetTokenizer
import nltk
nltk.download('stopwords')
from wordcloud import WordCloud,STOPWORDS
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

*Read the csv file*

In [32]:
df = pd.read_csv('/content/data.csv', encoding= 'latin1')

   **DATETIME** 

  *   Deleting all rows that are not a 'datetime' type.

  *   Seperating timestamp into date and time columns.





In [33]:
# Deleting all rows that are not a 'datetime' type
df['tweet_timestamp'] = pd.to_datetime(df['tweet_timestamp'], errors='coerce')
df = df.dropna(subset=['tweet_timestamp'])

# Seperating timestamp into date and time columns
df['tweet_timestamp'] = pd.to_datetime(df['tweet_timestamp']) 
df = df.sort_values(['tweet_timestamp'])
df['date'] = df['tweet_timestamp'].astype(str).str.split(' ', expand=True)[0]
df['time'] = df['tweet_timestamp'].astype(str).str.split(' ', expand=True)[1]


**TEXT PREPROCESSING FOE VADER SENTIMENT ANALYSIS**
     (Five Heuristics that affect sentiment of a text.)

---



1.   Punctuation (eg:'?','!') are allowed.
2.   Capitalization ( eg:'I HATE YOU' is more negative than 'I hate you' )

1.   Degree modifiers ( eg: 'Batting lineup is **extremely** good' vs. 'Batting lineup is good'. The first sentence is clearly more positive than the latter which affects the sentiment score.)
2.   Constructive conjunction like 'but' shifts the polarity.

1.   Trigram examination to identify negation.


*   ' The food here isn't really that great'. Vader considers trigrams of words to identify negation i.e 'isn't really that', 'really that great' are some of the trigrams that will be formed. The first token containing isn't will negate the positive score in the next one.


The cell below performs basic data cleaning tasks. To seperate noise from the text URL, hashtags, username and other special characters have been removed using regex library.












In [34]:
# remove whitespace from text 
def remove_whitespace(Full_text): 
    line = re.sub('[\s]+', ' ', Full_text)
    return line 
# remove url
def remove_url(Full_text): 
    line = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','',Full_text)
    line = re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', Full_text)
    line = re.sub('https?://\S+|www\.\S+', '', Full_text)
    return line
# Remove mentions
def remove_mention(Full_text):
    line=re.sub(r'@\w+','',Full_text)
    return line 
    
# Remove hashtags
#def remove_hash(Full_text):
#    line=re.sub(r'#\w+','',Full_text)
#    return line

# Remove numbers
def remove_number(Full_text):
    line=re.sub(r'[0-9]+','',Full_text)
    return line
# Remove punctuations(except "?" )
def remove_punct(Full_text):
    line = re.sub(r'[]!"$%&\'()*+,./:;=#@[\\^_`{|}~-]+', '', Full_text)
    return line
#
def remove_thi_amp_ha_words(string):
    line=re.sub(r'\bamp\b|\bthi\b|\bha\b','',string)
    return line
# Remove non-ascii characters
def remove_non_ascii(Full_text):
    """
        Remove non-ASCII characters 
    """
    return re.sub(r'[^\x00-\x7f]',r'', Full_text)

VADER takes care of emojis and emoticons by converting them to their literal meaning. We have included this step to keep our input data same for K-means and VADER  approaches to evaluate sentiment.

In [35]:
# Thanks : https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py
EMOTICONS = {
    u":‑\)":"Happy face or smiley",
    u":\)":"Happy face or smiley",
    u":-\]":"Happy face or smiley",
    u":\]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-\)":"Happy face smiley",
    u":o\)":"Happy face smiley",
    u":-\}":"Happy face smiley",
    u":\}":"Happy face smiley",
    u":-\)":"Happy face smiley",
    u":c\)":"Happy face smiley",
    u":\^\)":"Happy face smiley",
    u"=\]":"Happy face smiley",
    u"=\)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B\^D":"Laughing, big grin or laugh with glasses",
    u":-\)\)":"Very happy",
    u":‑\(":"Frown, sad, andry or pouting",
    u":-\(":"Frown, sad, andry or pouting",
    u":\(":"Frown, sad, andry or pouting",
    u":‑c":"Frown, sad, andry or pouting",
    u":c":"Frown, sad, andry or pouting",
    u":‑<":"Frown, sad, andry or pouting",
    u":<":"Frown, sad, andry or pouting",
    u":‑\[":"Frown, sad, andry or pouting",
    u":\[":"Frown, sad, andry or pouting",
    u":-\|\|":"Frown, sad, andry or pouting",
    u">:\[":"Frown, sad, andry or pouting",
    u":\{":"Frown, sad, andry or pouting",
    u":@":"Frown, sad, andry or pouting",
    u">:\(":"Frown, sad, andry or pouting",
    u":'‑\(":"Crying",
    u":'\(":"Crying",
    u":'‑\)":"Tears of happiness",
    u":'\)":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-\*":"Kiss",
    u":\*":"Kiss",
    u":X":"Kiss",
    u";‑\)":"Wink or smirk",
    u";\)":"Wink or smirk",
    u"\*-\)":"Wink or smirk",
    u"\*\)":"Wink or smirk",
    u";‑\]":"Wink or smirk",
    u";\]":"Wink or smirk",
    u";\^\)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|":"Straight face",
    u":\|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)":"Angel, saint or innocent",
    u"O:\)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑\)":"Angel, saint or innocent",
    u"0:\)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)":"Angel, saint or innocent",
    u">:‑\)":"Evil or devilish",
    u">:\)":"Evil or devilish",
    u"\}:‑\)":"Evil or devilish",
    u"\}:\)":"Evil or devilish",
    u"3:‑\)":"Evil or devilish",
    u"3:\)":"Evil or devilish",
    u">;\)":"Evil or devilish",
    u"\|;‑\)":"Cool",
    u"\|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑\)":"Party all night",
    u"%‑\)":"Drunk or confused",
    u"%\)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑\|":"Dump",
    u"\(>_<\)":"Troubled",
    u"\(>_<\)>":"Troubled",
    u"\(';'\)":"Baby",
    u"\(\^\^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz":"Sleeping",
    u"\(\^_-\)":"Wink",
    u"\(\(\+_\+\)\)":"Confused",
    u"\(\+o\+\)":"Confused",
    u"\(o\|o\)":"Ultraman",
    u"\^_\^":"Joyful",
    u"\(\^_\^\)/":"Joyful",
    u"\(\^O\^\)／":"Joyful",
    u"\(\^o\^\)／":"Joyful",
    u"\(__\)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)":"Sad or Crying",
    u"\(/_;\)":"Sad or Crying",
    u"\(T_T\) \(;_;\)":"Sad or Crying",
    u"\(;_;":"Sad of Crying",
    u"\(;_:\)":"Sad or Crying",
    u"\(;O;\)":"Sad or Crying",
    u"\(:_;\)":"Sad or Crying",
    u"\(ToT\)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q\.Q":"Sad or Crying",
    u"T\.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"\(-\.-\)":"Shame",
    u"\(-_-\)":"Shame",
    u"\(一一\)":"Shame",
    u"\(；一_一\)":"Shame",
    u"\(=_=\)":"Tired",
    u"\(=\^\·\^=\)":"cat",
    u"\(=\^\·\·\^=\)":"cat",
    u"=_\^=	":"cat",
    u"\(\.\.\)":"Looking down",
    u"\(\._\.\)":"Looking down",
    u"\^m\^":"Giggling with hand covering mouth",
    u"\(\・\・?":"Confusion",
    u"\(?_?\)":"Confusion",
    u">\^_\^<":"Normal Laugh",
    u"<\^!\^>":"Normal Laugh",
    u"\^/\^":"Normal Laugh",
    u"\（\*\^_\^\*）" :"Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)":"Normal Laugh",
    u"\(^\^\)":"Normal Laugh",
    u"\(\^\.\^\)":"Normal Laugh",
    u"\(\^_\^\.\)":"Normal Laugh",
    u"\(\^_\^\)":"Normal Laugh",
    u"\(\^\^\)":"Normal Laugh",
    u"\(\^J\^\)":"Normal Laugh",
    u"\(\*\^\.\^\*\)":"Normal Laugh",
    u"\(\^—\^\）":"Normal Laugh",
    u"\(#\^\.\^#\)":"Normal Laugh",
    u"\（\^—\^\）":"Waving",
    u"\(;_;\)/~~~":"Waving",
    u"\(\^\.\^\)/~~~":"Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~":"Waving",
    u"\(T_T\)/~~~":"Waving",
    u"\(ToT\)/~~~":"Waving",
    u"\(\*\^0\^\*\)":"Excited",
    u"\(\*_\*\)":"Amazed",
    u"\(\*_\*;":"Amazed",
    u"\(\+_\+\) \(@_@\)":"Amazed",
    u"\(\*\^\^\)v":"Laughing,Cheerful",
    u"\(\^_\^\)v":"Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)":"Headphones,Listening to music",
    u'\(-"-\)':"Worried",
    u"\(ーー;\)":"Worried",
    u"\(\^0_0\^\)":"Eyeglasses",
    u"\(\＾ｖ\＾\)":"Happy",
    u"\(\＾ｕ\＾\)":"Happy",
    u"\(\^\)o\(\^\)":"Happy",
    u"\(\^O\^\)":"Happy",
    u"\(\^o\^\)":"Happy",
    u"\)\^o\^\(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o\.O":"Surpised",
    u"\(o\.o\)":"Surprised",
    u"oO":"Surprised",
    u"\(\*￣m￣\)":"Dissatisfied",
    u"\(‘A`\)":"Snubbed or Deflated"
}
# Convert emoticons to words
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text

Applying the text cleaning functions on the text data.


In [36]:
df['Full_text']=df['Full_text'].apply(lambda x:remove_whitespace(x))
df['Full_text']=df['Full_text'].apply(lambda x:convert_emoticons(x))
df['Full_text']=df['Full_text'].apply(lambda x:remove_mention(str(x)))
#df['Full_text']=df['Full_text'].apply(lambda x:remove_hash(str(x)))
df['Full_text']=df['Full_text'].apply(lambda x:remove_url(x))
df['Full_text']=df['Full_text'].apply(lambda x:remove_number(x))
df['Full_text']=df['Full_text'].apply(lambda x:remove_punct(x))
df['Full_text']=df['Full_text'].apply(lambda x:remove_thi_amp_ha_words(x))
df['Full_text']=df['Full_text'].apply(lambda x:remove_non_ascii(x))


Tokenizing the cleaned data using a special tweet tokenizer instead of word tokenizer. Both works in the same way, to split a sentence into words just that former keeps the hashtags intact.

In [37]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
df['tokens'] = df['Full_text'].apply(tknzr.tokenize)



*    Stopwords like 'a', 'the',  'we' etc. are very common and doesn't 
 contribute to the sentiment score.
*   Some other common words populating our corpus have also been added to the STOPWORDS library.



In [38]:
stopwords = nltk.corpus.stopwords.words('english')
# Adding irrelevant words in our STOPWORDS library
newStopWords = ['played','playing','u','A','today','if','th','would','To','dont','v','run','st','go','match','test','team','Test','I','series','cricket','The','Team','Th']
stopwords.extend(newStopWords)

def remove_stopwords(text):
    text = [word for word in text if word not in stopwords]
    return text
    
df['No_stopwords'] = df['tokens'].apply(lambda x: remove_stopwords(x))

**Lemmatizing the tokens**


*   Lemmatizer function reduces the word to its root form using its POS tag.
*   Eg: Running, walking,swimming to run, walk,swim.







In [39]:
lemmatizer = nltk.WordNetLemmatizer()

def lemmatize_words(text):
    text = [lemmatizer.lemmatize(word) for word in text]
    return text

df['lemmatized'] = df['No_stopwords'].apply(lambda x: lemmatize_words(x))

Cell below join the tokens in the lemmatized list of words and subsequently extract it from the list as a single item string in column text_modified.

In [40]:
def join(text):
    text = [' '.join(str(j) for j in text)]
    return text

df['joined_string'] = df['lemmatized'].apply(lambda x: join(x))
df['text_modified'] =  df['joined_string'].apply(lambda x: x[0])

In [41]:
# Most frequent words      
from collections import Counter
cnt = Counter()
for text in df["text_modified"].values:
    for word in text.split():
        cnt[word] += 1

# Removing most frequent words
#FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
#def remove_freqwords(text):
   #    return " ".join([word for word in str(text).split() if word not in FREQWORDS])
        
cnt.most_common(10)
# Removing rare words
n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df['text_modified']=df['text_modified'].apply(lambda x:remove_rarewords(x))
cnt.most_common(10)

[('India', 1843),
 ('?', 1143),
 ('Australia', 836),
 ('Indian', 829),
 ('win', 771),
 ('day', 717),
 ('Gabba', 543),
 ('player', 535),
 ('one', 493),
 ('like', 472)]

In [42]:
df.to_csv(r'C:\Users\91987\Desktop\Twitter Sentiment Analysis\File Name.csv', index = False)