In [11]:
# Importing the necessary libraries
import pandas as pd
import seaborn as sb
import matplotlib as plt
import numpy as np
import sklearn
import matplotlib.pyplot as mtplt
import re
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf

[nltk_data] Downloading package punkt to C:\Users\Aishwarya
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Aishwarya
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aishwarya
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Before post-processing
tweetDataPre = pd.read_csv('combined_tweettypes.csv', index_col=False)
# After post-processing
tweetData = pd.read_csv('postprocessed_output.csv', index_col=False)
tweetData

Unnamed: 0,index,tweet,tweettype
0,0,@ZubairSabirPTI pls dont insult the word 'Molna',negative
1,1,@ArcticFantasy I would have almost took offens...,negative
2,2,@IllinoisLoyalty that Rutgers game was an abom...,negative
3,3,@CozanGaming that's what lisa asked before she...,negative
4,4,Sometimes I get mad over something so minuscul...,negative
...,...,...,...
43955,43955,@JohnLloydTaylor,neutral
43956,43956,Happy Mothers Day All my love,positive
43957,43957,Happy Mother's Day to all the mommies out ther...,positive
43958,43958,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,positive


In [3]:
# Defining functions for cleaning and formatting data
# Function to count characters
def count_characters(tweet):
    return len(tweet)

# Function to count words
def count_words(tweet):
    return len(tweet.split())

# Function to count capital letters
def count_capital_characters(tweet):
    count=0
    for i in tweet:
        if i.isupper():
            count+=1
    return count

# Function to count capital words
def count_capital_words(tweet):
    return sum(map(str.isupper,tweet.split()))

# Function to count punctuations
def count_punctuations(tweet):
    punctuations='!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~'
    d=dict()
    for i in punctuations:
        d[str(i)+' count']=tweet.count(i)
    return d 

# Function to count the number of words 
def count_words_in_quotes(tweet):
    x = re.findall("'.'|\".\" ", tweet)
    count=0
    if x is None:
        return 0
    else:
        for i in x:
            t=i[1:-1]
            count+=count_words(t)
        return count

# Function to count the number of sentences
def count_sentences(tweet):
    return len(nltk.sent_tokenize(tweet))

# Function to count unique words
def count_unique_words(tweet):
    return len(set(tweet.split()))

# Function to count hashtags
def count_hashtags(tweet):
    x = re.findall(r'(#w[A-Za-z0-9]*)', tweet)
    return len(x) 

# Function to count @ mentions
def count_mentions(tweet):
    x = re.findall(r'(@w[A-Za-z0-9]*)', tweet)
    return len(x)

# Function to count the number of stopwords
def count_stopwords(tweet):
    stop_words = set(stopwords.words('english'))  
    word_tokens = word_tokenize(tweet)
    stopwords_x = [w for w in word_tokens if w in stop_words]
    return len(stopwords_x)

In [4]:
# Applying the transformation to create columns
tweetData['charCount'] = tweetData["tweet"].apply(lambda x:count_characters(x))
tweetData['wordCount'] = tweetData["tweet"].apply(lambda x:count_words(x))
tweetData['sentenceCount'] = tweetData["tweet"].apply(lambda x:count_sentences(x))
tweetData['capCharCount'] = tweetData["tweet"].apply(lambda x:count_capital_characters(x))
tweetData['capWordCount'] = tweetData["tweet"].apply(lambda x:count_capital_words(x))
tweetData['quotedWordCount'] = tweetData["tweet"].apply(lambda x:count_words_in_quotes(x))
tweetData['stopwordCount'] = tweetData["tweet"].apply(lambda x:count_stopwords(x))
tweetData['uniqueWordCount'] = tweetData["tweet"].apply(lambda x:count_unique_words(x))
tweetData['hashCount'] = tweetData["tweet"].apply(lambda x:count_hashtags(x))
tweetData['mentionCount'] = tweetData["tweet"].apply(lambda x:count_mentions(x))
tweetData['punctCount'] = tweetData["tweet"].apply(lambda x:count_punctuations(x))
tweetData['avgWordLen'] = tweetData['charCount']/tweetData['wordCount']

In [5]:
# View the newly created columns
tweetData

Unnamed: 0,index,tweet,tweettype,charCount,wordCount,sentenceCount,capCharCount,capWordCount,quotedWordCount,stopwordCount,uniqueWordCount,hashCount,mentionCount,punctCount,avgWordLen
0,0,@ZubairSabirPTI pls dont insult the word 'Molna',negative,49,7,1,6,0,0,1,7,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",7.000000
1,1,@ArcticFantasy I would have almost took offens...,negative,81,14,1,4,2,0,5,13,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.785714
2,2,@IllinoisLoyalty that Rutgers game was an abom...,negative,114,20,3,6,0,0,8,20,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.700000
3,3,@CozanGaming that's what lisa asked before she...,negative,90,16,2,3,1,0,7,16,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.625000
4,4,Sometimes I get mad over something so minuscul...,negative,133,25,1,3,2,0,7,22,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.320000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43955,43955,@JohnLloydTaylor,neutral,16,1,1,3,0,0,0,1,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",16.000000
43956,43956,Happy Mothers Day All my love,positive,30,6,1,4,0,0,1,6,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.000000
43957,43957,Happy Mother's Day to all the mommies out ther...,positive,123,25,1,3,0,0,15,23,0,0,"{'! count': 1, '"" count': 0, '# count': 0, '$ ...",4.920000
43958,43958,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,positive,122,19,5,83,18,0,0,19,0,0,"{'! count': 7, '"" count': 0, '# count': 0, '$ ...",6.421053


In [6]:
# Function to modulate a tweet
def featureEngineering(tweet):
    # Lower case tweet
    tweetMod = tweet.lower()
    # Replace URLs with a space in the message
    tweetMod = re.sub('https?:\/\/[a-zA-Z0-9@:%._\/+~#=?&;-]*', ' ', tweetMod)
    # Replace ticker symbols with a space. The ticker symbols are any stock symbol that starts with $.
    tweetMod = re.sub('\$[a-zA-Z0-9]*', ' ', tweetMod)
    # Replace StockTwits usernames with a space. The usernames are any word that starts with @.
    tweetMod = re.sub('\@[a-zA-Z0-9]*', ' ', tweetMod)
    # Replace everything not a letter or apostrophe with a space
    tweetMod = re.sub('[^a-zA-Z\']', ' ', tweetMod)
    # Remove single letter words
    tweetMod = ' '.join( [w for w in tweetMod.split() if len(w)>1] )
    
    return tweetMod

# Process for all tweets
tweetData['modTweet'] = [featureEngineering(tweet) for tweet in tweetData['tweet']]

In [7]:
#View Tweet data
tweetData

Unnamed: 0,index,tweet,tweettype,charCount,wordCount,sentenceCount,capCharCount,capWordCount,quotedWordCount,stopwordCount,uniqueWordCount,hashCount,mentionCount,punctCount,avgWordLen,modTweet
0,0,@ZubairSabirPTI pls dont insult the word 'Molna',negative,49,7,1,6,0,0,1,7,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",7.000000,pls dont insult the word 'molna'
1,1,@ArcticFantasy I would have almost took offens...,negative,81,14,1,4,2,0,5,13,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.785714,would have almost took offense to this if actu...
2,2,@IllinoisLoyalty that Rutgers game was an abom...,negative,114,20,3,6,0,0,8,20,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.700000,that rutgers game was an abomination an affron...
3,3,@CozanGaming that's what lisa asked before she...,negative,90,16,2,3,1,0,7,16,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.625000,that's what lisa asked before she started ragi...
4,4,Sometimes I get mad over something so minuscul...,negative,133,25,1,3,2,0,7,22,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.320000,sometimes get mad over something so minuscule ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43955,43955,@JohnLloydTaylor,neutral,16,1,1,3,0,0,0,1,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",16.000000,
43956,43956,Happy Mothers Day All my love,positive,30,6,1,4,0,0,1,6,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.000000,happy mothers day all my love
43957,43957,Happy Mother's Day to all the mommies out ther...,positive,123,25,1,3,0,0,15,23,0,0,"{'! count': 1, '"" count': 0, '# count': 0, '$ ...",4.920000,happy mother's day to all the mommies out ther...
43958,43958,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,positive,122,19,5,83,18,0,0,19,0,0,"{'! count': 7, '"" count': 0, '# count': 0, '$ ...",6.421053,wassup beautiful follow me peep out my new hit...


In [8]:
# Function to tokenize the tweet
def tokenizeTweet(tweet, option):
  '''
  Tokenize the tweet, different methods - change as per accuracy and score requirements
    1: Use python split() function
    2: Use regex to extract alphabets plus 's and 't
    3: Use NLTK word_tokenize()
    4: Use NLTK word_tokenize(), remove stop words and apply lemmatization
  '''
  if option == 1:
    return tweet.split()
  elif option == 2:
    return re.findall(r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', tweet)
  elif option == 3:
    return [word for word in word_tokenize(tweet) if (word.isalpha()==1)]
  elif option == 4:
    words = [word for word in word_tokenize(tweet) if (word.isalpha()==1)]
    # Remove stop words
    stop = set(stopwords.words('english'))
    words = [word for word in words if (word not in stop)]
    # Lemmatize words (first noun, then verb)
    wnl = nltk.stem.WordNetLemmatizer()
    lemmatized = [wnl.lemmatize(wnl.lemmatize(word, 'n'), 'v') for word in words]
    return lemmatized
  else:
    return []

# Choosing to lemmatize the text
tweetData['modTweet'] = [tokenizeTweet(tweet, 4) for tweet in tweetData['tweet']]

In [9]:
# View tweet data
tweetData

Unnamed: 0,index,tweet,tweettype,charCount,wordCount,sentenceCount,capCharCount,capWordCount,quotedWordCount,stopwordCount,uniqueWordCount,hashCount,mentionCount,punctCount,avgWordLen,modTweet
0,0,@ZubairSabirPTI pls dont insult the word 'Molna',negative,49,7,1,6,0,0,1,7,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",7.000000,"[ZubairSabirPTI, pls, dont, insult, word]"
1,1,@ArcticFantasy I would have almost took offens...,negative,81,14,1,4,2,0,5,13,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.785714,"[ArcticFantasy, I, would, almost, take, offens..."
2,2,@IllinoisLoyalty that Rutgers game was an abom...,negative,114,20,3,6,0,0,8,20,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.700000,"[IllinoisLoyalty, Rutgers, game, abomination, ..."
3,3,@CozanGaming that's what lisa asked before she...,negative,90,16,2,3,1,0,7,16,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.625000,"[CozanGaming, lisa, ask, start, rag, I, call, ..."
4,4,Sometimes I get mad over something so minuscul...,negative,133,25,1,3,2,0,7,22,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.320000,"[Sometimes, I, get, mad, something, minuscule,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43955,43955,@JohnLloydTaylor,neutral,16,1,1,3,0,0,0,1,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",16.000000,[JohnLloydTaylor]
43956,43956,Happy Mothers Day All my love,positive,30,6,1,4,0,0,1,6,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",5.000000,"[Happy, Mothers, Day, All, love]"
43957,43957,Happy Mother's Day to all the mommies out ther...,positive,123,25,1,3,0,0,15,23,0,0,"{'! count': 1, '"" count': 0, '# count': 0, '$ ...",4.920000,"[Happy, Mother, Day, mommy, woman, man, long, ..."
43958,43958,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,positive,122,19,5,83,18,0,0,19,0,0,"{'! count': 7, '"" count': 0, '# count': 0, '$ ...",6.421053,"[niariley, WASSUP, BEAUTIFUL, FOLLOW, ME, PEEP..."


In [12]:
# Create the labels array for the y values
labels = np.array(tweetData['tweettype'])
y = []
for i in range(len(labels)):
    if labels[i] == 'positive':
        y.append(0)
    elif labels[i] == 'negative':
        y.append(1)
    elif labels[i] == 'neutral':
        y.append(2)
y = np.array(y)
labels = tf.keras.utils.to_categorical(y, 3, dtype="float32")
del y

In [13]:
#Function to count punctuations
def count_punctuations(tweet):
    punctuations='!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~'
    d=dict()
    for i in punctuations:
        d[str(i)+' count']=tweet.count(i)
    return d

# Creating unique columns for each punctuations
for i, tweet in enumerate(tweetData['tweet']):
  dictPunct = count_punctuations(tweet)
  for key,val in dictPunct.items():
    tweetData.loc[i,key] = val

In [18]:
# View dataframe information
tweetData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43960 entries, 0 to 43959
Data columns (total 47 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            43960 non-null  int64  
 1   tweet            43960 non-null  object 
 2   tweettype        43960 non-null  object 
 3   charCount        43960 non-null  int64  
 4   wordCount        43960 non-null  int64  
 5   sentenceCount    43960 non-null  int64  
 6   capCharCount     43960 non-null  int64  
 7   capWordCount     43960 non-null  int64  
 8   quotedWordCount  43960 non-null  int64  
 9   stopwordCount    43960 non-null  int64  
 10  uniqueWordCount  43960 non-null  int64  
 11  hashCount        43960 non-null  int64  
 12  mentionCount     43960 non-null  int64  
 13  avgWordLen       43960 non-null  float64
 14  modTweet         43960 non-null  object 
 15  ! count          43960 non-null  float64
 16  " count          43960 non-null  float64
 17  # count     

In [16]:
# Dropping the punctuation count column
tweetData = tweetData.drop('punctCount', axis = 1)

In [22]:
#Creating a set of unique words in the dataset
wordsSet = set()
for tweet in tweetData['modTweet']:
  listOfWords = tokenizeTweet(tweet, 4)
  wordsSet.update(listOfWords)

In [23]:
#Viewing the list of unique words
print(wordsSet)

{'gettt', 'kutcher', 'jasmi', 'brady', 'woopee', 'instead', 'upside', 'birder', 'magician', 'penis', 'foresee', 'cancellation', 'boulevard', 'translation', 'prebooked', 'martwo', 'yumb', 'sambradley', 'rescue', 'redefine', 'daya', 'trasfusion', 'coincidently', 'blackhawks', 'ubook', 'ranjan', 'jjj', 'iprocrastinate', 'yezzzir', 'vopseaua', 'kanin', 'euphamism', 'doneeeee', 'twitterloves', 'ani', 'confsuing', 'loveeee', 'mie', 'calfornia', 'nxt', 'georgia', 'guuud', 'elate', 'rtr', 'whistler', 'spaz', 'woo', 'lovable', 'liesboystell', 'thisisagoddream', 'excercise', 'length', 'picturisation', 'gene', 'jakki', 'symptons', 'ferman', 'unconference', 'silence', 'civilian', 'stopbullying', 'lenovo', 'erie', 'empanadas', 'joh', 'ketchup', 'doin', 'solid', 'baggage', 'homegirl', 'gooooooooood', 'memeber', 'dhoni', 'funhouse', 'lovve', 'vinyl', 'ima', 'depeche', 'prix', 'skyscraper', 'klemm', 'jeeeez', 'quirk', 'pokey', 'cookout', 'devout', 'toxic', 'shooeessss', 'chu', 'tunnel', 'bprohibiting'

In [25]:
#Total number of unique words
print(len(wordsSet))

27608


In [26]:
#Creating the lemmatized text
def lemmatizeTweet(tweet):
  words = [word for word in word_tokenize(tweet) if (word.isalpha()==1)]
  # Remove stop words
  stop = set(stopwords.words('english'))
  words = [word for word in words if (word not in stop)]
  # Lemmatize words (first noun, then verb)
  wnl = nltk.stem.WordNetLemmatizer()
  lemmatized = [wnl.lemmatize(wnl.lemmatize(word, 'n'), 'v') for word in words]
  return " ".join(lemmatized)

tweetData['lemmatizedText'] = tweetData["modTweet"].apply(lambda x:lemmatizeTweet(x))

In [27]:
#Viewing the dataframe
tweetData

Unnamed: 0,index,tweet,tweettype,charCount,wordCount,sentenceCount,capCharCount,capWordCount,quotedWordCount,stopwordCount,...,\ count,] count,^ count,_ count,` count,{ count,| count,} count,~ count,lemmatizedText
0,0,@ZubairSabirPTI pls dont insult the word 'Molna',negative,49,7,1,6,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pls dont insult word
1,1,@ArcticFantasy I would have almost took offens...,negative,81,14,1,4,2,0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,would almost take offense actually snap
2,2,@IllinoisLoyalty that Rutgers game was an abom...,negative,114,20,3,6,0,0,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rutgers game abomination affront god man must ...
3,3,@CozanGaming that's what lisa asked before she...,negative,90,16,2,3,1,0,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lisa ask start rag call heh
4,4,Sometimes I get mad over something so minuscul...,negative,133,25,1,3,2,0,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sometimes get mad something minuscule try ruin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43955,43955,@JohnLloydTaylor,neutral,16,1,1,3,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
43956,43956,Happy Mothers Day All my love,positive,30,6,1,4,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,happy mother day love
43957,43957,Happy Mother's Day to all the mommies out ther...,positive,123,25,1,3,0,0,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,happy mother day mommy woman man long someone day
43958,43958,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,positive,122,19,5,83,18,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,wassup beautiful follow peep new hit single ww...


In [28]:
tweetData.to_csv('feature-engineered-final.csv') #Export to csv