# 2. Sentiment Analysis
### 2.0 Importing Libraries and Data

In [2]:
! pip install numpy pandas matplotlib seaborn dask textblob nltk

# Import libraries
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import textblob
import nltk
import dask.dataframe as dd

from zipfile import ZipFile

# Assert that Python version is 3.7 or higher
assert (sys.version_info.major == 3 and sys.version_info.minor >= 7), "This notebook requires Python 3.7 or higher."



In [3]:
# Import dataset to a dataframe
tweets = pd.read_csv('../data/modified-tweets.csv', encoding='latin-1')
tweets.head(3)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire


In [4]:
# Read the dataframe again and set column names
tweets = pd.read_csv('../data/modified-tweets.csv', encoding='latin-1', names=['zero', 'id', 'date', 'query', 'user', 'text'])
tweets.head(3)

Unnamed: 0,zero,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


### 2.1 Cleaning the Data  
**2.1.1 Removing Duplicates and Useless Data**

In [5]:
print(tweets['zero'].value_counts())
print(tweets['query'].value_counts())

0    800000
4    800000
Name: zero, dtype: int64
NO_QUERY    1600000
Name: query, dtype: int64


In [6]:
# Remove 'zero' and 'query' columns
tweets = tweets.drop(['zero', 'query'], axis=1)
tweets.head(3)

Unnamed: 0,id,date,user,text
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1467810672,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
2,1467810917,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...


In [7]:
# Find ids that appear more than once
tweets[tweets['id'].duplicated(keep=False)].sort_values('id').head()

Unnamed: 0,id,date,user,text
213,1467863684,Mon Apr 06 22:33:35 PDT 2009,DjGundam,Awwh babs... you look so sad underneith that s...
800261,1467863684,Mon Apr 06 22:33:35 PDT 2009,DjGundam,Awwh babs... you look so sad underneith that s...
275,1467880442,Mon Apr 06 22:38:04 PDT 2009,iCalvin,Haven't tweeted nearly all day Posted my webs...
800300,1467880442,Mon Apr 06 22:38:04 PDT 2009,iCalvin,Haven't tweeted nearly all day Posted my webs...
989,1468053611,Mon Apr 06 23:28:09 PDT 2009,mariejamora,@hellobebe I also send some updates in plurk b...


In [8]:
# Remove duplicates
tweets = tweets.drop_duplicates(subset='id', keep='first')
tweets.head(3)

Unnamed: 0,id,date,user,text
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1467810672,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
2,1467810917,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...


**2.1.2 Converting Data Types**

In [9]:
print(tweets.shape)

# Find how many rows contain 'PDT' in date
tweets[tweets['date'].str.contains('PDT')].shape

(1598315, 4)


(1598315, 4)

The string `PDT` causes an error when trying to convert the date to a datetime object. Since all tweets are from the same timezone, we will remove this string from the date column.

In [10]:
# Find data type of each column
print(tweets.dtypes)

# Remove 'PDT' from date column
tweets['date'] = tweets['date'].str.replace('PDT ', '')

# Convert the 'date' column to datetime and 'user' and 'text' to string
tweets['date'] = pd.to_datetime(tweets['date'], format='%a %b %d %H:%M:%S %Y')
tweets['user'] = tweets['user'].astype(str)
tweets['text'] = tweets['text'].astype(str)

# Reset index
tweets = tweets.reset_index(drop=True)
tweets.head(3)

id       int64
date    object
user    object
text    object
dtype: object


Unnamed: 0,id,date,user,text
0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...
2,1467810917,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...


**2.1.3 Handling Upper/lower Case**

In [11]:
# Convert all entries in text to lowercase
tweets['text'] = tweets['text'].str.lower()

In [12]:
# Find count of missing values in each column
tweets.isnull().sum()

id      0
date    0
user    0
text    0
dtype: int64

### 2.2 Using Preprocessing Techniques

In [13]:
# Make a column of array of words from text
tweets['words'] = tweets['text'].str.split()
tweets.head(3)

Unnamed: 0,id,date,user,text,words
0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - awww, t...","[@switchfoot, http://twitpic.com/2y1zl, -, aww..."
1,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his facebook by ...,"[is, upset, that, he, can't, update, his, face..."
2,1467810917,2009-04-06 22:19:53,mattycus,@kenichan i dived many times for the ball. man...,"[@kenichan, i, dived, many, times, for, the, b..."


**2.2.1 Removing Links and References**

In [14]:
# Remove URLs from words
tweets['words'] = tweets['words'].apply(lambda x: [word for word in x if not word.startswith('http')])
# Remove @usernames from words
tweets['words'] = tweets['words'].apply(lambda x: [word for word in x if not word.startswith('@')])
tweets.head(3)

Unnamed: 0,id,date,user,text,words
0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - awww, t...","[-, awww,, that's, a, bummer., you, shoulda, g..."
1,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his facebook by ...,"[is, upset, that, he, can't, update, his, face..."
2,1467810917,2009-04-06 22:19:53,mattycus,@kenichan i dived many times for the ball. man...,"[i, dived, many, times, for, the, ball., manag..."


**2.2.2 Removing Punctuation and Lemmatization**

In [26]:
nltk.download('wordnet')

# Remove punctuation from words
tweets['words'] = tweets['words'].apply(lambda x: [word.strip('"?,.-;') for word in x])
# Remove empty strings from words
tweets['words'] = tweets['words'].apply(lambda x: [word for word in x if len(word) > 0])
# Use lemmatization to reduce words to their root form
tweets['words'] = tweets['words'].apply(lambda x: [textblob.Word(word).lemmatize() for word in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amirsolei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
# Change worlds with 't apostrophe to two words
tweets['words'] = tweets['words'].apply(lambda x: [word.replace("'t", " not") for word in x])
# Change worlds with 's apostrophe to two words
tweets['words'] = tweets['words'].apply(lambda x: [word.replace("'s", " is") for word in x])
# Change worlds with 're apostrophe to two words
tweets['words'] = tweets['words'].apply(lambda x: [word.replace("'re", " are") for word in x])
# Change worlds with 've apostrophe to two words
tweets['words'] = tweets['words'].apply(lambda x: [word.replace("'ve", " have") for word in x])
# Change worlds with 'm apostrophe to two words
tweets['words'] = tweets['words'].apply(lambda x: [word.replace("'m", " am") for word in x])
# Change worlds with 'll apostrophe to two words
tweets['words'] = tweets['words'].apply(lambda x: [word.replace("'ll", " will") for word in x])
# Change worlds with 'd apostrophe to two words
tweets['words'] = tweets['words'].apply(lambda x: [word.replace("'d", " would") for word in x])

# Split words into two words if they contain a whitespace
tweets['words'] = tweets['words'].apply(lambda x: [word.split() for word in x])
tweets.head(3)

Unnamed: 0,id,date,user,text,words
0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - awww, t...","[[awww], [that], [is], [a], [bummer], [you], [..."
1,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his facebook by ...,"[[is], [upset], [that], [he], [can], [not], [u..."
2,1467810917,2009-04-06 22:19:53,mattycus,@kenichan i dived many times for the ball. man...,"[[i], [dived], [many], [time], [for], [the], [..."


In [28]:
# Flatten the list of words
tweets['words'] = tweets['words'].apply(lambda x: [item for sublist in x for item in sublist])
tweets.head(3)

Unnamed: 0,id,date,user,text,words
0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - awww, t...","[awww, that, is, a, bummer, you, shoulda, got,..."
1,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his facebook by ...,"[is, upset, that, he, can, not, update, his, f..."
2,1467810917,2009-04-06 22:19:53,mattycus,@kenichan i dived many times for the ball. man...,"[i, dived, many, time, for, the, ball, managed..."


In [29]:
# Lemmatize words using WordNetLemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()
tweets['words'] = tweets['words'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
tweets.head(3)

Unnamed: 0,id,date,user,text,words
0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - awww, t...","[awww, that, is, a, bummer, you, shoulda, got,..."
1,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his facebook by ...,"[is, upset, that, he, can, not, update, his, f..."
2,1467810917,2009-04-06 22:19:53,mattycus,@kenichan i dived many times for the ball. man...,"[i, dived, many, time, for, the, ball, managed..."


**2.2.3 Stopwords Removal and Stemming**

In [37]:
# Import progress bar (since this takes a while)
from tqdm import tqdm

# Count all words in tweets
all_words = [word for tokens in tweets['words'] for word in tokens]
len(all_words)

20649515

In [42]:
nltk.download('stopwords')
pd.options.mode.chained_assignment = None

# Remove stopwords with tqdm progress bar
stopwords = nltk.corpus.stopwords.words('english')
for i in tqdm(range(len(tweets['words']))):
    tweets['words'][i] = [word for word in tweets['words'][i] if word not in stopwords]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amirsolei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                                                           

In [45]:
# Stem words using PorterStemmer
stemmer = nltk.stem.PorterStemmer()
for i in tqdm(range(len(tweets['words']))):
    tweets['words'][i] = [stemmer.stem(word) for word in tweets['words'][i]]
tweets.head(3)

                                                           

Unnamed: 0,id,date,user,text,words
0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - awww, t...","[awww, bummer, shoulda, got, david, carr, thir..."
1,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his facebook by ...,"[upset, updat, facebook, text, might, cri, res..."
2,1467810917,2009-04-06 22:19:53,mattycus,@kenichan i dived many times for the ball. man...,"[dive, mani, time, ball, manag, save, 50%, res..."


### 2.3 Feature Extraction

In [84]:
# Copy tweets to a new dataframe
tweets_bof = tweets.copy()

from sklearn.feature_extraction.text import CountVectorizer

# Create a list of words from the words column
words = tweets_bof['words'].tolist()
words = [' '.join(word) for word in words]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(words)

Using the raw X for our bag of words needs allocation of `3.08TiB` for an array with shape `(1598315, 265119)`. This is too much for my computer to handle. Instead, we will use the preprocessed X.

In [85]:
# Find maximum element in X
print(X.max())
# Find the row responsible for the maximum element
X.argmax(axis=0)


43


matrix([[ 626286,  143187,   11844, ..., 1330587,  293167, 1381606]])

In [89]:
tweets_bof.iloc[143187, 3]

'awake with 1,000,000,000,000,000,000,000,000,000,000,000,000,000 things to do '

In [79]:
X[819633, 0]

0

In [81]:
vectorizer.get_feature_names_out()

array(['00', '000', '0000', ..., 'øºøµø', 'øºù', 'ù¾ø'], dtype=object)

In [83]:
# Choose the best 10000 features
vectorizer = CountVectorizer(max_features=10000)
X = vectorizer.fit_transform(words)
len(vectorizer.get_feature_names_out())

10000