# Cape Town, South Africa Analysis of Twitter posts

#### Ajayi Olabode

### Abstract

This is a simple program that analysis the Cape Town, South Africa tweets posts. We grouped the data based on location and the program then use the location to find the most frequency words users used on Twitter. Furthermore, we can used the analysis to find users ID and we predict the different emotions attached to the text. However, in this report we consider using the `user id`,`location`,`text`. 

#### Used Python Libraries
This program data was pre-processed using pandas, NLTK, gensim, and numpy libraries within the Python virtual environment. Plots were created using plotly.

In [1]:
# -*- coding: utf-8 -*-
#!/usr/bin/python
from collections import Counter
import nltk
import pandas as pd
from emoticons import EmoticonDetector
import re as regex
import numpy as np
import plotly
from plotly import graph_objs
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from time import time
import gensim

# plotly configuration
plotly.offline.init_notebook_mode()

ImportError: No module named emoticons

In [26]:
class TwitterData_Initialize():
    data = []
    processed_data = []
    wordlist = []

    data_model = None
    data_labels = None
    is_testing = False
    
    def initialize(self, csv_file, is_testing_set=False, from_cached=None):
        if from_cached is not None:
            self.data_model = pd.read_csv(from_cached)
            return

        self.is_testing = is_testing_set

        if not is_testing_set:
            self.data = pd.read_csv(csv_file, header=0, names=["id", "location", "text"])

        else:
            self.data = pd.read_csv(csv_file, header=0, names=["id", "text"],dtype={"id":"int64","text":"str"},nrows=4000)
            not_null_text = 1 ^ pd.isnull(self.data["text"])
            not_null_id = 1 ^ pd.isnull(self.data["id"])
            self.data = self.data.loc[not_null_id & not_null_text, :]

        self.processed_data = self.data
        self.wordlist = []
        self.data_model = None
        self.data_labels = None

In [27]:
data = TwitterData_Initialize()
data.initialize("/Users/boratonaj/Desktop/Data_Science/Assessment/CT_Tweets_Analysis_Report/data/subset.csv")
data.processed_data.head(10)

Unnamed: 0,id,location,text
0,8.55e+17,South Africa,RT @ZackieAchmat: #UitmetZille #UitMetZuma - a...
1,8.55e+17,South Africa,RT @ZackieAchmat: #UitmetZille #UitMetZuma - a...
2,8.55e+17,South Africa,RT @ReclaimCT: Tonight @ReclaimCT came for sup...
3,8.55e+17,South Africa,RT @ReclaimCT: Tonight @ReclaimCT came for sup...
4,8.55e+17,cape town,The demographics in that room (beyond Reclaim ...
5,8.55e+17,cape town,The demographics in that room (beyond Reclaim ...
6,8.55e+17,"Cape Town, South Africa",@ReclaimCT ...refer the part \rI prefer to liv...
7,8.55e+17,"Cape Town, South Africa",@ReclaimCT ...refer the part \rI prefer to liv...
8,8.55e+17,"Cape Town, South Africa",RT @ReclaimCT: Tonight @ReclaimCT came for sup...
9,8.55e+17,"Cape Town, South Africa",RT @ReclaimCT: Tonight @ReclaimCT came for sup...


In [28]:
class TwitterCleanuper:
    def iterate(self):
        for cleanup_method in [self.remove_urls,
                               self.remove_usernames,
                               self.remove_na,
                               self.remove_special_chars,
                               self.remove_numbers]:
            yield cleanup_method

    @staticmethod
    
    def remove_by_regex(tweets, regexp):
        tweets.loc[:, "text"].replace(regexp, "", inplace=True)
        return tweets

    def remove_urls(self, tweets):
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"http.?://[^\s]+[\s]?"))

    def remove_na(self, tweets):
        return tweets[tweets["text"] != "Not Available"]

    def remove_special_chars(self, tweets):  # it unrolls the hashtags to normal words
        for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$",
                                                                     "@", "%", "^", "*", "(", ")", "{", "}",
                                                                     "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                     "!", "?", ".", "'",
                                                                     "--", "---", "#"]):
            tweets.loc[:, "text"].replace(remove, "", inplace=True)
        return tweets

    def remove_usernames(self, tweets):
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?"))

    def remove_numbers(self, tweets):
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))


In [29]:
class TwitterData_Cleansing(TwitterData_Initialize):
    def __init__(self, previous):
        self.processed_data = previous.processed_data
    def cleanup(self, cleanuper):
        t = self.processed_data
        for cleanup_method in cleanuper.iterate():
            if not self.is_testing:
                t = cleanup_method(t)
            else:
                if cleanup_method.__name__ != "remove_na":
                    t = cleanup_method(t)

        self.processed_data = t

In [30]:
data = TwitterData_Cleansing(data)
data.cleanup(TwitterCleanuper())
data.processed_data.head(30)

Unnamed: 0,id,location,text
0,8.55e+17,South Africa,RT UitmetZille UitMetZuma as speaks to an all...
1,8.55e+17,South Africa,RT UitmetZille UitMetZuma as speaks to an all...
2,8.55e+17,South Africa,RT Tonight came for supper with We done with t...
3,8.55e+17,South Africa,RT Tonight came for supper with We done with t...
4,8.55e+17,cape town,The demographics in that room beyond Reclaim T...
5,8.55e+17,cape town,The demographics in that room beyond Reclaim T...
6,8.55e+17,"Cape Town, South Africa",refer the part \rI prefer to live where I like
7,8.55e+17,"Cape Town, South Africa",refer the part \rI prefer to live where I like
8,8.55e+17,"Cape Town, South Africa",RT Tonight came for supper with We done with t...
9,8.55e+17,"Cape Town, South Africa",RT Tonight came for supper with We done with t...


In [33]:
import re
class TwitterData_TokenStem(TwitterData_Cleansing):
    def __init__(self, previous):
        self.processed_data = previous.processed_data
    emoticons_str = r"""
        (?:
            [:=;] # Eyes
            [oO\-]? # Nose (optional)
            [D\)\]\(\]/\\OpP] # Mouth
        )"""

    regex_str = [
        emoticons_str,
        r'<[^>]+>', # HTML tags
        r'(?:@[\w_]+)', # @-mentions
        r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

        r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
        r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
        r'(?:[\w_]+)', # other words
        r'(?:\S)' # anything else
    ]

    tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
    emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
    

    def stem(self, stemmer=nltk.PorterStemmer()):
        def stem_and_join(row):
            row["text"] = list(map(lambda str: stemmer.stem(str.lower()), row["text"]))
            return row

        self.processed_data = self.processed_data.apply(stem_and_join, axis=1)

    def tokenize(self, tokenizer=nltk.word_tokenize):
        def tokenize_row(row):
            row["text"] = tokenizer(row["text"])
            row["tokenized_text"] = [] + row["text"]
            return row

        self.processed_data = self.processed_data.apply(tokenize_row, axis=1)

    def preprocess(s, lowercase=False):
        tokens = tokenize(processed_data)
        if lowercase:
            tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
        return tokens
        

In [35]:
data = TwitterData_TokenStem(data)
data.tokenize()
#data.stem()
data.processed_data.head(5)

TypeError: ('expected string or buffer', u'occurred at index 0')

In [10]:
def most_words():
    words = Counter()
    for idx in data.processed_data.index:
        words.update(data.processed_data.loc[idx, "text"])
    return words.most_common(50)
most_words()

[(' ', 8344),
 ('e', 4604),
 ('a', 3472),
 ('t', 3419),
 ('i', 2928),
 ('o', 2768),
 ('s', 2646),
 ('n', 2550),
 ('r', 1886),
 ('l', 1822),
 ('h', 1722),
 ('d', 1288),
 ('c', 1222),
 ('u', 1112),
 ('p', 1101),
 ('m', 920),
 ('g', 903),
 ('T', 801),
 ('_', 721),
 ('f', 654),
 ('y', 628),
 ('w', 606),
 ('R', 570),
 ('b', 549),
 ('\xd1', 348),
 ('v', 321),
 ('k', 273),
 ('S', 228),
 ('A', 215),
 ('C', 206),
 ('W', 168),
 ('I', 136),
 ('P', 118),
 ('Z', 96),
 ('E', 95),
 ('B', 93),
 ('F', 82),
 ('G', 78),
 ('M', 77),
 ('H', 73),
 ('\xaa', 69),
 ('K', 68),
 ('\xe9', 66),
 ('x', 63),
 ('U', 62),
 ('\r', 57),
 ('L', 57),
 ('j', 56),
 ('D', 54),
 ('N', 53)]

In [11]:
from nltk.corpus import stopwords
import string
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']


The most commont words (as expected) are the typical english stopwords. We will filter them out, however, as purpose of this analysis is to determine sentiment, words like "cape town" and "south africa" can influence it greatly. Having this in mind, this word will be whitelisted.Try calling read_csv with encoding='latin1', encoding='iso-8859-1' or encoding='cp1252'; these the various encodings found on Windows.

In [12]:
stopwords=nltk.corpus.stopwords.words("english")
whitelist = ["Cape Town", "South Africa"]
for idx, stop_word in enumerate(stopwords):
    if stop_word not in whitelist:
        del words[stop_word]
words.most_common(5)

NameError: name 'words' is not defined

In [13]:
class TwitterData_Wordlist(TwitterData_TokenStem):
    def __init__(self, previous):
        self.processed_data = previous.processed_data
        
    whitelist = ["Cape Town", "South Africa"]
    wordlist = []
        
    def build_wordlist(self, min_occurrences=3, max_occurences=500, stopwords=nltk.corpus.stopwords.words("english"),
                       whitelist=None):
        self.wordlist = []
        whitelist = self.whitelist if whitelist is None else whitelist
        import os
        if os.path.isfile("/Users/boratonaj/Desktop/Data_Science/Assessment/CT_Tweets_Analysis_Report/data/wordlist.csv"):
            word_df = pd.read_csv("/Users/boratonaj/Desktop/Data_Science/Assessment/CT_Tweets_Analysis_Report/data/wordlist.csv")
            word_df = word_df[word_df["occurrences"] > min_occurrences]
            self.wordlist = list(word_df.loc[:, "word"])
            return

        words = Counter()
        for idx in self.processed_data.index:
            words.update(self.processed_data.loc[idx, "text"])

        for idx, stop_word in enumerate(stopwords):
            if stop_word not in whitelist:
                del words[stop_word]

        word_df = pd.DataFrame(data={"word": [k for k, v in words.most_common() if min_occurrences < v < max_occurences],
                                     "occurrences": [v for k, v in words.most_common() if min_occurrences < v < max_occurences]},
                               columns=["word", "occurrences"])

        word_df.to_csv("/Users/boratonaj/Desktop/Data_Science/Assessment/CT_Tweets_Analysis_Report/data/wordlist.csv", index_label="idx")
        self.wordlist = [k for k, v in words.most_common() if min_occurrences < v < max_occurences]

NameError: name 'TwitterData_TokenStem' is not defined

In [303]:
data = TwitterData_Wordlist(data)
data.build_wordlist()

In [304]:
words = pd.read_csv("/Users/boratonaj/Desktop/Data_Science/Assessment/CT_Tweets_Analysis_Report/data/wordlist.csv",encoding='latin1')
x_words = list(words.loc[0:10,"word"])
x_words.reverse()
y_occ = list(words.loc[0:10,"occurrences"])
y_occ.reverse()

dist = [
    graph_objs.Bar(
        x=y_occ,
        y=x_words,
        orientation="h"
)]
plotly.offline.iplot({"data":dist, "layout":graph_objs.Layout(title="Top words in built wordlist")})

In [305]:
class TwitterData_BagOfWords(TwitterData_Wordlist):
    def __init__(self, previous):
        self.processed_data = previous.processed_data
        self.wordlist = previous.wordlist
    
    def build_data_model(self):
        label_column = []
        if not self.is_testing:
            label_column = ["label"]

        columns = label_column + list(
            map(lambda w: w + "_cpt",self.wordlist))
        labels = []
        rows = []
        for idx in self.processed_data.index:
            current_row = []

            if not self.is_testing:
                # add label
                current_label = self.processed_data.loc[idx, "location"]
                labels.append(current_label)
                current_row.append(current_label)

            # add bag-of-words
            tokens = set(self.processed_data.loc[idx, "text"])
            for _, word in enumerate(self.wordlist):
                current_row.append(1 if word in tokens else 0)

            rows.append(current_row)

        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        return self.data_model, self.data_labels

In [306]:
data = TwitterData_BagOfWords(data)
bow, labels = data.build_data_model()
bow.head(5)

Unnamed: 0,label,RT_cpt,Tafelberg_cpt,amp_cpt,Khayelitsha_cpt,Cape_cpt,The_cpt,apartheid_cpt,https_�__cpt,movement_cpt,...,rising_cpt,inner_cpt,raising_cpt,Africa_cpt,ur_cpt,agenda_cpt,fabricating_cpt,deserve_cpt,daily_cpt,push_cpt
0,South Africa,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,South Africa,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,South Africa,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,South Africa,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,cape town,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [307]:
grouped = bow.groupby(["label"]).sum()
words_to_visualize = []
sentiments = ["South Africa","Cape Town, South Africa","Cape Town"]
#get the most 7 common words for every sentiment
for sentiment in sentiments:
    words = grouped.loc[sentiment,:]
    words.sort_values(inplace=True,ascending=False)
    for w in words.index[:7]:
        if w not in words_to_visualize:
            words_to_visualize.append(w)
            
            
#visualize it
plot_data = []
for sentiment in sentiments:
    plot_data.append(graph_objs.Bar(
            x = [w.split("_")[0] for w in words_to_visualize],
            y = [grouped.loc[sentiment,w] for w in words_to_visualize],
            name = sentiment
    ))
    
plotly.offline.iplot({
        "data":plot_data,
        "layout":graph_objs.Layout(title="Most common words across sentiments")
    })