In [2]:
# Step 0. Load libraries and custom modules
# Dataframes and matrices ----------------------------------------------
import pandas as pd
import numpy as np
# Graphics -------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
plt.style.use('tableau-colorblind10')
# Mathematical functions -----------------------------------------------
from scipy.stats import norm
# Text processors ------------------------------------------------------
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
# Preprocessing --------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Text modeling --------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Model creating -------------------------------------------------------
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
# Metrics --------------------------------------------------------------
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import make_scorer
# Custom functions -----------------------------------------------------
from text_preprocessing import clean_stopwords 

In [6]:
# Word-sentiment lexicon used as the reference dataset for text analysis.
# Published in: Saif M. Mohammad and Peter Turney (2013),
# "Crowdsourcing a Word-Emotion Association Lexicon",
# Computational Intelligence, 29(3): 436-465.
# For research and educational purposes only.
# URL: http://saifmohammad.com/WebPages/lexicons.html
# The file is read with explicit column names and only the rows whose
# polarity value equals 1 are kept.
nrc = (
    pd.read_csv(
        '../data/raw/NRC.csv',
        names=['word', 'sentiment', 'polarity'],
    )
    .query('polarity == 1')
)

In [7]:
# Display the filtered lexicon (only rows with polarity == 1 remain);
# the notebook shows a truncated view of the frame.
nrc

Unnamed: 0,word,sentiment,polarity
19,abacus,trust,1
23,abandon,fear,1
25,abandon,negative,1
27,abandon,sadness,1
30,abandoned,anger,1
...,...,...,...
141461,zest,anticipation,1
141464,zest,joy,1
141466,zest,positive,1
141469,zest,trust,1


In [4]:
# Step 1. Load the data
# Data extracted from https://www.thetrumparchive.com
# Data case: During the 2016 US presidential election, the candidate
# Donald Trump used Twitter to communicate with potential voters.
# The campaign ran from 2015-06-17 to 2016-11-08.
# We'll try to analyze these campaign tweets sent from iPhone and Android.
# Data description
# source -> device of origin
# id_str -> unique identifier
# text -> tweet text content
# created_at -> date of creation, stored as text (e.g. 2015-11-30T14:09:34Z)
# retweet_count -> count of retweets (diffusion)
# in_reply_to_user_id_str -> if it's a reply, the id of the user replied to
# favorite_count -> count of users that liked the tweet
# is_retweet -> whether the post is a retweet
# 1.1 Open data and get a glimpse
df_raw = pd.read_csv('../data/raw/trump_tweets.csv')
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20761 entries, 0 to 20760
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   source                   20761 non-null  object 
 1   id_str                   20761 non-null  int64  
 2   text                     20761 non-null  object 
 3   created_at               20761 non-null  object 
 4   retweet_count            20761 non-null  int64  
 5   in_reply_to_user_id_str  2442 non-null   float64
 6   favorite_count           20761 non-null  int64  
 7   is_retweet               20761 non-null  bool   
dtypes: bool(1), float64(1), int64(3), object(3)
memory usage: 1.1+ MB


In [5]:
# 1.2 Sample some observations
# A fixed random_state keeps the displayed rows identical on every
# Restart & Run All; without it each run shows a different sample.
df_raw.sample(10, random_state=42)

Unnamed: 0,source,id_str,text,created_at,retweet_count,in_reply_to_user_id_str,favorite_count,is_retweet
12471,Twitter for iPhone,671330229596659712,"How is Chris Christie running the state of NJ,...",2015-11-30T14:09:34Z,3298,,4655,False
13260,Twitter for Android,640133656393195520,"The Dallas event in two weeks, at the American...",2015-09-05T12:05:31Z,1579,,1845,False
2881,Twitter Web Client,233915014099132416,All recent Presidents have released their tran...,2012-08-10T13:17:39Z,328,,52,False
9104,Twitter Web Client,296715825891659776,Oh the wonders of the Arab Spring. Our new all...,2013-01-30T20:25:40Z,237,,52,False
7885,Twitter Web Client,313735229430964227,Vattenfall CEO stated that the company needed ...,2013-03-18T19:34:42Z,17,,4,False
8672,Twitter Web Client,302063403407650817,It is important to think positively. Negative ...,2013-02-14T14:35:02Z,1376,,564,False
15223,Twitter for Android,803434300846862336,.@CNN is so embarrassed by their total (100%)...,2016-11-29T03:03:59Z,25140,,91706,False
6430,Twitter Web Client,344822631507324930,@scottymcd1980 A round of golf compliments o...,2013-06-12T14:24:56Z,7,234946101.0,11,False
7755,Twitter Web Client,314771578850275329,Remember the golden rule of negotiating: He wh...,2013-03-21T16:12:47Z,715,,325,False
5283,Twitter for Android,376482993353019392,@dexterpugh Great looking couple-good to hav...,2013-09-07T23:11:54Z,0,392081766.0,1,False


In [None]:
# Step 2. Transform and wrangle the data
# 2.1 Work on an independent (deep) copy so df_raw stays untouched
df_interim = df_raw.copy(deep=True)

In [None]:
# 2.2 Remove the columns that are not used in the analysis
df_interim = df_interim.drop(
    columns=['id_str', 'is_retweet', 'in_reply_to_user_id_str']
)

In [None]:
# 2.3 Convert columns to the right format
# `pd.to_datetime` is used instead of `astype('datetime64')`: casting to a
# unit-less 'datetime64' dtype raises a TypeError on pandas >= 2.0.
# The raw strings end in 'Z', so they are parsed as UTC; the timezone is then
# dropped so the column stays timezone-naive (values are still UTC wall time).
df_interim['created_at'] = (
    pd.to_datetime(df_interim['created_at'], utc=True).dt.tz_localize(None)
)
# Device names repeat across rows, so store them as a categorical column.
df_interim['source'] = pd.Categorical(df_interim['source'])

In [None]:
# 2.4 Keep only the tweets created inside the campaign window
# NOTE(review): a date-only bound is compared as midnight, so tweets posted
# later on 2016-11-08 are excluded — confirm this is intended.
campaign_start = '2015-06-17'
campaign_end = '2016-11-08'
in_campaign = (
    (df_interim['created_at'] >= campaign_start)
    & (df_interim['created_at'] <= campaign_end)
)
df_interim = df_interim.loc[in_campaign]

In [None]:
# 2.5 TODO: timestamps are in UTC; convert them to US Eastern time (EST)


In [None]:
# 2.6 Filter source for analysis
# Keep only tweets whose source mentions iPhone or Android; one regex
# alternation covers both devices.
from_phone = df_interim['source'].str.contains('iPhone|Android')
# `.assign` returns a new frame, so dropping the now-unused categories does
# not write into a filtered slice (which raises SettingWithCopyWarning).
df_interim = df_interim.loc[from_phone].assign(
    source=lambda frame: frame['source'].cat.remove_unused_categories()
)

In [None]:
# 2.7 Save and create a copy for analysis
# The interim file is written under the same '../data' tree that the raw
# inputs are read from.
# NOTE(review): CSV does not keep dtypes (datetime, categorical); they must be
# re-applied if this file is reloaded.
df_interim.to_csv('../data/interim/trump_tweets.csv', index=False)
df = df_interim.copy()

In [None]:
# Step 3. Perform EDA
# 3.1 Get basic info: columns, non-null counts, dtypes and memory usage
# of the filtered analysis frame
df.info()

In [None]:
# 3.2 Get a sample
# A fixed random_state keeps the displayed rows identical on every
# Restart & Run All; without it each run shows a different sample.
df.sample(10, random_state=42)

In [None]:
# 3.3 Describe numerical and datetime data
# `datetime_is_numeric=True` asks pandas to summarize the datetime column
# with numeric statistics alongside the numeric columns.
# NOTE(review): this keyword was removed in pandas 2.0 (where that behaviour
# is the default) — confirm the pandas version this notebook is pinned to.
df.describe(datetime_is_numeric=True)

In [None]:
# 3.4 Describe categorical data: number of tweets per source category
df['source'].value_counts()

In [None]:
# 3.5 Get histograms for numerical data

In [None]:
# 3.6 Plot the tweet frequency by source

In [None]:
# 3.7 Get a glimpse of the most retweeted tweets

In [None]:
# 3.8 Get a glimpse of the most liked tweets

In [None]:
# 3.9 Let's get a glimpse of common words in the tweets' text

In [None]:
# 3.10 Process text to extract stopwords

In [None]:
# 3.11 Extract urls
# Pattern matching t.co short links and the HTML-escaped ampersand ('&amp;').
# Written as a raw string so '\d' reaches the regex engine without an
# invalid-escape warning; the dot in 't.co' is escaped so it only matches a
# literal '.'.
url_pat = r'https://t\.co/[A-Za-z\d]+|&amp;'


In [None]:
# 3.12 Extract special characters


In [None]:
# 3.13 Extract numbers


In [None]:
# 3.14 See the results


In [None]:
# 3.15 Let's see a wordcloud
