# Trump Rally Text Analysis
#### **Description:** This is an NLP (Natural Language Processing) analysis of Donald Trump's presidential rally speeches. It is intended for educational purposes only, with no political intent whatsoever.

### Author: Mohammad Adi Amirudin (amirudin.adi@gmail.com)
### Initial Date: Aug 17th, 2021
### Dataset: Trump Rally Speech from Kaggle (https://www.kaggle.com/christianlillelund/donald-trumps-rallies)
### Method: Time Series, NLP (Sentiment Analysis, Segmentation, Topic Modeling), EDA
### Tools: Microsoft Visual Studio Code IDE (Python 3.9.6 on Jupyter Notebook)
### Interpreter & Package: miniconda | mininlp (pandas numpy matplotlib seaborn nltk scikit-learn notebook textblob gensim wordcloud levenshtein)


# Import Libraries & Dataset

In [2]:
import nltk as nltk
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import spacy as sp

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF  
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error,silhouette_score
from gensim.parsing.preprocessing import remove_stopwords
from wordcloud import WordCloud
from wordcloud import WordCloud,STOPWORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import vader
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import word_tokenize
from scipy import stats
from heapq import nlargest

sns.set_style('darkgrid')

## Importing Trump Rally Speech Text Corpora

In [3]:
# Read every file under the corpus folder into `corpus`, one string per file.
corpus = []
for dirname, dirnames, filenames in os.walk("Trump_Rally_Speech_Text_Corpora/"):
    # os.walk yields names in arbitrary, filesystem-dependent order. Later cells
    # attach hand-written metadata by position, so sort in place to make the
    # order of `corpus` (and of `filenames`) reproducible across machines.
    dirnames.sort()
    filenames.sort()
    for filename in filenames:
        with open(os.path.join(dirname, filename), encoding="UTF-8") as file_input:
            corpus.append(file_input.read())

In [5]:
# Split each file name of the form <Place><Mon><day>_<year>.txt into place, month and "<day>_<year>".
# NOTE(review): `filenames` is the variable left over from the os.walk loop, i.e. the
# listing of the last directory visited; this presumably relies on the corpus folder
# being flat so that it lines up with `corpus` — TODO confirm.
# The extension is removed outright (not replaced by a space) so the year carries no trailing blank.
place_name = [i.replace(".txt", "") for i in filenames]
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# A month only counts when it is immediately followed by the day digits, so a place
# name that merely contains "Mar", "May", ... is not split in the wrong spot.
month_pattern = re.compile("(" + "|".join(months) + r")(?=\d)")
place, month_list, date = [], [], []
for name in place_name:
    month_match = month_pattern.search(name)
    if month_match is None:
        # Fail here, naming the file, instead of leaving the three lists with different lengths.
        raise ValueError(f"Cannot find '<Mon><day>' in file name {name!r}")
    month_list.append(month_match.group(1))
    place.append(name[:month_match.start()])
    date.append(name[month_match.end():])

In [6]:
# Assemble one row per speech; "Year" temporarily holds the "<day>_<year>" string.
speech_columns = {
    "Month": month_list,
    "Year": date,
    "Place": place,
    "Speech": corpus,
}
trump_df = pd.DataFrame(speech_columns)

# Break "<day>_<year>" apart: the first piece becomes "Day", the second overwrites "Year".
day_and_year = trump_df["Year"].str.split("_", expand=True)
trump_df[["Day", "Year"]] = day_and_year

In [7]:
# Insert a space before every capital letter ("BattleCreek" -> "Battle Creek") and trim
# the leading space this creates. regex=True is passed explicitly: without it newer
# pandas treats the pattern as a literal string and no spaces would be inserted.
trump_df["Place"] = trump_df["Place"].str.replace(r"([A-Z])", r" \1", regex=True).str.strip()
# The step above also splits the hyphenated name; restore it (whole-value replacement).
trump_df["Place"] = trump_df["Place"].replace("Winston- Salem", "Winston-Salem")
trump_df.head(5)

  trump_df["Place"] = trump_df["Place"].str.replace( r"([A-Z])", r" \1").str.strip()


Unnamed: 0,Month,Year,Place,Speech,Day
0,Dec,2019,Battle Creek,Thank you. Thank you. Thank you to Vice Presid...,19
1,Sep,2020,Bemidji,There's a lot of people. That's great. Thank y...,18
2,Feb,2020,Charleston,Thank you. Thank you. Thank you. All I can say...,28
3,Mar,2020,Charlotte,"I want to thank you very much. North Carolina,...",2
4,Aug,2019,Cincinnati,Thank you all. Thank you very much. Thank you ...,1


In [8]:
# Hand-written state for each rally, attached by position: entry i goes to row i of trump_df.
# NOTE(review): the head() shown for this cell pairs Battle Creek with Mississippi and
# Charleston with Ohio, so the list does not appear to follow the row order of trump_df.
# TODO confirm against the actual file order; an explicit place -> state mapping would
# remove the dependence on row order.
trump_df["State"] = ['Mississippi','Minnesota','Ohio','Michigan','New Hampshire','Nevada','New Jersey','Texas','Iowa','North Carolina','Michigan','Kentucky','Wisconsin','Arizona',
                       'Oklahoma','Minnesota','New Hampshire','Pennsylvania','Colorado','Pennsylvania','Ohio','South Carolina','North Carolina','Nevada','North Carolina','New Hampshire',
                       'North Carolina','Ohio','Texas','Wisconsin','Nevada','South Carolina','New Mexico','Arizona','Pennsylvania']
trump_df.head(5)

Unnamed: 0,Month,Year,Place,Speech,Day,State
0,Dec,2019,Battle Creek,Thank you. Thank you. Thank you to Vice Presid...,19,Mississippi
1,Sep,2020,Bemidji,There's a lot of people. That's great. Thank y...,18,Minnesota
2,Feb,2020,Charleston,Thank you. Thank you. Thank you. All I can say...,28,Ohio
3,Mar,2020,Charlotte,"I want to thank you very much. North Carolina,...",2,Michigan
4,Aug,2019,Cincinnati,Thank you all. Thank you very much. Thank you ...,1,New Hampshire


## Importing States & Cities Selected Metrics

In [9]:
# State-level metrics. Only a completely empty field is read as missing; pandas'
# default NA markers (e.g. the text "NA") are kept as ordinary values.
state_metrics_path = "other data.csv"
metrics1_df = pd.read_csv(state_metrics_path, na_values=[""], keep_default_na=False)
metrics1_df.head()

Unnamed: 0,Row Labels,State millenials population percentage (25 - 34),State late adult population percentage (55-64),State adult Tertiary Education Rate (25+ years old),State unemployment Rate (16+ years old),"State percentage of management, business, science, and arts workers (16+ years old)","State percentage of natural resources, construction, and maintenance workers (16+ years old)","State percentage of production, transportation, and material moving workers",State percentage of manufacturing workers (16+ years old),"State percentage of agriculture, forestry, fishing and hunting, and mining workers (16+ years old)","State percentage of professional, scientific, and management, and administrative and waste management services workers (16+ years old)","State percentage of finance and insurance, and real estate and rental and leasing workers (16+ years old)",State inflation-adjusted average household income (2019),State household percentage with cash public assistance as income(2019),States poverty rate (percent of population)
0,Alabama,13.0,13.3,29.8,4.9,35.9,9.4,17.9,14.5,1.2,9.6,5.6,51734.0,1.4,15.5
1,Alaska,16.1,13.0,34.6,5.8,38.3,11.4,13.4,4.2,4.9,8.9,4.3,75463.0,6.0,10.1
2,Arizona,13.8,12.1,33.7,5.1,37.9,9.5,11.2,7.0,1.4,12.5,8.7,62055.0,1.5,13.5
3,Arkansas,12.8,12.8,29.3,4.8,33.9,10.1,18.3,13.8,2.8,7.5,4.6,48952.0,2.2,16.2
4,California,15.3,12.1,28.4,5.1,40.7,9.0,11.9,8.7,2.1,14.2,5.8,80440.0,2.8,11.8


In [10]:
# Per-state race percentages. Only a completely empty field is read as missing;
# pandas' default NA markers are kept as ordinary values.
race_data_path = "Race Data.csv"
metrics2_df = pd.read_csv(race_data_path, na_values=[""], keep_default_na=False)
metrics2_df.head()

Unnamed: 0,State,WhiteTotalPerc,BlackTotalPerc,IndianTotalPerc,AsianTotalPerc,HawaiianTotalPerc,OtherTotalPerc
0,Alabama,0.6809,0.2664,0.0052,0.0136,0.0005,0.0334
1,Alaska,0.6458,0.0328,0.1489,0.0623,0.0125,0.0976
2,Arizona,0.7722,0.045,0.045,0.0331,0.0021,0.1026
3,Arkansas,0.7672,0.1532,0.0068,0.0152,0.0029,0.0547
4,California,0.597,0.0579,0.0077,0.1449,0.004,0.1885


## Combining The Dataset

In [11]:
# Attach the state metrics to each rally. This is an inner join (the pandas default),
# so a rally whose "State" has no matching "Row Labels" entry is dropped.
merged_inner1 = trump_df.merge(metrics1_df, left_on="State", right_on="Row Labels")
merged_inner1.head()


Unnamed: 0,Month,Year,Place,Speech,Day,State,Row Labels,State millenials population percentage (25 - 34),State late adult population percentage (55-64),State adult Tertiary Education Rate (25+ years old),...,"State percentage of management, business, science, and arts workers (16+ years old)","State percentage of natural resources, construction, and maintenance workers (16+ years old)","State percentage of production, transportation, and material moving workers",State percentage of manufacturing workers (16+ years old),"State percentage of agriculture, forestry, fishing and hunting, and mining workers (16+ years old)","State percentage of professional, scientific, and management, and administrative and waste management services workers (16+ years old)","State percentage of finance and insurance, and real estate and rental and leasing workers (16+ years old)",State inflation-adjusted average household income (2019),State household percentage with cash public assistance as income(2019),States poverty rate (percent of population)
0,Dec,2019,Battle Creek,Thank you. Thank you. Thank you to Vice Presid...,19,Mississippi,Mississippi,12.4,12.9,32.8,...,32.8,10.4,18.3,13.5,2.2,6.6,4.7,45792.0,1.8,19.6
1,Sep,2020,Bemidji,There's a lot of people. That's great. Thank y...,18,Minnesota,Minnesota,13.6,13.4,31.9,...,42.8,7.9,13.9,13.4,2.1,10.1,7.1,74593.0,3.0,9.0
2,Sep,2020,Latrobe,"So thank you Pennsylvania, very much. I'm thri...",3,Minnesota,Minnesota,13.6,13.4,31.9,...,42.8,7.9,13.9,13.4,2.1,10.1,7.1,74593.0,3.0,9.0
3,Feb,2020,Charleston,Thank you. Thank you. Thank you. All I can say...,28,Ohio,Ohio,13.2,13.7,28.9,...,38.1,7.6,16.9,15.3,0.9,9.8,6.2,58642.0,2.4,13.1
4,Sep,2020,Mosinee,"Thank you, thank you very much. Thank you very...",17,Ohio,Ohio,13.2,13.7,28.9,...,38.1,7.6,16.9,15.3,0.9,9.8,6.2,58642.0,2.4,13.1


In [12]:
# Add the race percentages (inner join on "State"), then discard the duplicate
# state-name column that came from the first metrics table.
merged_inner2 = merged_inner1.merge(metrics2_df, left_on="State", right_on="State")
rally_df = merged_inner2.drop(columns="Row Labels")
rally_df.head()

Unnamed: 0,Month,Year,Place,Speech,Day,State,State millenials population percentage (25 - 34),State late adult population percentage (55-64),State adult Tertiary Education Rate (25+ years old),State unemployment Rate (16+ years old),...,"State percentage of finance and insurance, and real estate and rental and leasing workers (16+ years old)",State inflation-adjusted average household income (2019),State household percentage with cash public assistance as income(2019),States poverty rate (percent of population),WhiteTotalPerc,BlackTotalPerc,IndianTotalPerc,AsianTotalPerc,HawaiianTotalPerc,OtherTotalPerc
0,Dec,2019,Battle Creek,Thank you. Thank you. Thank you to Vice Presid...,19,Mississippi,12.4,12.9,32.8,6.6,...,4.7,45792.0,1.8,19.6,0.5841,0.3772,0.0048,0.0099,0.0002,0.0238
1,Sep,2020,Bemidji,There's a lot of people. That's great. Thank y...,18,Minnesota,13.6,13.4,31.9,3.2,...,7.1,74593.0,3.0,9.0,0.8285,0.0641,0.0104,0.0482,0.0004,0.0484
2,Sep,2020,Latrobe,"So thank you Pennsylvania, very much. I'm thri...",3,Minnesota,13.6,13.4,31.9,3.2,...,7.1,74593.0,3.0,9.0,0.8285,0.0641,0.0104,0.0482,0.0004,0.0484
3,Feb,2020,Charleston,Thank you. Thank you. Thank you. All I can say...,28,Ohio,13.2,13.7,28.9,4.6,...,6.2,58642.0,2.4,13.1,0.813,0.1241,0.002,0.0222,0.0003,0.0385
4,Sep,2020,Mosinee,"Thank you, thank you very much. Thank you very...",17,Ohio,13.2,13.7,28.9,4.6,...,6.2,58642.0,2.4,13.1,0.813,0.1241,0.002,0.0222,0.0003,0.0385


# Extracting Text Features & Dataset Preprocessing

## Preprocessing:

> Stopwords removal

> Tokenization

> Regular Expression

## Tokenization

In [13]:
# Remove bracketed spans such as "[...]" (non-greedy, within a line) and lowercase
# the remaining text in a single pass over the column.
rally_df["Speech"] = rally_df["Speech"].apply(
    lambda speech: re.sub(r'\[.*?\]', '', speech).lower()
)

# Tokenizer that matches runs of three or more lowercase ASCII letters.
tokenizer = RegexpTokenizer("[a-z][a-z]+[a-z]")

In [14]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The decision is made on the first character of ``nltk_tag`` only:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``,
    ``R`` -> ``wordnet.ADV``. Any other tag (including an empty one) yields ``None``.
    """
    # First letter of the tag -> name of the attribute on `wordnet`.
    first_letter_to_constant = {"J": "ADJ", "V": "VERB", "N": "NOUN", "R": "ADV"}
    constant_name = first_letter_to_constant.get(nltk_tag[:1])
    if constant_name is None:
        return None
    return getattr(wordnet, constant_name)

In [15]:
def lemmatize_sentence(sentence):
    """Return the lemmatized words of ``sentence`` joined by single spaces.

    The text is tokenized with the pattern ``[a-z][a-z]+[a-z]``, so only runs of
    three or more lowercase ASCII letters survive; punctuation, digits, uppercase
    letters and shorter words are dropped from the result. Each token is
    POS-tagged, and tokens whose tag maps to a WordNet part of speech (via
    ``nltk_tag_to_wordnet_tag``) are lemmatized with that part of speech; all
    other tokens are kept unchanged.

    Parameters
    ----------
    sentence : str
        Text to process.

    Returns
    -------
    str
        Space-separated tokens/lemmas, in their original order.
    """
    tokenizer = RegexpTokenizer('[a-z][a-z]+[a-z]')
    # Build the lemmatizer once per call instead of once per word.
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, nltk_tag in nltk.pos_tag(tokenizer.tokenize(sentence)):
        wordnet_tag = nltk_tag_to_wordnet_tag(nltk_tag)
        if wordnet_tag is None:
            lemmas.append(word)
        else:
            lemmas.append(lemmatizer.lemmatize(word, wordnet_tag))
    return " ".join(lemmas)

In [16]:
def lexical_diversity(text):
    """Return ``len(text) / len(set(text)) / 10000`` for a sequence ``text``.

    This is the number of items per distinct item, scaled down by 10000. The unit
    of an "item" is whatever iterating ``text`` yields: characters when a string
    is passed, tokens when a list of tokens is passed.
    NOTE(review): this is total/unique, the inverse of the usual type-token
    ratio (unique/total) — confirm that this is the intended measure.

    Parameters
    ----------
    text : sequence of hashable items (e.g. ``str`` or list of ``str``)

    Returns
    -------
    float
        The scaled ratio, or ``0.0`` for an empty ``text`` (previously a
        ``ZeroDivisionError``).
    """
    if len(text) == 0:
        return 0.0
    return len(text) / len(set(text)) / 10000

In [27]:
# Overwrite every speech with its lemmatized form (the function is passed directly, no wrapper).
rally_df["Speech"] = rally_df["Speech"].apply(lemmatize_sentence)

## Stopword Removal

In [34]:
# Keep a stop-word-free copy of each speech in its own column (gensim's remove_stopwords).
rally_df["Speech_No_Stopwords"] = rally_df["Speech"].apply(remove_stopwords)

## Text Feature Extractions

In [35]:
# Per-speech text features. Words are obtained by splitting on single spaces.
# NOTE(review): when this cell runs after the lemmatization cell, "Speech" holds only
# space-joined [a-z] tokens (see lemmatize_sentence), so the period count below is 0
# for every row and the punctuation substitution changes nothing. Counting sentences
# would have to happen on the text before it is lemmatized — TODO confirm intent.
rally_df["Numbers Of Words"] = rally_df["Speech"].apply(lambda speech: len(speech.split(" ")))

# Membership is tested against STOPWORDS directly rather than rebuilding list(STOPWORDS) for every word.
rally_df["Numbers Of StopWords"] = rally_df["Speech"].apply(
    lambda speech: sum(word in STOPWORDS for word in speech.split(" "))
)

# Raw string: "\." in a normal string literal is an invalid escape sequence.
rally_df["Numbers Of Sentences"] = rally_df["Speech"].apply(
    lambda speech: len(re.findall(r"\.", speech))
)

# Mean length of the words that are not stop words.
rally_df["Average Word Length"] = rally_df["Speech"].apply(
    lambda speech: np.mean(
        np.array([len(word) for word in speech.split(" ") if word not in STOPWORDS])
    )
)

# Replace runs of the listed punctuation characters with a single space.
rally_df["Speech"] = rally_df["Speech"].apply(lambda speech: re.sub(r"[,.;@#?!&$]+", " ", speech))

# lexical_diversity receives the speech as a string, so it works on characters, not words.
rally_df["Lexical Diversity"] = rally_df["Speech"].apply(lexical_diversity)

## Sentiment Analysis

In [36]:
# Score each stop-word-free speech with VADER and keep the three polarity
# proportions as separate columns. The score dictionaries live only in a local
# Series, so no temporary column has to be added and dropped again.
sid = SentimentIntensityAnalyzer()
polarity_scores = rally_df["Speech_No_Stopwords"].apply(sid.polarity_scores)

sentiment_columns = (
    ("Positive Sentiment", "pos"),
    ("Neutral Sentiment", "neu"),
    ("Negative Sentiment", "neg"),
)
for column_name, score_key in sentiment_columns:
    rally_df[column_name] = polarity_scores.apply(lambda scores, key=score_key: scores[key])

## Misc Data Extractions

In [40]:
# Requires the small English pipeline to be installed in the kernel's environment:
#     python -m spacy download en_core_web_sm
nlpspacy = sp.load("en_core_web_sm")


def count_entities(doc, label):
    """Return how many entity spans in the parsed ``doc`` carry the label ``label``."""
    return sum(1 for ent in doc.ents if ent.label_ == label)


# Parse every speech once with the pipeline loaded above and reuse the result for all
# three counts (the previous version called an undefined `cool_extractor` and parsed
# each speech three times).
speech_docs = list(nlpspacy.pipe(rally_df["Speech"]))

# Column name -> spaCy entity label. Each column counts entity mentions, not distinct
# entities; "GPE" is spaCy's geopolitical-entity label.
# NOTE(review): "Speech" has been lowercased and stripped of punctuation by earlier
# cells, which presumably hurts entity recognition — consider running this on the raw text.
entity_columns = {
    "Numbers of Different Countries Mentioned": "GPE",
    "Numbers of Times Money Was Mentioned": "MONEY",
    "Numbers of Different People Mentioned": "PERSON",
}
for column_name, entity_label in entity_columns.items():
    rally_df[column_name] = [count_entities(doc, entity_label) for doc in speech_docs]

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.