In [2]:
from time import time
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx


## Parte 1: Carga y limpieza de datos

In [3]:
# Read the speeches CSV; the path is relative to the notebook's working directory.
df_speeches = pd.read_csv(Path('../data/us_2020_election_speeches.csv'))
df_speeches

Unnamed: 0,speaker,title,text,date,location,type
0,David Perdue,Georgia Sen. David Perdue Speech Transcript at...,David Perdue: (00:01)\nHow great is it to be b...,"Oct 16, 2020","Macon, Georgia",Campaign Speech
1,Joe Biden,"Joe Biden Southfield, MI Speech on Health Care...","Joe Biden: (00:00)\nHello, Michigan. Hi, how a...","Oct 16, 2020","Southfield ,Michigan",Campaign Speech
2,Donald Trump,Donald Trump Speech Transcript ‘Protecting Ame...,President Trump: (00:30)\nThank you. What a ni...,"Oct 16, 2020","Fort Myers, Florida",Campaign Speech
3,Joe Biden,Joe Biden ABC Town Hall Transcript October 15,"George Stephanopoulos: (00:41)\nHey, and welco...","Oct 15, 2020",ABC,Town Hall
4,Donald Trump,Donald Trump NBC Town Hall Transcript October 15,Savannah Guthrie: (03:50)\nIt’s nothing but no...,"Oct 15, 2020",NBC,Town Hall
...,...,...,...,...,...,...
264,Bernie Sanders,Bernie Sanders Speech Transcript: Sanders Spea...,Bernie Sanders: (00:00)\nJust want to take thi...,"Feb 6, 2020",Iowa,Campaign Speech
265,Democratic Candidates,Transcript: Speeches at the Iowa Caucuses – Be...,Bernie Sanders: (00:08)\nThank you. Thank you....,"Feb 4, 2020",Iowa,Campaign Speech
266,Donald Trump,Donal Trump Iowa Rally Transcript: Trump Holds...,Donald Trump: (00:24)\nI worked so hard for th...,"Jan 30, 2020","Des Moines, Iowa",Campaign Speech
267,Donald Trump,Donald Trump New Jersey Rally Speech Transcrip...,Donald Trump: (01:22)\nThank you. Thank you. I...,"Jan 28, 2020","Wildwood, New Jersey",Campaign Speech


# A) Datos faltantes

Usamos la función `len()` y el método `count()` para obtener la cantidad de datos faltantes por variable en el dataframe `df_speeches`: el total de filas menos el número de valores no nulos de cada columna.

In [8]:
# Missing values per column = total number of rows minus the non-null count of each column.
n = len(df_speeches)
count = df_speeches.count()
count.rsub(n)  # reversed subtraction: same as n - count

speaker      3
title        0
text         0
date         0
location    18
type        21
dtype: int64

In [None]:
# Show the dtype pandas inferred for each column; this only displays dtypes and does not select or filter any columns.
df_speeches.dtypes



speaker     object
title       object
text        object
date        object
location    object
type        object
dtype: object

In [6]:
def get_var_category(series):
    """Classify a pandas Series into a coarse variable category.

    Parameters
    ----------
    series : pd.Series
        Column to classify.

    Returns
    -------
    str
        'Numerical' for numeric dtypes,
        'Date' for any datetime64 dtype (tz-naive or tz-aware),
        'Text (Unique)' when every entry is distinct (a missing value counts
        as one distinct value),
        'Categorical' otherwise.
    """
    if pd.api.types.is_numeric_dtype(series):
        return 'Numerical'
    # is_datetime64_any_dtype also matches tz-aware datetime columns, which
    # is_datetime64_dtype does not recognise.
    if pd.api.types.is_datetime64_any_dtype(series):
        return 'Date'
    # Distinct values are only counted when the dtype checks did not decide.
    # dropna=False makes NaN count as a value, so the comparison against the
    # full length is consistent for columns with missing entries.
    if series.nunique(dropna=False) == len(series):
        return 'Text (Unique)'
    return 'Categorical'

def print_categories(df):
    """Print one line per column of `df`: the column name followed by its category label.

    The label comes from get_var_category applied to that column.
    Returns None; the result is only written to stdout.
    """
    for name in df.columns:
        label = get_var_category(df[name])
        print(name, ": ", label)

# Print the inferred category of every column in the speeches dataframe.
print_categories(df_speeches)

speaker :  Categorical
title :  Text (Unique)
text :  Text (Unique)
date :  Categorical
location :  Categorical
type :  Categorical


In [16]:
# Summary statistics per column; for these non-numeric columns describe() reports count, unique, top and freq.
df_speeches.describe()

Unnamed: 0,speaker,title,text,date,location,type
count,266,269,269,269,251,248
unique,71,269,269,123,102,9
top,Joe Biden,January Iowa Democratic Debate Transcript,"Wolf Blitzer: (00:00)\nAll right, so let’s beg...","Aug 27, 2020",Virtual,Campaign Speech
freq,71,1,1,9,62,180


In [25]:
# How many speech types are there, and how many observations does each one have?
# Note: value_counts() excludes missing values by default, so rows without a type are not counted here.
df_speeches["type"].value_counts()

type
Campaign Speech       180
Town Hall              18
Interview              14
Debate                  9
Endorsement             8
Statement               8
Roundtable              8
Press Conference        2
Voter Mobilization      1
Name: count, dtype: int64

In [33]:
# Restrict the analysis to rows whose type is exactly "Campaign Speech"
# (rows with a missing type compare as False and are therefore excluded).
speeches = df_speeches.loc[df_speeches["type"].eq("Campaign Speech")]


In [34]:
# Count campaign speeches per speaker; value_counts() lists speakers from most to fewest speeches.
speeches["speaker"].value_counts()


speaker
Joe Biden                   43
Donald Trump                41
Mike Pence                  17
Bernie Sanders               9
Kamala Harris                7
Multiple Speakers            5
Democratic Candidates        4
Jill Biden                   2
Pete Buttigieg               2
David Perdue                 1
Tom Cotton                   1
Rudy Giuliani                1
Ben Carson                   1
Mitch McConnell              1
Chen Guangcheng              1
Lara Trump                   1
Lou Holtz                    1
Karen Pence                  1
Jack Brewer                  1
Kellyanne Conway             1
Kayleigh McEnany             1
Dan Crenshaw                 1
Pam Bondi                    1
Melania Trump                1
Joe Biden,Kamala Harris      1
Ivanka Trump                 1
Eric Trump                   1
Mike Pompeo                  1
Rand Paul                    1
Nicholas Sandmann            1
Kimberly Guilfoyle           1
Herschel Walker              1


In [38]:
# Keep only the speeches given by the five most frequent speakers.
# value_counts() sorts by descending frequency, so the first five index labels are the top five speakers.
top5_speakers = speeches["speaker"].value_counts().index[:5]

top_speeches = speeches.loc[speeches["speaker"].isin(top5_speakers)]
