## Cleaning Data

In [22]:
import pandas as pd 

### President and Congress 1975 through 2010

In [23]:
# loading url of party control 1933-2010 from southeast missoury state univerity
url_semo = 'http://cstl-cla.semo.edu/rdrenka/ui320-75/presandcongress.asp'
path_2019 = '../Input/party_data.xlsx'

In [24]:
party_2019 = pd.read_excel(path_2019)

In [25]:
party_2019

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,term_start,term_end
23,1975-76,Ford,R,291,144,0,61,38,1,divided,1975,1976
24,1977-78,Carter,D,292,143,0,62,38,0,unified,1977,1978
25,1979-80,Carter,D,277,158,0,59,41,0,unified,1979,1980
26,1981-82,Reagan,R,243,192,0,47,53,0,divided (HR),1981,1982
27,1983-84,Reagan,R,269,166,0,46,54,0,divided (HR),1983,1984
28,1985-86,Reagan,R,253,182,0,47,53,0,divided (HR),1985,1986
29,1987-88,Reagan,R,258,177,0,55,45,0,divided,1987,1988
30,1989-90,"Bush, GHW",R,260,175,0,55,45,0,divided,1989,1990
31,1991-92,Bush,R,267,167,1,56,44,0,divided,1991,1992
32,1993-94,Clinton,D,258,176,1,57,43,0,unified,1993,1994


In [26]:
# Creating a dataframe 
list_semo = pd.read_html(url_semo)

# DataFrame
df = list_semo[0]

In [27]:
list_del = [0,1,41,42]

In [28]:
# Create a list of range 2 to 22 inclusive 
list_range = list(range(2,23))

In [29]:
# extend first list to the range list
list_del.extend(list_range)

In [30]:
# drop the indices in our dataframe using list_del list 
party_shutdown_df = df.drop(df.index[list_del])

In [31]:
party_with_headers = party_shutdown_df.rename(columns={
    0:"years",1:"president",2:"party_president",4:"house_dem_seats",5:"house_rep_seats",6:"house_other_seats",
    7:"senate_dem_seats",8:"senate_rep_seats",9:"senate_other_seats",10:"gov_status"
})

# delete column with header '3'
del party_with_headers[3]

In [32]:
# resetting dataframe index 
party_with_headers.reset_index(drop=True).head(3)

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status
0,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided
1,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified
2,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified


In [33]:
# checking if there is any NaN value left in the dataframe 
party_with_headers.isnull().values.any()

False

In [34]:
# looking up types of columns in the dataframe
party_with_headers.dtypes

years                  object
president              object
party_president        object
house_dem_seats        object
house_rep_seats        object
house_other_seats      object
senate_dem_seats      float64
senate_rep_seats       object
senate_other_seats    float64
gov_status             object
dtype: object

In [35]:
# create a copy of dataframe to change types
party_df_types = party_with_headers

In [36]:
# Converting three series to numerics
party_df_types['house_dem_seats'] = pd.to_numeric(party_df_types['house_dem_seats'])
party_df_types['house_rep_seats'] = pd.to_numeric(party_df_types['house_rep_seats'])
party_df_types['house_other_seats'] = pd.to_numeric(party_df_types['house_other_seats'])

In [37]:
party_df_types.dtypes

years                  object
president              object
party_president        object
house_dem_seats         int64
house_rep_seats         int64
house_other_seats       int64
senate_dem_seats      float64
senate_rep_seats       object
senate_other_seats    float64
gov_status             object
dtype: object

In [38]:
# Appending of the two dataframes 
party_complete = party_df_types.append(party_2019)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [39]:
party_complete.reset_index(drop=True)

Unnamed: 0,gov_status,house_dem_seats,house_other_seats,house_rep_seats,party_president,president,senate_dem_seats,senate_other_seats,senate_rep_seats,term_end,term_start,years
0,divided,291,0,144,R,Ford,61.0,1.0,38,,,1975-76
1,unified,292,0,143,D,Carter,62.0,0.0,38,,,1977-78
2,unified,277,0,158,D,Carter,59.0,0.0,41,,,1979-80
3,divided (HR),243,0,192,R,Reagan,47.0,0.0,53,,,1981-82
4,divided (HR),269,0,166,R,Reagan,46.0,0.0,54,,,1983-84
5,divided (HR),253,0,182,R,Reagan,47.0,0.0,53,,,1985-86
6,divided,258,0,177,R,Reagan,55.0,0.0,45,,,1987-88
7,divided,260,0,175,R,"Bush, GHW",55.0,0.0,45,,,1989-90
8,divided,267,1,167,R,Bush,56.0,0.0,44,,,1991-92
9,unified,258,1,176,D,Clinton,57.0,0.0,43,,,1993-94


##### 'senate_rep_seats' cannot be changed to float or int yet because of the value '49*'

In [40]:
# using .loc to locate the "49*" value and changing it to '49'
party_df_types['senate_rep_seats'].loc[party_df_types['senate_rep_seats'] == "49*"] = '49'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [41]:
party_df_types['senate_rep_seats'] = pd.to_numeric(party_df_types['senate_rep_seats'])

In [42]:
party_df_types.dtypes

years                  object
president              object
party_president        object
house_dem_seats         int64
house_rep_seats         int64
house_other_seats       int64
senate_dem_seats      float64
senate_rep_seats        int64
senate_other_seats    float64
gov_status             object
dtype: object

In [43]:
# resetting index again just because anaconda is confused
party_df_types.reset_index(drop=True).head(3)

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status
0,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided
1,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified
2,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified


In [44]:
party_df_types["term_start"] = party_df_types["years"].str.slice(0,4,1)
party_df_types

party_df_types["term_end"] = party_df_types["years"].str.slice(0,2,1) + party_df_types["years"].str.slice(5,7,1)
party_df_types.head(3)

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,term_start,term_end
23,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided,1975,1976
24,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified,1977,1978
25,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified,1979,1980


In [45]:
# Saving to Excel file
writer = pd.ExcelWriter('output.xlsx')
party_complete.to_excel(writer, index=False,header=True)
writer.save()

### Cleaning President Trump Approval Ratings

In [46]:
# url for President Trump approval ratings 
url_trump_gallup = "https://news.gallup.com/poll/203198/presidential-approval-ratings-donald-trump.aspx"

In [47]:
# Read the url
list_trump_gallup = pd.read_html(url_trump_gallup)

In [48]:
# save the section in dataframe to clean 
trump_not_clean = list_trump_gallup[2]
list_trump_del = [0,102]
# getting rid of second row and last
trump_clean = trump_not_clean.drop(trump_not_clean.index[list_trump_del])

In [49]:
# drop one level of the multi index column headers
trump_clean.columns = trump_clean.columns.droplevel()

In [50]:
# Column names
trump_clean.columns

Index(['Unnamed: 0_level_1', '%', '%', '%'], dtype='object')

In [51]:
trump_clean.head(2)

Unnamed: 0,Unnamed: 0_level_1,%,%.1,%.2
1,2019 Jan 2-10,37.0,59.0,4.0
2,2018,,,


In [52]:
# Renaming column headers
trump_cleaner = trump_clean.rename(columns={
    'Unnamed: 0_level_1': 'weekly', '%':'approve','%':'disapprove','%':'no_opinion'
})

trump_cleaner.reset_index(drop=True).head(3)

Unnamed: 0,weekly,no_opinion,no_opinion.1,no_opinion.2
0,2019 Jan 2-10,37.0,59.0,4.0
1,2018,,,
2,2018 Dec 17-22,39.0,55.0,5.0


In [53]:
# setting column names by index
trump_cleaner.columns.values[1] = 'approve_trump'
trump_cleaner.columns.values[2] = 'disapprove_trump'
trump_cleaner.columns.values[3] = 'no_opinion_trump'

In [54]:
# numerizing approval ratings' columns
trump_cleaner['approve_trump'] = pd.to_numeric(trump_cleaner['approve_trump'])
trump_cleaner['disapprove_trump'] = pd.to_numeric(trump_cleaner['disapprove_trump'])
trump_cleaner['no_opinion_trump'] = pd.to_numeric(trump_cleaner['no_opinion_trump'])

ValueError: Unable to parse string "Gallup" at position 102

In [55]:
trump_clean = pd.DataFrame()
trump_cleanest = trump_cleaner

trump_cleanest.tail(3)

Unnamed: 0,weekly,approve_trump,disapprove_trump,no_opinion_trump
101,2017 Feb 6-12,41,53,6
103,2017 Jan 20-29,45,47,8
104,Gallup,Gallup,Gallup,Gallup


In [56]:
trump_cleanest.head(3)

Unnamed: 0,weekly,approve_trump,disapprove_trump,no_opinion_trump
1,2019 Jan 2-10,37.0,59.0,4.0
2,2018,,,
3,2018 Dec 17-22,39.0,55.0,5.0


##### Getting rid of NaNs

In [57]:
trump_cleanest.isnull().any()

weekly              False
approve_trump        True
disapprove_trump     True
no_opinion_trump     True
dtype: bool

In [58]:
null_columns_trump = trump_cleanest.columns[trump_cleanest.isnull().any()]
trump_cleanest[null_columns_trump].isnull().sum()

approve_trump       2
disapprove_trump    2
no_opinion_trump    2
dtype: int64

In [59]:
trump_cleanest[trump_cleanest['approve_trump'].isnull()][null_columns_trump]

Unnamed: 0,approve_trump,disapprove_trump,no_opinion_trump
2,,,
54,,,


In [60]:
trump_new = trump_cleanest.dropna()

In [61]:
trump_new.reset_index(drop=True).head(3)

Unnamed: 0,weekly,approve_trump,disapprove_trump,no_opinion_trump
0,2019 Jan 2-10,37,59,4
1,2018 Dec 17-22,39,55,5
2,2018 Dec 10-16,38,57,4


In [62]:
trump_new.isnull().any().sum()

0

In [63]:
trump_new[50:53]

Unnamed: 0,weekly,approve_trump,disapprove_trump,no_opinion_trump
52,2018 Jan 8-14,38,57,5
53,2018 Jan 1-7,37,58,4
55,2017 Dec 25-31,39,55,6


In [64]:
tc = trump_new.copy()
tc.head()

# Separate the dates in the weekly column into a Start Date & End Date column

# Separate into usable pieces
yr = tc['weekly'].str.slice(0,4,1)
mo = tc['weekly'].str.slice(5,8,1)
lo = tc['weekly'].str.slice(9,17,1)
rs = lo.str.split('-')
rs1 = rs.str.get(0)
rs2 = rs.str.get(1)
rs3 = rs2.str.split(' ')
rs4 = rs3.str.get(0)
rs5 = rs3.str.get(1)

# Build the Start Date (sd)
tc['sd'] = mo + " " + rs1 + " " + yr

# Build the End Date (ed)
nulls = rs5.isnull()
nulls
for x in nulls:
    if x == True:
        tc['ed'] = mo + " " + rs2 + " " + yr
    else:
        tc['ed'] = rs4 + " " + rs5 + " " + yr
tc[51:105]
        
# # Create a new copy
tc2 = tc.copy()
tc2.head()

# # Fix rows that have a duplicated month in the end date column
tc2.ed[4] = tc2.ed[4][4:]
tc2.ed[8] = tc2.ed[8][4:]
tc2.ed[17] = tc2.ed[17][4:]
tc2.ed[21] = tc2.ed[21][4:]
tc2.ed[26] = tc2.ed[26][4:]
tc2.ed[30] = tc2.ed[30][4:]
tc2.ed[34] = tc2.ed[34][4:]
tc2.ed[39] = tc2.ed[39][4:]
tc2.ed[43] = tc2.ed[43][4:]
tc2.ed[47] = tc2.ed[47][4:]
tc2.ed[57] = tc2.ed[57][4:]
tc2.ed[61] = tc2.ed[61][4:]
tc2.ed[66] = tc2.ed[66][4:]
tc2.ed[70] = tc2.ed[70][4:]
tc2.ed[74] = tc2.ed[74][4:]
tc2.ed[79] = tc2.ed[79][4:]
tc2.ed[83] = tc2.ed[83][4:]
tc2.ed[92] = tc2.ed[92][4:]
tc2.ed[96] = tc2.ed[96][4:]
tc2.ed[100] = tc2.ed[100][4:]

### Cleaning Presidents Obama, Bush, Clinton, Bush Sen., Reagan, Carter, and Ford ratings

In [65]:
# Getting presidential approval ratings dating back to President Ford except Trump. (not up to date for trump) 
# 'https://www.presidency.ucsb.edu/statistics/data/presidential-job-approval'

In [66]:
# load presidential approval data paths 
obama_path = '../../obama.csv'
bush_path = '../../bush.csv'
clinton_path = '../../clinton.csv'
bush_senior_path = '../../bush_senior.csv'
reagan_path = '../../reagan.csv'
carter_path = '../../carter.csv'
ford_path = '../../ford.csv'

In [67]:
# save them into dataframes 
obama_df = pd.read_csv(obama_path)
bush_df = pd.read_csv(bush_path)
clinton_df = pd.read_csv(clinton_path)
bush_senior_df = pd.read_csv(bush_senior_path)
reagan_df = pd.read_csv(reagan_path)
carter_df = pd.read_csv(carter_path)
ford_df = pd.read_csv(ford_path)

In [68]:
bush_df.dtypes

Start Date       object
End Date         object
Approving         int64
Disapproving      int64
Unsure/NoData     int64
dtype: object

In [69]:
# Change 'Start Date' & 'End Date' columns to datetime types 
obama_df['Start Date'] = pd.to_datetime(obama_df['Start Date'])
obama_df['End Date'] = pd.to_datetime(obama_df['End Date'])

bush_df['Start Date'] = pd.to_datetime(bush_df['Start Date'])
bush_df['End Date'] = pd.to_datetime(bush_df['End Date'])

clinton_df['Start Date'] = pd.to_datetime(clinton_df['Start Date'])
clinton_df['End Date'] = pd.to_datetime(clinton_df['End Date'])

bush_senior_df['Start Date'] = pd.to_datetime(bush_senior_df['Start Date'])
bush_senior_df['End Date'] = pd.to_datetime(bush_senior_df['End Date'])

reagan_df['Start Date'] = pd.to_datetime(reagan_df['Start Date'])
reagan_df['End Date'] = pd.to_datetime(reagan_df['End Date'])

carter_df['Start Date'] = pd.to_datetime(carter_df['Start Date'])
carter_df['End Date'] = pd.to_datetime(carter_df['End Date'])

ford_df['Start Date'] = pd.to_datetime(ford_df['Start Date'])
ford_df['End Date'] = pd.to_datetime(ford_df['End Date'])

In [70]:
# Change headers to where they don't have space
obama_df = obama_df.rename(columns={
                           'Start Date':'start_date_obama',
                           'End Date':'end_date_obama',
                           'Approving':'approve_obama',
                           'Disapproving':'dissaprove_obama',
                           'Unsure/NoData':'unsure_obama'
})

bush_df = bush_df.rename(columns={
                           'Start Date':'start_date_bush',
                           'End Date':'end_date_bush',
                           'Approving':'approve_bush',
                           'Disapproving':'dissaprove_bush',
                           'Unsure/NoData':'unsure_bush'
})

clinton_df = clinton_df.rename(columns={
                           'Start Date':'start_date_clinton',
                           'End Date':'end_date_clinton',
                           'Approving':'approve_clinton',
                           'Disapproving':'dissaprove_clinton',
                           'Unsure/NoData':'unsure_clinton'
})

bush_senior_df = bush_senior_df.rename(columns={
                           'Start Date':'start_date_bush_senior',
                           'End Date':'end_date_bush_senior',
                           'Approving':'approve_bush_senior',
                           'Disapproving':'dissaprove_bush_senior',
                           'Unsure/NoData':'unsure_bush_senior'
})

reagan_df = reagan_df.rename(columns={
                           'Start Date':'start_date_reagan',
                           'End Date':'end_date_reagan',
                           'Approving':'approve_reagan',
                           'Disapproving':'dissaprove_reagan',
                           'Unsure/NoData':'unsure_reagan'
})

carter_df = carter_df.rename(columns={
                           'Start Date':'start_date_carter',
                           'End Date':'end_date_carter',
                           'Approving':'approve_carter',
                           'Disapproving':'dissaprove_carter',
                           'Unsure/NoData':'unsure_carter'
})

ford_df = ford_df.rename(columns={
                           'Start Date':'start_date_ford',
                           'End Date':'end_date_ford',
                           'Approving':'approve_ford',
                           'Disapproving':'dissaprove_ford',
                           'Unsure/NoData':'unsure_ford'
})

In [71]:
# check for null values in former Presidents' dataframes
print(obama_df.isnull().any())
print('----'*15)

print(bush_df.isnull().any())
print('----'*15)

print(clinton_df.isnull().any())
print('----'*15)

print(bush_senior_df.isnull().any())
print('----'*15)

print(reagan_df.isnull().any())
print('----'*15)

print(carter_df.isnull().any())
print('----'*15)

print(ford_df.isnull().any())
print('----'*15)

start_date_obama    False
end_date_obama      False
approve_obama       False
dissaprove_obama    False
unsure_obama        False
dtype: bool
------------------------------------------------------------
start_date_bush    False
end_date_bush      False
approve_bush       False
dissaprove_bush    False
unsure_bush        False
dtype: bool
------------------------------------------------------------
start_date_clinton    False
end_date_clinton      False
approve_clinton       False
dissaprove_clinton    False
unsure_clinton        False
dtype: bool
------------------------------------------------------------
start_date_bush_senior    False
end_date_bush_senior      False
approve_bush_senior       False
dissaprove_bush_senior    False
unsure_bush_senior        False
dtype: bool
------------------------------------------------------------
start_date_reagan    False
end_date_reagan      False
approve_reagan       False
dissaprove_reagan    False
unsure_reagan        False
dtype: bool
------

### Congress Approval Cleaning

In [72]:
# url for congress approval ratings
url_congr = 'https://news.gallup.com/poll/1600/congress-public.aspx'

In [73]:
# loading and saving congress' approval ratings into a dataframe 
list_congr = pd.read_html(url_congr)
congress_df = list_congr[0]

In [74]:
congress_df.head(2)

Unnamed: 0_level_0,Unnamed: 0_level_0,Approve,Disapprove,No opinion,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,...,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,Unnamed: 37_level_0,Unnamed: 38_level_0,Unnamed: 39_level_0,Unnamed: 40_level_0,Unnamed: 41_level_0,Unnamed: 42_level_0
Unnamed: 0_level_1,Unnamed: 0_level_1.1,%,%,%,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,...,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
0,2018 Dec 3-12,18,75,7,,,,,,,...,,,,,,,,,,
1,2018 Nov 1-11,21,74,5,,,,,,,...,,,,,,,,,,


In [75]:
# Drop a level from headers
congress_df.columns = congress_df.columns.droplevel()

In [76]:
# printing the length of columns array
len(congress_df.columns.values)

43

In [77]:
congress_df = congress_df.drop(congress_df.index[333])

In [78]:
# dropping some columns
congress_cleaning = congress_df[['Unnamed: 0_level_1','%']]

In [79]:
congress_clean = congress_cleaning.rename(columns={
    'Unnamed: 0_level_1':'weekly','%':'approve','%':'disapprove','%':'no_opinion_congress'
})

In [80]:
# print last five rows
congress_clean.head(3)

Unnamed: 0,weekly,no_opinion_congress,no_opinion_congress.1,no_opinion_congress.2
0,2018 Dec 3-12,18,75,7
1,2018 Nov 1-11,21,74,5
2,2018 Oct 15-28,21,74,6


In [81]:
# check for null values
congress_clean.isnull().any()

weekly                 False
no_opinion_congress    False
no_opinion_congress    False
no_opinion_congress    False
dtype: bool

##### Trying to rename by index

In [82]:
# changing column names by index
congress_clean.columns.values[1] = 'approve_congress'
congress_clean.columns.values[2] = 'disapprove_congress'

In [83]:
# check types
congress_clean.dtypes

weekly                 object
approve_congress       object
disapprove_congress    object
no_opinion_congress    object
dtype: object

In [84]:
# Setting approve, disapprove and no_opinion columns to numeric types
congress_clean['approve_congress'] = pd.to_numeric(congress_clean['approve_congress'])
congress_clean['disapprove_congress'] = pd.to_numeric(congress_clean['disapprove_congress'])
congress_clean['no_opinion_congress'] = pd.to_numeric(congress_clean['no_opinion_congress'])

In [85]:
congress_clean.head(3)

Unnamed: 0,weekly,approve_congress,disapprove_congress,no_opinion_congress
0,2018 Dec 3-12,18,75,7
1,2018 Nov 1-11,21,74,5
2,2018 Oct 15-28,21,74,6


##### This part will deal with slicing 'weekly' column

In [86]:
cc = congress_clean.copy()

# Separate the dates in the weekly column into a Start Date & End Date column

# Separate into usable pieces
yr = cc['weekly'].str.slice(0,4,1)
mo = cc['weekly'].str.slice(5,8,1)
lo = cc['weekly'].str.slice(9,17,1)
rs = lo.str.split('-')
rs1 = rs.str.get(0)
rs2 = rs.str.get(1)
rs3 = rs2.str.split(' ')
rs4 = rs3.str.get(0)
rs5 = rs3.str.get(1)

# Build the Start Date (sd)
cc['sd'] = mo + " " + rs1 + " " + yr

# Build the End Date (ed)
nulls = rs5.isnull()
nulls
for x in nulls:
    if x == True:
        cc['ed'] = mo + " " + rs2 + " " + yr
    else:
        cc['ed'] = rs4 + " " + rs5 + " " + yr

# Create a new copy
cc2 = cc.copy()
cc2.head()


# Drop row 304 -- 304 1992 Mar 3 ^ 18 78 3 Mar 3 ^ 1992 NaN
cc3 = cc2.drop([304], axis=0)
cc3


# Reset the index after the deletion of row 304
cc4 = cc3.reset_index(drop=True)
cc4


cc5 = cc4.copy()

# Fix 10 rows that have a duplicated month in the end date column
cc5.ed[51] = cc5.ed[51][4:]
cc5.ed[114] = cc5.ed[114][4:]
cc5.ed[251] = cc5.ed[251][4:]
cc5.ed[262] = cc5.ed[262][4:]
cc5.ed[269] = cc5.ed[269][4:]
cc5.ed[278] = cc5.ed[278][4:]
cc5.ed[311] = cc5.ed[311][4:]
cc5.ed[313] = cc5.ed[313][4:]
cc5.ed[325] = cc5.ed[325][4:]
cc5.ed[328] = cc5.ed[328][4:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [87]:
cc5.sd = pd.to_datetime(cc5.sd)
cc5.ed = pd.to_datetime(cc5.ed)

In [88]:
cc5.dtypes

weekly                         object
approve_congress                int64
disapprove_congress             int64
no_opinion_congress             int64
sd                     datetime64[ns]
ed                     datetime64[ns]
dtype: object

#### Cleaning Unemployment Rate Data

In [89]:
# Federal Reserve Bank of St. Louis (Monthly Unemployment rate)
url_unemp = 'http://www.multpl.com/unemployment/table?f=m'

In [90]:
# loading data
list_unemp = pd.read_html(url_unemp)

In [91]:
# saving into a dataframe 
unemp_df = list_unemp[0]

In [92]:
# Look up
unemp_df.head(3)

Unnamed: 0,0,1
0,Date,Rate Value
1,"Dec 1, 2018",3.90%
2,"Nov 1, 2018",3.70%


In [93]:
# Rename headers
unemp_df = unemp_df.rename(columns={
    0:'date_unemp',1:'unemp_rate'
})

In [94]:
# Drop first row
unemp_clean = unemp_df.drop(unemp_df.index[0])

In [95]:
# Reset index
unemp_clean.reset_index(drop=True).head(2)

Unnamed: 0,date_unemp,unemp_rate
0,"Dec 1, 2018",3.90%
1,"Nov 1, 2018",3.70%


In [96]:
# Change types
unemp_clean.date_unemp = pd.to_datetime(unemp_clean.date_unemp)

In [97]:
# unemp_clean['unemp_rate'] = unemp_clean['unemp_rate'].str.rstrip('%').astype('float')