### Cleaning Data

In [164]:
import pandas as pd 

##### President and Congress 1975 through 2010

In [68]:
# loading url of party control 1933-2010 from southeast missoury state univerity
url_semo = 'http://cstl-cla.semo.edu/rdrenka/ui320-75/presandcongress.asp'

In [69]:
# Creating a dataframe 
list_semo = pd.read_html(url_semo)

# DataFrame
df = list_semo[0]

In [70]:
list_del = [0,1,41,42]

In [71]:
# Create a list of range 2 to 22 inclusive 
list_range = list(range(2,23))

In [72]:
# extend first list to the range list
list_del.extend(list_range)

In [73]:
# drop the indices in our dataframe using list_del list 
party_shutdown_df = df.drop(df.index[list_del])

In [74]:
party_with_headers = party_shutdown_df.rename(columns={
    0:"years",1:"president",2:"party_president",4:"house_dem_seats",5:"house_rep_seats",6:"house_other_seats",
    7:"senate_dem_seats",8:"senate_rep_seats",9:"senate_other_seats",10:"gov_status"
})

# delete column with header '3'
del party_with_headers[3]

In [163]:
# resetting dataframe index 
party_with_headers.reset_index(drop=True).head()

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,term_start,term_end
0,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided,1975,1976
1,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified,1977,1978
2,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified,1979,1980
3,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR),1981,1982
4,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR),1983,1984


In [76]:
# checking if there is any NaN value left in the dataframe 
party_with_headers.isnull().values.any()

False

In [77]:
# looking up types of columns in the dataframe
party_with_headers.dtypes

years                  object
president              object
party_president        object
house_dem_seats        object
house_rep_seats        object
house_other_seats      object
senate_dem_seats      float64
senate_rep_seats       object
senate_other_seats    float64
gov_status             object
dtype: object

In [78]:
# create a copy of dataframe to change types
party_df_types = party_with_headers

In [79]:
# Converting three series to numerics
party_df_types['house_dem_seats'] = pd.to_numeric(party_df_types['house_dem_seats'])
party_df_types['house_rep_seats'] = pd.to_numeric(party_df_types['house_rep_seats'])
party_df_types['house_other_seats'] = pd.to_numeric(party_df_types['house_other_seats'])

In [80]:
party_df_types.dtypes

years                  object
president              object
party_president        object
house_dem_seats         int64
house_rep_seats         int64
house_other_seats       int64
senate_dem_seats      float64
senate_rep_seats       object
senate_other_seats    float64
gov_status             object
dtype: object

##### 'senate_rep_seats' cannot be changed to float or int yet because of the value '49*'

In [81]:
# using .loc to locate the "49*" value and changing it to '49'
party_df_types['senate_rep_seats'].loc[party_df_types['senate_rep_seats'] == "49*"] = '49'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [82]:
party_df_types['senate_rep_seats'] = pd.to_numeric(party_df_types['senate_rep_seats'])

In [83]:
party_df_types.dtypes

years                  object
president              object
party_president        object
house_dem_seats         int64
house_rep_seats         int64
house_other_seats       int64
senate_dem_seats      float64
senate_rep_seats        int64
senate_other_seats    float64
gov_status             object
dtype: object

In [161]:
# resetting index again just because anaconda is confused
party_df_types.reset_index(drop=True).head()

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,term_start,term_end
0,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided,1975,1976
1,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified,1977,1978
2,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified,1979,1980
3,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR),1981,1982
4,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR),1983,1984


In [160]:
party_df_types["term_start"] = party_df_types["years"].str.slice(0,4,1)
party_df_types

party_df_types["term_end"] = party_df_types["years"].str.slice(0,2,1) + party_df_types["years"].str.slice(5,7,1)
party_df_types.head()

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,term_start,term_end
23,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided,1975,1976
24,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified,1977,1978
25,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified,1979,1980
26,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR),1981,1982
27,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR),1983,1984


##### President Trump Approval Ratings

In [87]:
# url for President Trump approval ratings 
url_trump_gallup = "https://news.gallup.com/poll/203198/presidential-approval-ratings-donald-trump.aspx"

In [88]:
# Read the url
list_trump_gallup = pd.read_html(url_trump_gallup)

In [146]:
trump_clean.columns

MultiIndex(levels=[['Approve', 'Disapprove', 'No opinion', 'Unnamed: 0_level_0'], ['%', 'Unnamed: 0_level_1']],
           labels=[[3, 0, 1, 2], [1, 0, 0, 0]])

In [150]:
# save the section in dataframe to clean 
trump_not_clean = list_trump_gallup[2]
list_trump_del = [0,102]
# getting rid of second row and last
trump_clean = trump_not_clean.drop(trump_not_clean.index[list_trump_del])

In [None]:
# drop one level of the multi index column headers
trump_clean.columns = trump_clean.columns.droplevel()

In [154]:
# Column names
trump_clean.columns

Index(['Unnamed: 0_level_1', '%', '%', '%'], dtype='object')

In [155]:
trump_clean.head(2)

Unnamed: 0,Unnamed: 0_level_1,%,%.1,%.2
1,2018 Dec 17-22,39,55,5
2,2018 Dec 10-16,38,57,4


In [162]:
# Renaming column headers
trump_cleaner = trump_clean.rename(columns={
    'Unnamed: 0_level_1': 'weekly', '%':'approve','%':'disapprove','%':'no_opinion'
})

trump_cleaner.reset_index(drop=True).head()

Unnamed: 0,weekly,no_opinion,no_opinion.1,no_opinion.2
0,2018 Dec 17-22,39,55,5
1,2018 Dec 10-16,38,57,4
2,2018 Dec 3-9,40,56,4
3,2018 Nov 26-Dec 2,40,56,4
4,2018 Nov 19-25,38,60,3


##### Presidents Obama, Bush, Clinton, Bush Sen., Reagan, Carter, and Ford ratings

In [86]:
# Getting presidential approval ratings dating back to President Ford except Trump. (not up to date for trump) 
# 'https://www.presidency.ucsb.edu/statistics/data/presidential-job-approval'

In [104]:
# load presidential approval data paths 
obama_path = '../../obama.csv'
bush_path = '../../bush.csv'
clinton_path = '../../clinton.csv'
bush_senior_path = '../../bush_senior.csv'
reagan_path = '../../reagan.csv'
carter_path = '../../carter.csv'
ford_path = '../../ford.csv'

In [105]:
# save them into dataframes 
obama_df = pd.read_csv(obama_path)
bush_df = pd.read_csv(bush_path)
clinton_df = pd.read_csv(clinton_path)
bush_senior_df = pd.read_csv(bush_senior_path)
reagan_df = pd.read_csv(reagan_path)
carter_df = pd.read_csv(carter_path)
ford_df = pd.read_csv(ford_path)

##### Congress Approval

In [197]:
# url for congress approval ratings
url_congr = 'https://news.gallup.com/poll/1600/congress-public.aspx'

In [198]:
# loading and saving congress' approval ratings into a dataframe 
list_congr = pd.read_html(url_congr)
congress_df = list_congr[0]

In [199]:
congress_df.head(2)

Unnamed: 0_level_0,Unnamed: 0_level_0,Approve,Disapprove,No opinion,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,...,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,Unnamed: 37_level_0,Unnamed: 38_level_0,Unnamed: 39_level_0,Unnamed: 40_level_0,Unnamed: 41_level_0,Unnamed: 42_level_0
Unnamed: 0_level_1,Unnamed: 0_level_1.1,%,%,%,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,...,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
0,2018 Dec 3-12,18,75,7,,,,,,,...,,,,,,,,,,
1,2018 Nov 1-11,21,74,5,,,,,,,...,,,,,,,,,,


In [200]:
# Drop a level from headers
congress_df.columns = congress_df.columns.droplevel()

In [201]:
# printing the length of columns array
len(congress_df.columns.values)

43

In [202]:
congress_df = congress_df.drop(congress_df.index[333])

In [203]:
# dropping some columns
congress_cleaning = congress_df[['Unnamed: 0_level_1','%']]

In [205]:
congress_clean = congress_cleaning.rename(columns={
    'Unnamed: 0_level_1':'weekly','%':'approve','%':'disapprove','%':'no_opinion'
})

In [208]:
# print last five rows
congress_clean.tail()

Unnamed: 0,weekly,no_opinion,no_opinion.1,no_opinion.2
328,1975 Apr 18-21,38,48,14
329,1975 Feb 28-Mar 3,32,50,18
330,1974 Oct 11-14,35,43,22
331,1974 Aug 16-19,47,34,19
332,1974 Apr 12-15,30,47,23


In [209]:
congress_clean.dtypes

weekly        object
no_opinion    object
no_opinion    object
no_opinion    object
dtype: object