In [22]:
import pandas as pd 

In [23]:
# loading url of party control 1933-2010 from southeast missoury state univerity
url_semo = 'http://cstl-cla.semo.edu/rdrenka/ui320-75/presandcongress.asp'

In [24]:
# Creating a dataframe 
list_semo = pd.read_html(url_semo)

# DataFrame
df = list_semo[0]

#### Cleaning Data

In [25]:
list_del = [0,1,41,42]

In [26]:
# Create a list of range 2 to 22 inclusive 
list_range = list(range(2,23))

In [27]:
# extend first list to the range list
list_del.extend(list_range)

In [28]:
# drop the indices in our dataframe using list_del list 
party_shutdown_df = df.drop(df.index[list_del])

In [29]:
party_with_headers = party_shutdown_df.rename(columns={
    0:"years",1:"president",2:"party_president",4:"house_dem_seats",5:"house_rep_seats",6:"house_other_seats",
    7:"senate_dem_seats",8:"senate_rep_seats",9:"senate_other_seats",10:"gov_status"
})

# delete column with header '3'
del party_with_headers[3]

In [30]:
# resetting dataframe index 
party_with_headers.reset_index(drop=True)

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status
0,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided
1,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified
2,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified
3,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR)
4,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR)
5,1985-86,Reagan,R,253,182,0,47.0,53,0.0,divided (HR)
6,1987-88,Reagan,R,258,177,0,55.0,45,0.0,divided
7,1989-90,"Bush, GHW",R,260,175,0,55.0,45,0.0,divided
8,1991-92,Bush,R,267,167,1,56.0,44,0.0,divided
9,1993-94,Clinton,D,258,176,1,57.0,43,0.0,unified


In [31]:
# checking if there is any NaN value left in the dataframe 
party_with_headers.isnull().values.any()

False

In [32]:
# looking up types of columns in the dataframe
party_with_headers.dtypes

years                  object
president              object
party_president        object
house_dem_seats        object
house_rep_seats        object
house_other_seats      object
senate_dem_seats      float64
senate_rep_seats       object
senate_other_seats    float64
gov_status             object
dtype: object

In [33]:
# create a copy of dataframe to change types
party_df_types = party_with_headers

In [34]:
# Converting three series to numerics
party_df_types['house_dem_seats'] = pd.to_numeric(party_df_types['house_dem_seats'])
party_df_types['house_rep_seats'] = pd.to_numeric(party_df_types['house_rep_seats'])
party_df_types['house_other_seats'] = pd.to_numeric(party_df_types['house_other_seats'])

In [35]:
party_df_types.dtypes

years                  object
president              object
party_president        object
house_dem_seats         int64
house_rep_seats         int64
house_other_seats       int64
senate_dem_seats      float64
senate_rep_seats       object
senate_other_seats    float64
gov_status             object
dtype: object

##### 'senate_rep_seats' cannot be changed to float or int yet because of the value '49*'

In [36]:
# using .loc to locate the "49*" value and changing it to '49'
party_df_types['senate_rep_seats'].loc[party_df_types['senate_rep_seats'] == "49*"] = '49'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [37]:
party_df_types['senate_rep_seats'] = pd.to_numeric(party_df_types['senate_rep_seats'])

In [38]:
party_df_types.dtypes

years                  object
president              object
party_president        object
house_dem_seats         int64
house_rep_seats         int64
house_other_seats       int64
senate_dem_seats      float64
senate_rep_seats        int64
senate_other_seats    float64
gov_status             object
dtype: object

In [39]:
# resetting index again just because anaconda is confused
party_df_types.reset_index(drop=True)

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status
0,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided
1,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified
2,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified
3,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR)
4,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR)
5,1985-86,Reagan,R,253,182,0,47.0,53,0.0,divided (HR)
6,1987-88,Reagan,R,258,177,0,55.0,45,0.0,divided
7,1989-90,"Bush, GHW",R,260,175,0,55.0,45,0.0,divided
8,1991-92,Bush,R,267,167,1,56.0,44,0.0,divided
9,1993-94,Clinton,D,258,176,1,57.0,43,0.0,unified


In [40]:
party_df_types["term_start"] = party_df_types["years"].str.slice(0,4,1)
party_df_types

party_df_types["term_end"] = party_df_types["years"].str.slice(0,2,1) + party_df_types["years"].str.slice(5,7,1)
party_df_types

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,term_start,term_end
23,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided,1975,1976
24,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified,1977,1978
25,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified,1979,1980
26,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR),1981,1982
27,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR),1983,1984
28,1985-86,Reagan,R,253,182,0,47.0,53,0.0,divided (HR),1985,1986
29,1987-88,Reagan,R,258,177,0,55.0,45,0.0,divided,1987,1988
30,1989-90,"Bush, GHW",R,260,175,0,55.0,45,0.0,divided,1989,1990
31,1991-92,Bush,R,267,167,1,56.0,44,0.0,divided,1991,1992
32,1993-94,Clinton,D,258,176,1,57.0,43,0.0,unified,1993,1994


In [54]:
# Getting presidential approval ratings dating back to President Ford except Trump. (not up to date for trump) 
# 'https://www.presidency.ucsb.edu/statistics/data/presidential-job-approval'

In [55]:
# url for President Trump approval ratings 
url_trump_gallup = "https://news.gallup.com/poll/203198/presidential-approval-ratings-donald-trump.aspx"

In [56]:
# Read the url
list_trump_gallup = pd.read_html(url_trump_gallup)

In [62]:
# save the section in dataframe to clean 
trump_not_clean = list_trump_gallup[2]

In [65]:
# load presidential approval data paths 
obama_path = '../obama.csv'
bush_path = '../bush.csv'
clinton_path = '../clinton.csv'
bush_senior_path = '../bush_senior.csv'
reagan_path = '../reagan.csv'
carter_path = '../carter.csv'
ford_path = '../ford.csv'

In [68]:
# save them into dataframes 
obama_df = pd.read_csv(obama_path)
bush_df = pd.read_csv(bush_path)
clinton_df = pd.read_csv(clinton_path)
bush_senior_df = pd.read_csv(bush_senior_path)
reagan_df = pd.read_csv(reagan_path)
carter_df = pd.read_csv(carter_path)
ford_df = pd.read_csv(ford_path)