In [4]:
import pandas as pd 

In [5]:
# loading url of party control 1933-2010 from southeast missoury state univerity
url_semo = 'http://cstl-cla.semo.edu/rdrenka/ui320-75/presandcongress.asp'

In [6]:
# Creating a dataframe 
list_semo = pd.read_html(url_semo)

# DataFrame
df = list_semo[0]

#### Cleaning Data

In [7]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Years,President,Party of President,Congress,House of Representatives,Senate,Government is:,,,,
1,Democrats,Republicans,Other,Democrats,Republicans,Other,,,,,
2,1933-34,Roosevelt,D,73rd,313,117,5,59.0,36,1.0,unified
3,1935-36,Roosevelt,D,74th,322,103,10,69.0,25,2.0,unified
4,1937-38,Roosevelt,D,75th,333,89,13,75.0,17,4.0,unified
5,1939-40,Roosevelt,D,76th,262,169,4,69.0,23,4.0,unified
6,1941-42,Roosevelt,D,77th,267,162,6,66.0,28,2.0,unified
7,1943-44,Roosevelt,D,78th,222,209,4,57.0,38,1.0,unified
8,1945-46,Roosevelt/Truman,D,79th,243,190,2,57.0,38,1.0,unified
9,1947-48,Truman,D,80th,188,246,1,45.0,51,0.0,divided


In [8]:
list_del = [0,1,41,42]

In [9]:
# Create a list of range 2 to 22 inclusive 
list_range = list(range(2,23))

In [10]:
# extend first list to the range list
list_del.extend(list_range)

In [11]:
# drop the indices in our dataframe using list_del list 
party_shutdown_df = df.drop(df.index[list_del])

In [12]:
party_with_headers = party_shutdown_df.rename(columns={
    0:"years",1:"president",2:"party_president",4:"house_dem_seats",5:"house_rep_seats",6:"house_other_seats",
    7:"senate_dem_seats",8:"senate_rep_seats",9:"senate_other_seats",10:"gov_status"
})

# delete column with header '3'
del party_with_headers[3]

In [13]:
# resetting dataframe index 
party_with_headers.reset_index(drop=True)

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status
0,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided
1,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified
2,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified
3,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR)
4,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR)
5,1985-86,Reagan,R,253,182,0,47.0,53,0.0,divided (HR)
6,1987-88,Reagan,R,258,177,0,55.0,45,0.0,divided
7,1989-90,"Bush, GHW",R,260,175,0,55.0,45,0.0,divided
8,1991-92,Bush,R,267,167,1,56.0,44,0.0,divided
9,1993-94,Clinton,D,258,176,1,57.0,43,0.0,unified


In [14]:
# checking if there is any NaN value left in the dataframe 
party_with_headers.isnull().values.any()

False

In [15]:
# looking up types of columns in the dataframe
party_with_headers.dtypes

years                  object
president              object
party_president        object
house_dem_seats        object
house_rep_seats        object
house_other_seats      object
senate_dem_seats      float64
senate_rep_seats       object
senate_other_seats    float64
gov_status             object
dtype: object

In [16]:
# create a copy of dataframe to change types
party_df_types = party_with_headers

In [17]:
# Converting three series to numerics
party_df_types['house_dem_seats'] = pd.to_numeric(party_df_types['house_dem_seats'])
party_df_types['house_rep_seats'] = pd.to_numeric(party_df_types['house_rep_seats'])
party_df_types['house_other_seats'] = pd.to_numeric(party_df_types['house_other_seats'])

In [18]:
party_df_types.dtypes

years                  object
president              object
party_president        object
house_dem_seats         int64
house_rep_seats         int64
house_other_seats       int64
senate_dem_seats      float64
senate_rep_seats       object
senate_other_seats    float64
gov_status             object
dtype: object

##### 'senate_rep_seats' cannot be changed to float or int yet because of the value '49*'

In [19]:
# using .loc to locate the "49*" value and changing it to '49'
party_df_types['senate_rep_seats'].loc[party_df_types['senate_rep_seats'] == "49*"] = '49'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [20]:
party_df_types['senate_rep_seats'] = pd.to_numeric(party_df_types['senate_rep_seats'])

In [21]:
party_df_types.dtypes

years                  object
president              object
party_president        object
house_dem_seats         int64
house_rep_seats         int64
house_other_seats       int64
senate_dem_seats      float64
senate_rep_seats        int64
senate_other_seats    float64
gov_status             object
dtype: object

In [22]:
# resetting index again just because anaconda is confused
party_df_types.reset_index(drop=True)

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status
0,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided
1,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified
2,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified
3,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR)
4,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR)
5,1985-86,Reagan,R,253,182,0,47.0,53,0.0,divided (HR)
6,1987-88,Reagan,R,258,177,0,55.0,45,0.0,divided
7,1989-90,"Bush, GHW",R,260,175,0,55.0,45,0.0,divided
8,1991-92,Bush,R,267,167,1,56.0,44,0.0,divided
9,1993-94,Clinton,D,258,176,1,57.0,43,0.0,unified


In [37]:
#Series.str.slice(start=None, stop=None, step=None)
party_df_types["from_year"] = party_df_types["years"].str.slice(0,4,1)
party_df_types

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,from_year,to_year
23,1975-76,Ford,r,291,144,0,61.0,38,1.0,divided,1975,1976
24,1977-78,Carter,d,292,143,0,62.0,38,0.0,unified,1977,1978
25,1979-80,Carter,d,277,158,0,59.0,41,0.0,unified,1979,1980
26,1981-82,Reagan,r,243,192,0,47.0,53,0.0,divided (HR),1981,1982
27,1983-84,Reagan,r,269,166,0,46.0,54,0.0,divided (HR),1983,1984
28,1985-86,Reagan,r,253,182,0,47.0,53,0.0,divided (HR),1985,1986
29,1987-88,Reagan,r,258,177,0,55.0,45,0.0,divided,1987,1988
30,1989-90,"Bush, GHW",r,260,175,0,55.0,45,0.0,divided,1989,1990
31,1991-92,Bush,r,267,167,1,56.0,44,0.0,divided,1991,1992
32,1993-94,Clinton,d,258,176,1,57.0,43,0.0,unified,1993,1994


In [24]:
# party_df_types["to_year"] = party_df_types["years"].str.slice(5,7,1)
# party_df_types

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,from_year,to_year
23,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided,1975,76
24,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified,1977,78
25,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified,1979,80
26,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR),1981,82
27,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR),1983,84
28,1985-86,Reagan,R,253,182,0,47.0,53,0.0,divided (HR),1985,86
29,1987-88,Reagan,R,258,177,0,55.0,45,0.0,divided,1987,88
30,1989-90,"Bush, GHW",R,260,175,0,55.0,45,0.0,divided,1989,90
31,1991-92,Bush,R,267,167,1,56.0,44,0.0,divided,1991,92
32,1993-94,Clinton,D,258,176,1,57.0,43,0.0,unified,1993,94


In [38]:
party_df_types["to_year"] = party_df_types["years"].str.slice(0,2,1) + party_df_types["years"].str.slice(5,7,1)
party_df_types

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,from_year,to_year
23,1975-76,Ford,r,291,144,0,61.0,38,1.0,divided,1975,1976
24,1977-78,Carter,d,292,143,0,62.0,38,0.0,unified,1977,1978
25,1979-80,Carter,d,277,158,0,59.0,41,0.0,unified,1979,1980
26,1981-82,Reagan,r,243,192,0,47.0,53,0.0,divided (HR),1981,1982
27,1983-84,Reagan,r,269,166,0,46.0,54,0.0,divided (HR),1983,1984
28,1985-86,Reagan,r,253,182,0,47.0,53,0.0,divided (HR),1985,1986
29,1987-88,Reagan,r,258,177,0,55.0,45,0.0,divided,1987,1988
30,1989-90,"Bush, GHW",r,260,175,0,55.0,45,0.0,divided,1989,1990
31,1991-92,Bush,r,267,167,1,56.0,44,0.0,divided,1991,1992
32,1993-94,Clinton,d,258,176,1,57.0,43,0.0,unified,1993,1994


In [26]:
# party_df_types["from_year"] = (party_df_types["years"])[:4]

In [27]:
party_df_types

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,from_year,to_year
23,1975-76,Ford,R,291,144,0,61.0,38,1.0,divided,1975-76,1976
24,1977-78,Carter,D,292,143,0,62.0,38,0.0,unified,1977-78,1978
25,1979-80,Carter,D,277,158,0,59.0,41,0.0,unified,1979-80,1980
26,1981-82,Reagan,R,243,192,0,47.0,53,0.0,divided (HR),1981-82,1982
27,1983-84,Reagan,R,269,166,0,46.0,54,0.0,divided (HR),,1984
28,1985-86,Reagan,R,253,182,0,47.0,53,0.0,divided (HR),,1986
29,1987-88,Reagan,R,258,177,0,55.0,45,0.0,divided,,1988
30,1989-90,"Bush, GHW",R,260,175,0,55.0,45,0.0,divided,,1990
31,1991-92,Bush,R,267,167,1,56.0,44,0.0,divided,,1992
32,1993-94,Clinton,D,258,176,1,57.0,43,0.0,unified,,1994


In [28]:
party_df_types["party_president"] = party_df_types["party_president"].str.lower()
party_df_types

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,from_year,to_year
23,1975-76,Ford,r,291,144,0,61.0,38,1.0,divided,1975-76,1976
24,1977-78,Carter,d,292,143,0,62.0,38,0.0,unified,1977-78,1978
25,1979-80,Carter,d,277,158,0,59.0,41,0.0,unified,1979-80,1980
26,1981-82,Reagan,r,243,192,0,47.0,53,0.0,divided (HR),1981-82,1982
27,1983-84,Reagan,r,269,166,0,46.0,54,0.0,divided (HR),,1984
28,1985-86,Reagan,r,253,182,0,47.0,53,0.0,divided (HR),,1986
29,1987-88,Reagan,r,258,177,0,55.0,45,0.0,divided,,1988
30,1989-90,"Bush, GHW",r,260,175,0,55.0,45,0.0,divided,,1990
31,1991-92,Bush,r,267,167,1,56.0,44,0.0,divided,,1992
32,1993-94,Clinton,d,258,176,1,57.0,43,0.0,unified,,1994


In [29]:
test1 = party_df_types["party_president"]
print( test1)
test1 = test1.str.lower()
print(test1)

23    r
24    d
25    d
26    r
27    r
28    r
29    r
30    r
31    r
32    d
33    d
34    d
35    d
36    r
37    r
38    r
39    r
40    d
Name: party_president, dtype: object
23    r
24    d
25    d
26    r
27    r
28    r
29    r
30    r
31    r
32    d
33    d
34    d
35    d
36    r
37    r
38    r
39    r
40    d
Name: party_president, dtype: object


In [30]:
#df['column name'].astype(np.int64)
import numpy as np
party_df_types2 = party_df_types.copy()
party_df_types2
party_df_types2["senate_dem_seats"] = party_df_types2["senate_dem_seats"].astype(np.int64)
party_df_types2

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,from_year,to_year
23,1975-76,Ford,r,291,144,0,61,38,1.0,divided,1975-76,1976
24,1977-78,Carter,d,292,143,0,62,38,0.0,unified,1977-78,1978
25,1979-80,Carter,d,277,158,0,59,41,0.0,unified,1979-80,1980
26,1981-82,Reagan,r,243,192,0,47,53,0.0,divided (HR),1981-82,1982
27,1983-84,Reagan,r,269,166,0,46,54,0.0,divided (HR),,1984
28,1985-86,Reagan,r,253,182,0,47,53,0.0,divided (HR),,1986
29,1987-88,Reagan,r,258,177,0,55,45,0.0,divided,,1988
30,1989-90,"Bush, GHW",r,260,175,0,55,45,0.0,divided,,1990
31,1991-92,Bush,r,267,167,1,56,44,0.0,divided,,1992
32,1993-94,Clinton,d,258,176,1,57,43,0.0,unified,,1994


In [31]:
party_df_types2["senate_other_seats"] = party_df_types2["senate_other_seats"].astype(np.int64)
party_df_types2

Unnamed: 0,years,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status,from_year,to_year
23,1975-76,Ford,r,291,144,0,61,38,1,divided,1975-76,1976
24,1977-78,Carter,d,292,143,0,62,38,0,unified,1977-78,1978
25,1979-80,Carter,d,277,158,0,59,41,0,unified,1979-80,1980
26,1981-82,Reagan,r,243,192,0,47,53,0,divided (HR),1981-82,1982
27,1983-84,Reagan,r,269,166,0,46,54,0,divided (HR),,1984
28,1985-86,Reagan,r,253,182,0,47,53,0,divided (HR),,1986
29,1987-88,Reagan,r,258,177,0,55,45,0,divided,,1988
30,1989-90,"Bush, GHW",r,260,175,0,55,45,0,divided,,1990
31,1991-92,Bush,r,267,167,1,56,44,0,divided,,1992
32,1993-94,Clinton,d,258,176,1,57,43,0,unified,,1994


In [32]:
for i in party_df_types2.columns:
    print(i)

years
president
party_president
house_dem_seats
house_rep_seats
house_other_seats
senate_dem_seats
senate_rep_seats
senate_other_seats
gov_status
from_year
to_year


In [39]:
party_df_types3 = party_df_types[["years", "from_year", "to_year", "president", "party_president", "house_dem_seats",
                                   "house_rep_seats", "house_other_seats", "senate_dem_seats", "senate_rep_seats",
                                   "senate_other_seats", "gov_status"]]
party_df_types3

Unnamed: 0,years,from_year,to_year,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status
23,1975-76,1975,1976,Ford,r,291,144,0,61.0,38,1.0,divided
24,1977-78,1977,1978,Carter,d,292,143,0,62.0,38,0.0,unified
25,1979-80,1979,1980,Carter,d,277,158,0,59.0,41,0.0,unified
26,1981-82,1981,1982,Reagan,r,243,192,0,47.0,53,0.0,divided (HR)
27,1983-84,1983,1984,Reagan,r,269,166,0,46.0,54,0.0,divided (HR)
28,1985-86,1985,1986,Reagan,r,253,182,0,47.0,53,0.0,divided (HR)
29,1987-88,1987,1988,Reagan,r,258,177,0,55.0,45,0.0,divided
30,1989-90,1989,1990,"Bush, GHW",r,260,175,0,55.0,45,0.0,divided
31,1991-92,1991,1992,Bush,r,267,167,1,56.0,44,0.0,divided
32,1993-94,1993,1994,Clinton,d,258,176,1,57.0,43,0.0,unified


In [44]:
party_df_types3["senate_dem_seats"] = party_df_types3["senate_dem_seats"].astype(np.int64)
party_df_types3["senate_other_seats"] = party_df_types3["senate_other_seats"].astype(np.int64)
party_df_types3["party_president"] = party_df_types3["party_president"].str.upper()
party_df_types3

Unnamed: 0,years,from_year,to_year,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status
23,1975-76,1975,1976,Ford,R,291,144,0,61,38,1,divided
24,1977-78,1977,1978,Carter,D,292,143,0,62,38,0,unified
25,1979-80,1979,1980,Carter,D,277,158,0,59,41,0,unified
26,1981-82,1981,1982,Reagan,R,243,192,0,47,53,0,divided (HR)
27,1983-84,1983,1984,Reagan,R,269,166,0,46,54,0,divided (HR)
28,1985-86,1985,1986,Reagan,R,253,182,0,47,53,0,divided (HR)
29,1987-88,1987,1988,Reagan,R,258,177,0,55,45,0,divided
30,1989-90,1989,1990,"Bush, GHW",R,260,175,0,55,45,0,divided
31,1991-92,1991,1992,Bush,R,267,167,1,56,44,0,divided
32,1993-94,1993,1994,Clinton,D,258,176,1,57,43,0,unified


In [46]:
party_df_types4 = party_df_types3.reset_index(drop=True)
party_df_types4

Unnamed: 0,years,from_year,to_year,president,party_president,house_dem_seats,house_rep_seats,house_other_seats,senate_dem_seats,senate_rep_seats,senate_other_seats,gov_status
0,1975-76,1975,1976,Ford,R,291,144,0,61,38,1,divided
1,1977-78,1977,1978,Carter,D,292,143,0,62,38,0,unified
2,1979-80,1979,1980,Carter,D,277,158,0,59,41,0,unified
3,1981-82,1981,1982,Reagan,R,243,192,0,47,53,0,divided (HR)
4,1983-84,1983,1984,Reagan,R,269,166,0,46,54,0,divided (HR)
5,1985-86,1985,1986,Reagan,R,253,182,0,47,53,0,divided (HR)
6,1987-88,1987,1988,Reagan,R,258,177,0,55,45,0,divided
7,1989-90,1989,1990,"Bush, GHW",R,260,175,0,55,45,0,divided
8,1991-92,1991,1992,Bush,R,267,167,1,56,44,0,divided
9,1993-94,1993,1994,Clinton,D,258,176,1,57,43,0,unified
