In [1]:
# first, we import the relevent packages.

# for data manipulation
import pandas as pd
import datetime as dt

# to scrape the web
from bs4 import BeautifulSoup as soupy
import requests
import time
import random

# dealing with consistent ssl error
import ssl

# for maths
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# for writing dictionaries and others as files
import json

In [2]:
# set the url for the site with the states
states_url = 'https://state.1keydata.com'

In [3]:
# make a connection to the page
states_page = requests.get(states_url, verify=False)



In [4]:
# verify the connection
states_page.status_code

200

In [5]:
# get the actual html content
states_readable = soupy(states_page.content, 'html')

In [6]:
# now we need to initialize a list to store the states
# we initialize a list with 50 none values
# this makes it much faster to insert the value into the list
list_of_states = [None] * 50

In [7]:
# now we find all the href tags 
subset_states_readable = states_readable.find('div', {'id' : 'colwrap'})
href_all = subset_states_readable.find_all('a', href=True)

In [8]:
# then we get the text for each of the a tag values
for index, state in enumerate(href_all):
    value = state.text
    value = value.replace(' ', '_')
    list_of_states[index] = value

In [9]:
# we use a list comprehension to create the urls
list_of_states = [['https://en.wikipedia.org/wiki/List_of_United_States_representatives_from_' + a, a] for a in list_of_states]

In [10]:
# temporary ssl fix
ssl._create_default_https_context = ssl._create_unverified_context

In [11]:
# initialize a list for our pandas dataframes
# this is more efficient than appending multiple data frames
list_of_data_frames = [None] * 50
# get each table as a pandas data frame
# store it in the list
for index, (url, state) in enumerate(list_of_states):
    sleep = 4
    try:
        table = pd.read_html(url)[0].iloc[:,[0,1,2,3]]
        table.rename({table.columns[0]: 'Representative'}, axis = 'columns', inplace = True)
        for i in range(1,4):
            if 'Democratic' in table.iloc[:,i].values:
                table.rename({table.columns[i]: 'Party'}, axis = 'columns', inplace = True)
            elif ('1st' or 'At-large') in table.iloc[:,i].values:
                table.rename({table.columns[i]: 'District'}, axis = 'columns', inplace = True)
            else:
                table.rename({table.columns[i]: 'Years'}, axis = 'columns', inplace = True)
        table['State'] = state
        table[['Start', 'End']] = table['Years'].str.split('–' or '-', expand = True)
        table = table.drop(columns = 'Years')
        table['Start'] = table['Start'].str.strip()
        table['End'] = table['End'].str.strip()
        table['Start'] = pd.to_datetime(table['Start'], errors='coerce')
        table['End'] = table['End'].replace(['Present', 'present'], 'November 28, 2021')
        table['End'] = pd.to_datetime(table['End'], errors='coerce')
        list_of_data_frames[index] = table
        time.sleep(random.uniform(0,sleep))
    except:
        time.sleep(random.uniform(0,sleep))
        pass

In [12]:
res = [i for i in range(len(list_of_data_frames)) if list_of_data_frames[i] is None]

In [13]:
# this is a list of indexes where we did not store a dataframe
# we will have to go back and scrape these values individually
res

[1, 7, 9, 10, 15, 18, 27, 49]

In [14]:
skel = pd.concat(list_of_data_frames, axis = 0, ignore_index = True)

In [15]:
skel = skel[skel['Start'].isnull() == False]

In [16]:
skel = skel[skel['End'].isnull() == False]

In [17]:
skel['Start_Year'] = skel['Start'].dt.year

In [18]:
skel['End_Year'] = skel['End'].dt.year

In [19]:
skel['Duration'] = skel['End'] - skel['Start']

In [20]:
skel['Duration_simple'] = pd.Series(skel['Duration']).dt.days/365

In [21]:
skel

Unnamed: 0,Representative,Party,District,State,Start,End,Start_Year,End_Year,Duration,Duration_simple
0,James Abercrombie,Whig,2nd,Alabama,1851-03-04,1855-03-04,1851,1855,1461 days,4.002740
1,John Abercrombie,Democratic,At-large,Alabama,1913-03-04,1917-03-04,1913,1917,1461 days,4.002740
2,Robert Aderholt,Republican,4th,Alabama,1997-01-03,2021-11-28,1997,2021,9095 days,24.917808
3,Truman H. Aldrich,Republican,9th,Alabama,1896-06-09,1897-03-04,1896,1897,268 days,0.734247
4,William F. Aldrich,Republican,4th,Alabama,1896-03-13,1897-03-04,1896,1897,356 days,0.975342
...,...,...,...,...,...,...,...,...,...,...
13859,Gardner R. Withrow,Republican,3rd,Wisconsin,1933-03-04,1935-01-03,1933,1935,670 days,1.835616
13860,Gardner R. Withrow,Progressive,3rd,Wisconsin,1935-01-03,1939-01-03,1935,1939,1461 days,4.002740
13861,Gardner R. Withrow,Republican,3rd,Wisconsin,1949-01-03,1961-01-03,1949,1961,4383 days,12.008219
13862,Gilbert M. Woodward,Democratic,7th,Wisconsin,1883-03-04,1885-03-03,1883,1885,730 days,2.000000


In [36]:
skel_2 = skel[skel['End_Year'] > 1950].copy(deep = True)

In [22]:
dict_1 = {}
year_init = 1789

In [23]:
for i in range(1, 117):
    dict_1[year_init] = i
    year_init = year_init + 2

In [24]:
# for year, congress in dict_1.items():
#     skel[str(congress)] = ""

In [25]:
# for index, row in skel.iterrows():
#     for year, congress in dict_1.items():
#         if row['Start'].year <= year:
#             if row['End'].year > (year + 1):
#                 skel.at[index,str(congress)] = 1
#             else:
#                 skel.at[index,str(congress)] = 0
#         else:
#             skel.at[index,str(congress)] = 0

In [26]:
years_congress_starts = dict_1.keys()

In [27]:
skel_restrict = skel.query('Start_Year in @years_congress_starts')

In [38]:
df = pd.DataFrame()

In [29]:
skel_2 = skel.copy(deep=True)

In [None]:
for index, row in skel_2.iterrows():
    df_each = pd.DataFrame()
    if row['Start_Year'] in years_congress_starts:
        while (row['Start_Year'] + 2) - row['End_Year'] > 0:
            df_each = df_each.append(row)
            row['Start_Year'] = row['Start_Year'] + 2
            df_each[-1:]['End_Year'] = df_each[-1:]['Start_Year'] + 2
        else:
            df_each = df_each.append(row)
        df = df.append(df_each)
    else:
        if (row['Start_Year'] + 1) - row['End_Year'] == 0:
            df_each = df_each.append(row)
        elif (row['Start_Year'] + 1) - row['End_Year'] > 0:
            df_each = df_each.append(row)
            row['Start_Year'] = row['Start_Year'] + 1
            while (row['Start_Year'] + 2) - row['End_Year'] > 0:
                df_each = df_each.append(row)
                row['Start_Year'] = row['Start_Year'] + 2
                df_each[-1:]['End_Year'] = df_each[-1:]['Start_Year'] + 2
            if (row['Start_Year'] + 2) - row['End_Year'] == 0:
                df_each = df_each.append(row)
        df = df.append(df_each)     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_each[-1:]['End_Year'] = df_each[-1:]['Start_Year'] + 2
