In [1]:
# to scrape data from wikipedia, we need install the package called lxml
# we can do that here from our notebook or, if we think we will use it often, we could modify our "install_packages"
# shell script to install it automatically each time we start a job in UCloud
%pip install lxml

# import pandas so we can put data in a nice dataframe
# we'll abbreviate pandas as pd, because that's what everybody does
import pandas as pd

Defaulting to user installation because normal site-packages is not writeable
Collecting lxml
  Downloading lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl (8.1 MB)
[K     |████████████████████████████████| 8.1 MB 1.8 MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-5.3.0
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


## Scraping data from the web
Using `pandas.read_html`, we can read data from websites where data is presented in a table-like format. Wikipedia has lots of these, and is a great source for data to play with. Below, we'll look at data from the [List of Sesame Street Muppets](https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets)

In [2]:
# scrape table data from websites

rawdata = pd.read_html("https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets")
df = rawdata[1]
df

Unnamed: 0,Character,Actor/Muppet performer,Description,Unnamed: 3
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...,
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn...",
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br...",
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri...",
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...,Writer Christopher Finch called Anything Muppe...
...,...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i...",
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ...",
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally...",
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...,


In [3]:
list(df)

['Character', 'Actor/Muppet performer', 'Description', 'Unnamed: 3']

## Removing an unwanted column
Below are several ways to get rid of the final column (there are more ways to do this!) If you find the "axis = 1" part in the first method confusing, well, I do too, and [we are not alone](https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean)! My advice for now is to just accept it and move on. Probably the easiest method is to use `pop` (method 3)

In [None]:

# method 1
df = df.drop(['Unnamed: 3'], axis = 1)
df

In [None]:

# method 2
df = df.loc[:,list(df)[0:3]] 
df

In [4]:
# method 3
df.pop('Unnamed: 3')
df

Unnamed: 0,Character,Actor/Muppet performer,Description
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn..."
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br..."
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri..."
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...
...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i..."
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ..."
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally..."
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...


In [9]:
# take a column from a dataframe and assign it to a list variable
character_list = df['Character'].tolist()

In [23]:
# find the first four items in the list
print(character_list[:4])

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford']


In [22]:
# find the last three items in the list
print(character_list[-4:])

['The Twiddlebugs', 'The Two-Headed Monster', 'Wes', 'Zoe']


In [20]:
# find items in the middle of the list
len(character_list)

# characterlist is 120 (python = 119) items long, so printing some values around 60
print(character_list[57:63])

['Herry Monster', 'Honkers', 'Hoots', 'Horatio', 'Humphrey', 'Ingrid']


In [26]:
# select the first 20 items from a list
character_list_20 = character_list[:20]

20

In [31]:
# remove the last item in a list
character_list_20 = character_list_20[:19]

In [32]:
# inpsect the list to make sure the last item was removed
print(character_list_20)

# 19 items in the list, so last item was removed

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Baby Bear', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Big Bird', 'Bip Bippadotta', 'Bruno']


In [35]:
# remove a specific item from the list
# i want to remove "Baby Bear" from the list

character_list_20.remove("Baby Bear")


ValueError: list.remove(x): x not in list

In [36]:

if "Baby Bear" in character_list_20:
    print("Baby Bear was not removed")
else:
    print("successful")

successful


In [37]:
# stick a Kermit on the end of the list
character_list_20.append("Kermit")

In [38]:
# inspect the list to make sure Kermit was added
print(character_list_20)

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Big Bird', 'Bip Bippadotta', 'Bruno', 'Kermit']


In [44]:
# insert an item into a list at a particular position
character_list_20.insert(4, "AnnaBanana")



In [45]:
print(character_list_20)

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford', 'AnnaBanana', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Big Bird', 'Bip Bippadotta', 'Bruno', 'Kermit']


In [47]:
# replace an item in a list
character_list_20[1] = "Abra Cadabra"

In [48]:
# make a new list which adds "is a cute monster" to each item in the list
for i in range(len(character_list_20)):
    character_list_20[i] += " is a cute monster"

In [49]:
print(character_list_20)

['Abby Cadabbyis a cute monster', 'Abra Cadabrais a cute monster', 'Alistair Cookieis a cute monster', 'The Amazing Mumfordis a cute monster', 'AnnaBananais a cute monster', 'Anything Muppetsis a cute monster', 'AM Monstersis a cute monster', 'Aristotleis a cute monster', 'Arlene Franticis a cute monster', 'Barkleyis a cute monster', 'Beautiful Day Monster[broken anchor]is a cute monster', 'Bennett Snerfis a cute monster', 'Bennyis a cute monster', 'Bertis a cute monster', 'Betty Louis a cute monster', 'Biffis a cute monster', 'Big Birdis a cute monster', 'Bip Bippadottais a cute monster', 'Brunois a cute monster', 'Kermitis a cute monster']


In [50]:
# reset list b to original first 20 items from list a
character_list_20 = character_list[:20]

In [52]:
# make a list of your favorite monsters, and then make a new list which only includes the monsters 
# from list b that are also in your favorites list
my_favourite_monsters = ["Bennett Snerf", "Big Bird", "Mad Monty", "Piano", "Thudge McGerk"]

common_monsters = [item for item in character_list_20 if item in my_favourite_monsters]

print(common_monsters)

['Bennett Snerf', 'Big Bird']


In [53]:
# make a new list which includes the monsters from list b that are not in your favorites list
uncommon_monsters = [item for item in character_list_20 if item not in my_favourite_monsters]

print(uncommon_monsters)



['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Baby Bear', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Bip Bippadotta', 'Bruno', 'Buster']


## More fun with lists

In [None]:
# add 10 to each number in d

In [None]:
# divide each number in d by 2

## Dataframe manipulation

In [None]:
# download the student sleep data
df = pd.read_csv("https://raw.githubusercontent.com/ethanweed/ExPsyLing/master/Data/StudentSleep.csv")
df

In [None]:
# find the number of rows and columns in the dataframe


In [None]:
# make a new dataframe df1 which only includes the first 4 rows of the original dataframe


In [None]:
# make another new dataframe df2 which only includes rows 5 through the end of the original dataframe


In [None]:
# make a third dataframe df3 with df2 on top of df1 (hint: use pd.concat)


In [None]:
# overwrite df3 with df1 and df2 back in their original order


In [None]:
# make a new column called "average" which is the mean of the other columns for each row
df3['average'] = df3.mean(axis = 1)
df3

In [None]:
# remove the average column from the dataframe
df3.pop('average')
df3

In [None]:
# make a list of the means of each column in the dataframe
column_means = list(df3.mean())
column_means

In [None]:
# make a list of the means of each row in the dataframe
colnames = list(df3)
colnames

In [None]:
list(zip(colnames, column_means))

In [None]:
student_means = dict(zip(colnames, column_means))
student_means

In [None]:
print('Student 4\'s average:', round(student_means['Student 4'], 3))

In [None]:
# make a dataframe the mean hours of sleep for each student
df_means = pd.DataFrame(zip(colnames, column_means))
df_means.columns = ['Students', 'Sleep Hours']
df_means

In [None]:
# transpose the dataframe
df_transposed = df3.transpose()
df_transposed

In [None]:
colnames = list(df_transposed)
colnames

In [None]:
newcols = ['Day ' + str(x+1) for x in colnames]
newcols

In [None]:
df_transposed.columns = newcols
df_transposed

In [None]:
df_transposed.index.name = 'student'
df_transposed

In [None]:
df_transposed.reset_index(inplace = True)
df_transposed

In [None]:
df_long = pd.melt(df_transposed, id_vars = 'student')
df_long