# Data Science Course Week 1 - Pandas and Data Manipulation

## We will be using real data sources to explore the features of Pandas

MovieLens 100k movie rating data:
    main page: http://grouplens.org/datasets/movielens/
    data dictionary: http://files.grouplens.org/datasets/movielens/ml-100k-README.txt
    files: u.user, u.data, u.item

WHO alcohol consumption data:
    article: http://fivethirtyeight.com/datalab/dear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits/    
    original data: https://github.com/fivethirtyeight/data/tree/master/alcohol-consumption
    file: drinks.csv (with additional 'continent' column)

National UFO Reporting Center data:
    main page: http://www.nuforc.org/webreports.html
    file: ufo.csv


In [1]:
import pandas as pd

### Reading Files, Selecting Columns, and Summarizing

In [15]:
# can read a file from local computer or directly from a URL
pd.read_table('u.user', header=None)
#pd.read_table('https://raw.githubusercontent.com/alasdaird/SYD_DAT_6/master/labs/Week%201/u.user')

Unnamed: 0,0
0,1|24|M|technician|85711
1,2|53|F|other|94043
2,3|23|M|writer|32067
3,4|24|M|technician|43537
4,5|33|F|other|15213
5,6|42|M|executive|98101
6,7|57|M|administrator|91344
7,8|36|M|administrator|05201
8,9|29|M|student|01002
9,10|53|M|lawyer|90703


In [14]:
# read 'u.user' into 'users'
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('u.user', sep='|', header=None, names=user_cols, index_col='user_id', dtype={'zip_code':str})

In [23]:
# examine the users data
#users                   # print the first 30 and last 30 rows
#type(users)             # DataFrame
#users.head()            # print the first 5 rows
users.head(10)          # print the first 10 rows
#users.tail()            # print the last 5 rows
#users.index             # "the index" (aka "the labels")
#users.columns           # column names (which is "an index")
#users.dtypes            # data types of each column
#users.shape             # number of rows and columns
#users.values            # underlying numpy array
#users.info()            # concise summary (including memory usage)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,5201
9,29,M,student,1002
10,53,M,lawyer,90703


In [31]:
# select a column
#users['gender']         # select one column
type(users['gender'])   # Series
#type(users[['gender']])   # DataFrame
#users.gender            # select one column using the DataFrame attribute


pandas.core.series.Series

In [34]:
# summarize (describe) the data
#users.describe()                    # describe all numeric columns
#users.describe(include=['object'])  # describe all object columns (can include multiple types)
#users.describe(include='all')       # describe all columns
users.gender.describe()             # describe a single column
#users.age.mean()                    # only calculate the mean


count     943
unique      2
top         M
freq      670
Name: gender, dtype: object

In [35]:
# count the number of occurrences of each value
users.occupation.value_counts()     # most useful for categorical variables
#users.age.value_counts()        # can also be used with numeric variables

student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
salesman          12
lawyer            12
none               9
homemaker          7
doctor             7
Name: occupation, dtype: int64

## EXERCISE ONE

In [47]:
# read drinks.csv into a DataFrame called 'drinks'
drinks = pd.read_table('drinks.csv', sep=',')
#drinks = pd.read_csv('drinks.csv')              # assumes separator is comma

In [48]:
# print the head and the tail
#drinks.head()
#drinks.tail()
drinks.describe(include=head)

NameError: name 'head' is not defined

In [43]:
# examine the default index, data types, and shape
drinks.index
#drinks.dtypes
#drinks.shape

RangeIndex(start=0, stop=193, step=1)

In [45]:
# print the 'beer_servings' Series
drinks['beer_servings']
#drinks.beer_servings


0        0
1       89
2       25
3      245
4      217
5      102
6      193
7       21
8      261
9      279
10      21
11     122
12      42
13       0
14     143
15     142
16     295
17     263
18      34
19      23
20     167
21      76
22     173
23     245
24      31
25     231
26      25
27      88
28      37
29     144
      ... 
163    128
164     90
165    152
166    185
167      5
168      2
169     99
170    106
171      1
172     36
173     36
174    197
175     51
176     51
177     19
178      6
179     45
180    206
181     16
182    219
183     36
184    249
185    115
186     25
187     21
188    333
189    111
190      6
191     32
192     64
Name: beer_servings, dtype: int64

In [46]:
# calculate the average 'beer_servings' for the entire dataset
drinks.describe()                   # summarize all numeric columns
#drinks.beer_servings.describe()     # summarize only the 'beer_servings' Series
#drinks.beer_servings.mean()         # only calculate the mean


Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [49]:

# count the number of occurrences of each 'continent' value and see if it looks correct
drinks.continent.value_counts()


AF    53
EU    45
AS    44
OC    16
SA    12
Name: continent, dtype: int64

#### Filtering and Sorting

using users data set again

In [58]:
# logical filtering: only show users with age < 20
#young_bool = users.age < 30         # create a Series of booleans...
#users[young_bool]                   # ...and use that Series to filter rows
users[users.age < 20]               # or, combine into a single step
#users[users.age < 20].occupation    # select one column from the filtered results
#users[users.age < 20].occupation.value_counts()     # value_counts of resulting Series

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,7,M,student,55436
36,19,F,student,93117
52,18,F,student,55105
57,16,M,none,84010
67,17,M,student,60402
68,19,M,student,22904
101,15,M,student,05146
110,19,M,student,77840
142,13,M,other,48118
179,15,M,entertainment,20755


In [63]:
# logical filtering with multiple conditions
users[(users.age < 15) & (users.gender=='F') & (users.occupation=='student')]       # ampersand for AND condition
#users[(users.age < 20) | (users.age > 60)]          # pipe for OR condition
#users[users.occupation.isin(['doctor', 'lawyer'])]  # alternative to multiple OR conditions


Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
206,14,F,student,53115
609,13,F,student,55106
674,13,F,student,55337
813,14,F,student,2136
887,14,F,student,27249


In [71]:
# sorting
#users.age.sort_values()                   # sort a column
#users.sort_values(by='age')                   # sort a DataFrame by a single column
#users.sort_values(by='age', ascending=True)  # use descending order instead
users.sort_values(by=['occupation', 'age', 'zip_code'])   # sort by multiple columns


Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
118,21,M,administrator,90210
317,22,M,administrator,13210
282,22,M,administrator,20057
180,22,F,administrator,60202
509,23,M,administrator,10011
439,23,F,administrator,20817
665,25,M,administrator,55412
726,25,F,administrator,80538
394,25,M,administrator,96819
86,26,M,administrator,46005


## EXERCISE TWO

using the drinks dataset again

In [None]:
# filter DataFrame to only include European countries


In [None]:
# filter DataFrame to only include European countries with wine_servings > 300

In [None]:
# calculate the average 'beer_servings' for all of Europe

In [None]:
# determine which 10 countries have the highest total_litres_of_pure_alcohol

In [None]:
# rename the column 'beer_servings' to 'beer'

In [None]:
# add a new column as a function of existing columns, total_servings = beer + wine + spirits

In [None]:
# remove the column you just added

### Handling Missing Values

In [None]:
# missing values are usually excluded by default
drinks.continent.value_counts()              # excludes missing values
drinks.continent.value_counts(dropna=False)  # includes missing values

In [None]:
# find missing values in a Series
drinks.continent.isnull()           # True if missing, False if not missing
drinks.continent.isnull().sum()     # count the missing values
drinks.continent.notnull()          # True if not missing, False if missing
drinks[drinks.continent.notnull()]  # only show rows where continent is not missing

In [None]:
# use 'tilde' ~ to negate the boolean values
~drinks.continent.isnull()  

In [None]:
# side note: understanding axes
drinks.sum(axis=0)      # sums "down" the 0 axis (rows)
drinks.sum()            # axis=0 is the default
drinks.sum(axis=1)      # sums "across" the 1 axis (columns)

In [None]:
# find missing values in a DataFrame
drinks.isnull()             # DataFrame of booleans
drinks.isnull().sum()       # count the missing values in each column

In [None]:
# fill in missing values
drinks.continent.fillna(value='NA')                 # fill in missing values with 'NA'
drinks.continent.fillna(value='NA', inplace=True)   # modifies 'drinks' in-place

### Merging Data

In [None]:
# read 'u.item' into 'movies'
movie_cols = ['movie_id', 'title']
movies = pd.read_table('u.item', sep='|', header=None, names=movie_cols, usecols=[0, 1])


In [None]:
# read 'u.data' into 'ratings'
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('u.data', sep='\t', header=None, names=rating_cols)

In [None]:
# merge 'movies' and 'ratings' (inner join on 'movie_id')
movie_ratings = pd.merge(movies, ratings)
movies.shape
ratings.shape
movie_ratings.shape