In [None]:
# Data Science Course Week 1 - Pandas and Data Manipulation

## We will be using real data sources to explore the features of Pandas

MovieLens 100k movie rating data:
    main page: http://grouplens.org/datasets/movielens/
    data dictionary: http://files.grouplens.org/datasets/movielens/ml-100k-README.txt
    files: u.user, u.data, u.item

WHO alcohol consumption data:
    article: http://fivethirtyeight.com/datalab/dear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits/    
    original data: https://github.com/fivethirtyeight/data/tree/master/alcohol-consumption
    file: drinks.csv (with additional 'continent' column)

National UFO Reporting Center data:
    main page: http://www.nuforc.org/webreports.html
    file: ufo.csv


In [2]:
import pandas as pd

### Reading Files, Selecting Columns, and Summarizing

In [2]:
# can read a file from local computer or directly from a URL
pd.read_table('u.user', header=None)
#pd.read_table('https://raw.githubusercontent.com/alasdaird/SYD_DAT_6/master/labs/Week%201/u.user')

Unnamed: 0,0
0,1|24|M|technician|85711
1,2|53|F|other|94043
2,3|23|M|writer|32067
3,4|24|M|technician|43537
4,5|33|F|other|15213
5,6|42|M|executive|98101
6,7|57|M|administrator|91344
7,8|36|M|administrator|05201
8,9|29|M|student|01002
9,10|53|M|lawyer|90703


In [3]:
# read 'u.user' into 'users'
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('u.user', sep='|', header=None, names=user_cols, index_col='user_id', dtype={'zip_code':str})

In [4]:
# examine the users data
users                   # print the first 30 and last 30 rows
print users[:30]
print(users.tail(30))
type(users)             # DataFrame
print users.head(5)     # print the first 5 rows
print users.head(10)    # print the first 10 rows
print users.tail(5)     # print the last 5 rows
users.index             # "the index" (aka "the labels")
users.columns           # column names (which is "an index")
users.dtypes            # data types of each column
users.shape             # number of rows and columns
users.values            # underlying numpy array
users.info()            # concise summary (including memory usage)

         age gender     occupation zip_code
user_id                                    
1         24      M     technician    85711
2         53      F          other    94043
3         23      M         writer    32067
4         24      M     technician    43537
5         33      F          other    15213
6         42      M      executive    98101
7         57      M  administrator    91344
8         36      M  administrator    05201
9         29      M        student    01002
10        53      M         lawyer    90703
11        39      F          other    30329
12        28      F          other    06405
13        47      M       educator    29206
14        45      M      scientist    55106
15        49      F       educator    97301
16        21      M  entertainment    10309
17        30      M     programmer    06355
18        35      F          other    37212
19        40      M      librarian    02138
20        42      F      homemaker    95660
21        26      M         writ

In [5]:
# select a column
users['zip_code']       # select one column
print users['zip_code']   
#type(users['gender'])  # Series
#type(users[['gender']]) # DataFrame
frame = type(users[['zip_code']]) # select one column using the DataFrame attribute
print frame

user_id
1      85711
2      94043
3      32067
4      43537
5      15213
6      98101
7      91344
8      05201
9      01002
10     90703
11     30329
12     06405
13     29206
14     55106
15     97301
16     10309
17     06355
18     37212
19     02138
20     95660
21     30068
22     40206
23     48197
24     94533
25     55107
26     21044
27     30030
28     55369
29     94043
30     55436
       ...  
914    08105
915    60614
916    N2L5N
917    20006
918    70116
919    14216
920    90008
921    98801
922    21114
923    E2E3R
924    11753
925    49036
926    01701
927    55428
928    55408
929    53711
930    07310
931    33556
932    06437
933    48105
934    22902
935    66221
936    32789
937    98072
938    55038
939    33319
940    02215
941    97229
942    78209
943    77841
Name: zip_code, dtype: object
<class 'pandas.core.frame.DataFrame'>


In [6]:
# summarize (describe) the data
users.describe()                     # describe all numeric columns
print users.describe()        
users.describe(include=['object'])   # describe all object columns (can include multiple types)
print users.describe(include=['object'])
users.describe(include='all')        # describe all columns
print users.describe(include='all') 
users.gender.describe()              # describe a single column
print users.gender.describe()
users.age.mean()                     # only calculate the mean

              age
count  943.000000
mean    34.051962
std     12.192740
min      7.000000
25%     25.000000
50%     31.000000
75%     43.000000
max     73.000000
       gender occupation zip_code
count     943        943      943
unique      2         21      795
top         M    student    55414
freq      670        196        9
               age gender occupation zip_code
count   943.000000    943        943      943
unique         NaN      2         21      795
top            NaN      M    student    55414
freq           NaN    670        196        9
mean     34.051962    NaN        NaN      NaN
std      12.192740    NaN        NaN      NaN
min       7.000000    NaN        NaN      NaN
25%      25.000000    NaN        NaN      NaN
50%      31.000000    NaN        NaN      NaN
75%      43.000000    NaN        NaN      NaN
max      73.000000    NaN        NaN      NaN
count     943
unique      2
top         M
freq      670
Name: gender, dtype: object


34.05196182396607

In [7]:
# count the number of occurrences of each value
users.occupation.value_counts() # most useful for categorical variables
#users.age.value_counts()        # can also be used with numeric variables

student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
salesman          12
lawyer            12
none               9
homemaker          7
doctor             7
Name: occupation, dtype: int64

#### Homework Week 1 (Wendy's solutions - i.e. output)


In [22]:
# for each occupation in 'users', count the number of occurrences

users.occupation.value_counts()


student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
salesman          12
lawyer            12
none               9
homemaker          7
doctor             7
Name: occupation, dtype: int64

In [8]:
# for each occupation, calculate the mean age
users.age.mean()
print users.age.mean()


34.051961824


In [11]:
# for each occupation, calculate the minimum and maximum ages
users.age.min()
print users.age.min()
users.age.max()
print users.age.max()

7
73


In [12]:
# for each combination of occupation and gender, calculate the mean age

users[users.gender.isin(['M'])] 
print users[(users.gender=='M')].age.mean()
users[users.gender.isin(['F'])] 
print users[(users.gender=='F')].age.mean()

34.1492537313
33.8131868132


In [1]:
# randomly sample a DataFrame

user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('u.user', sep='|', header=None, names=user_cols, index_col='user_id', dtype={'zip_code':str})
users = pd.read_table(np.random.rand(6,3),indexing=['user_id','age', 'gender', 'occupation', 'zip_code'])
users = np.arrange(6*3).reshape((6,3))
users = np.random.permutation(6)
users


NameError: name 'pd' is not defined

In [5]:
# detect duplicate users
import numpy as np
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('u.user', sep='|', header=None, names=user_cols, index_col='user_id', dtype={'zip_code':str})
#users = pd.read_table(np.random.rand(6,3),indexing=['user_id','user_id', 'gender', 'gender', 'zip_code','zip_code'])
users
users.ix['gender']


KeyError: 'gender'

## EXERCISE ONE

In [11]:
# read drinks.csv into a DataFrame called 'drinks'
drinks = pd.read_table('drinks.csv', sep=',')
drinks = pd.read_csv('drinks.csv')              # assumes separator is comma
drinks[0:30]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
5,Antigua & Barbuda,102,128,45,4.9,
6,Argentina,193,25,221,8.3,SA
7,Armenia,21,179,11,3.8,EU
8,Australia,261,72,212,10.4,OC
9,Austria,279,75,191,9.7,EU


In [12]:
# print the head and the tail
print drinks.head()
print drinks.tail()

       country  beer_servings  spirit_servings  wine_servings  \
0  Afghanistan              0                0              0   
1      Albania             89              132             54   
2      Algeria             25                0             14   
3      Andorra            245              138            312   
4       Angola            217               57             45   

   total_litres_of_pure_alcohol continent  
0                           0.0        AS  
1                           4.9        EU  
2                           0.7        AF  
3                          12.4        EU  
4                           5.9        AF  
       country  beer_servings  spirit_servings  wine_servings  \
188  Venezuela            333              100              3   
189    Vietnam            111                2              1   
190      Yemen              6                0              0   
191     Zambia             32               19              4   
192   Zimbabwe      

In [62]:
# examine the default index, data types, and shape
print drinks.index
print drinks.dtypes
print drinks.shape

RangeIndex(start=0, stop=193, step=1)
country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object
(193, 6)


In [9]:
# print the 'beer_servings' Series
type(drinks['beer_servings'])
print type(drinks['beer_servings'])
drinks.beer_servings

<class 'pandas.core.series.Series'>


0        0
1       89
2       25
3      245
4      217
5      102
6      193
7       21
8      261
9      279
10      21
11     122
12      42
13       0
14     143
15     142
16     295
17     263
18      34
19      23
20     167
21      76
22     173
23     245
24      31
25     231
26      25
27      88
28      37
29     144
      ... 
163    128
164     90
165    152
166    185
167      5
168      2
169     99
170    106
171      1
172     36
173     36
174    197
175     51
176     51
177     19
178      6
179     45
180    206
181     16
182    219
183     36
184    249
185    115
186     25
187     21
188    333
189    111
190      6
191     32
192     64
Name: beer_servings, dtype: int64

In [14]:
# calculate the average 'beer_servings' for the entire dataset
drinks.describe()                   # summarize all numeric columns
print drinks.describe()
drinks.beer_servings.describe()     # summarize only the 'beer_servings' Series
print drinks.beer_servings.describe() 
drinks.beer_servings.mean()         # only calculate the mean

       beer_servings  spirit_servings  wine_servings  \
count     193.000000       193.000000     193.000000   
mean      106.160622        80.994819      49.450777   
std       101.143103        88.284312      79.697598   
min         0.000000         0.000000       0.000000   
25%        20.000000         4.000000       1.000000   
50%        76.000000        56.000000       8.000000   
75%       188.000000       128.000000      59.000000   
max       376.000000       438.000000     370.000000   

       total_litres_of_pure_alcohol  
count                    193.000000  
mean                       4.717098  
std                        3.773298  
min                        0.000000  
25%                        1.300000  
50%                        4.200000  
75%                        7.200000  
max                       14.400000  
count    193.000000
mean     106.160622
std      101.143103
min        0.000000
25%       20.000000
50%       76.000000
75%      188.000000
max      376.

106.16062176165804

In [15]:
# count the number of occurrences of each 'continent' value and see if it looks correct
drinks.continent.value_counts()

AF    53
EU    45
AS    44
OC    16
SA    12
Name: continent, dtype: int64

#### Filtering and Sorting

using users data set again

In [16]:
# logical filtering with multiple conditions
users[(users.age < 20) & (users.gender=='M')]   print users[:30]
print(users.tail(30))
type(users)             # DataFrame
print users.head(5)     # print the first 5 rows
print users.head(10)    # print the first 10 rows
print users.tail(5)     # print the last 5 rows
users.index             # "the index" (aka "the labels")
users.columns           # column names (which is "an index")
users.dtypes            # data types of each column
users.shape             # number of rows and columns
users.values            # underlying numpy array
users.info()            # concise summary (including memory usage)    # ampersand for AND condition
users[(users.age < 20) | (users.age > 60)]          # pipe for OR condition
users[users.occupation.isin(['doctor', 'lawyer'])]  # alternative to multiple OR conditions

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,53,M,lawyer,90703
125,30,M,lawyer,22202
126,28,F,lawyer,20015
138,46,M,doctor,53211
161,50,M,lawyer,55104
205,47,M,lawyer,6371
251,28,M,doctor,85032
299,29,M,doctor,63108
339,35,M,lawyer,37901
365,29,M,lawyer,20009


In [17]:
# logical filtering: only show users with age < 20
young_bool = users.age < 20         # create a Series of booleans...
users[young_bool]                   # ...and use that Series to filter rows
print users[young_bool]  
users[users.age < 20]               # or, combine into a single step
users[users.age < 20].occupation    # select one column from the filtered results
print users[users.age < 20].occupation 
users[users.age < 20].occupation.value_counts() # value_counts of resulting Series
print users[users.age < 20].occupation.value_counts()

         age gender     occupation zip_code
user_id                                    
30         7      M        student    55436
36        19      F        student    93117
52        18      F        student    55105
57        16      M           none    84010
67        17      M        student    60402
68        19      M        student    22904
101       15      M        student    05146
110       19      M        student    77840
142       13      M          other    48118
179       15      M  entertainment    20755
206       14      F        student    53115
221       19      M        student    20685
223       19      F        student    47906
246       19      M        student    28734
257       17      M        student    77005
258       19      F        student    77801
262       19      F        student    78264
270       18      F        student    63119
281       15      F        student    06059
289       11      M           none    94619
291       19      M        stude

In [18]:
# sorting
users.age.sort_values()                       # sort a column
users.sort_values(by='age')                   # sort a DataFrame by a single column
users.sort_values(by='age', ascending=False)  # use descending order instead
users.sort_values(by=['occupation', 'age'])   # sort by multiple columns

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
118,21,M,administrator,90210
180,22,F,administrator,60202
282,22,M,administrator,20057
317,22,M,administrator,13210
439,23,F,administrator,20817
509,23,M,administrator,10011
394,25,M,administrator,96819
665,25,M,administrator,55412
726,25,F,administrator,80538
78,26,M,administrator,61801


## EXERCISE TWO

using the drinks dataset again

In [19]:
# filter DataFrame to only include European countries
drinks[drinks.continent.isin(['EU'])]                   # ...and use that Series to filter rows
drinks[(drinks.continent=='EU')].country.value_counts() # value_counts of resulting Series

Spain                 1
Serbia                1
Switzerland           1
Monaco                1
Ireland               1
Croatia               1
Latvia                1
Sweden                1
Malta                 1
Andorra               1
France                1
Luxembourg            1
Lithuania             1
Finland               1
Poland                1
Denmark               1
Slovakia              1
Czech Republic        1
Italy                 1
Georgia               1
Bulgaria              1
Ukraine               1
Norway                1
Bosnia-Herzegovina    1
Belgium               1
Cyprus                1
Macedonia             1
Hungary               1
Albania               1
Greece                1
Armenia               1
Germany               1
Austria               1
Slovenia              1
United Kingdom        1
San Marino            1
Azerbaijan            1
Montenegro            1
Iceland               1
Estonia               1
Netherlands           1
Belarus         

In [20]:
# filter DataFrame to only include European countries with wine_servings > 300
drinks[drinks.continent.isin(['EU'])] 
drinks[(drinks.continent=='EU')].country.value_counts() & drinks[drinks.wine_servings > 300].country.value_counts() # ...and use that Series to filter rows


Albania               False
Andorra                True
Armenia               False
Austria               False
Azerbaijan            False
Belarus               False
Belgium               False
Bosnia-Herzegovina    False
Bulgaria              False
Croatia               False
Cyprus                False
Czech Republic        False
Denmark               False
Estonia               False
Finland               False
France                 True
Georgia               False
Germany               False
Greece                False
Hungary               False
Iceland               False
Ireland               False
Italy                 False
Latvia                False
Lithuania             False
Luxembourg            False
Macedonia             False
Malta                 False
Moldova               False
Monaco                False
Montenegro            False
Netherlands           False
Norway                False
Poland                False
Portugal               True
Romania             

In [21]:
# calculate the average 'beer_servings' for all of Europe
drinks[drinks.continent.isin(['EU'])] 
drinks[(drinks.continent=='EU')].beer_servings.mean()

193.77777777777777

In [57]:
# determine which 10 countries have the highest amount of 'total_litres_of_pure_alcohol'

drinks.sort_values(by='total_litres_of_pure_alcohol',ascending=False).head(10)                           
 # The countries are Belarus, Lithuania,Andorra,Grenada,Czech Republic, France, Russian Federation, Ireland, Slovakia and Luxembourg

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
15,Belarus,142,373,42,14.4,EU
98,Lithuania,343,244,56,12.9,EU
3,Andorra,245,138,312,12.4,EU
68,Grenada,199,438,28,11.9,
45,Czech Republic,361,170,134,11.8,EU
61,France,127,151,370,11.8,EU
141,Russian Federation,247,326,73,11.5,AS
81,Ireland,313,118,165,11.4,EU
155,Slovakia,196,293,116,11.4,EU
99,Luxembourg,236,133,271,11.4,EU


In [103]:
# rename the column 'beer_servings' to 'beer'
drinks = pd.read_table('drinks.csv', sep=',')
drinks = pd.read_csv('drinks.csv')              # assumes separator is comma
drinks.rename(columns={'beer_servings':'beer'})

Unnamed: 0,country,beer,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
5,Antigua & Barbuda,102,128,45,4.9,
6,Argentina,193,25,221,8.3,SA
7,Armenia,21,179,11,3.8,EU
8,Australia,261,72,212,10.4,OC
9,Austria,279,75,191,9.7,EU


In [118]:
# add a new column as a function of  existing column, total_servings = beer + wine + spirits

drinks = pd.read_table('drinks.csv', sep=',')
drinks = pd.read_csv('drinks.csv')              
drinks.rename(columns={'beer_servings':'beer','wine_servings':'wine','spirit_servings':'spirits'})
# I tried this function below but it did not work
drinks['total_servings'] = drinks['wine' + 'spirits' + 'beer']
drinks


Unnamed: 0,country,beer,spirits,wine,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
5,Antigua & Barbuda,102,128,45,4.9,
6,Argentina,193,25,221,8.3,SA
7,Armenia,21,179,11,3.8,EU
8,Australia,261,72,212,10.4,OC
9,Austria,279,75,191,9.7,EU


In [119]:
# remove the column you just added

drinks = pd.read_table('drinks.csv', sep=',')
drinks = pd.read_csv('drinks.csv')              
drinks.rename(columns={'beer_servings':'beer','wine_servings':'wine','spirit_servings':'spirits'})
drinks['total_servings'] = drinks['wine' + 'spirits' + 'beer']
drinks.drop(labels=['total_servings'],axis=1, inplace=True)
drinks


KeyError: 'winespiritsbeer'

### Handling Missing Values

In [93]:
# missing values are usually excluded by default
drinks.continent.value_counts()              # excludes missing values
drinks.continent.value_counts(dropna=False)  # includes missing values

AF     53
EU     45
AS     44
NaN    23
OC     16
SA     12
Name: continent, dtype: int64

In [92]:
# find missing values in a Series
drinks.continent.isnull()           # True if missing, False if not missing
drinks.continent.isnull().sum()     # count the missing values
drinks.continent.notnull()          # True if not missing, False if missing
drinks[drinks.continent.notnull()]  # only show rows where continent is not missing

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
6,Argentina,193,25,221,8.3,SA
7,Armenia,21,179,11,3.8,EU
8,Australia,261,72,212,10.4,OC
9,Austria,279,75,191,9.7,EU
10,Azerbaijan,21,46,5,1.3,EU


In [94]:
# use 'tilde' ~ to negate the boolean values
~drinks.continent.isnull()  

0       True
1       True
2       True
3       True
4       True
5      False
6       True
7       True
8       True
9       True
10      True
11     False
12      True
13      True
14     False
15      True
16      True
17     False
18      True
19      True
20      True
21      True
22      True
23      True
24      True
25      True
26      True
27      True
28      True
29      True
       ...  
163     True
164     True
165     True
166     True
167     True
168     True
169     True
170     True
171     True
172     True
173     True
174    False
175     True
176     True
177     True
178     True
179     True
180     True
181     True
182     True
183     True
184    False
185     True
186     True
187     True
188     True
189     True
190     True
191     True
192     True
Name: continent, dtype: bool

In [95]:
# side note: understanding axes
drinks.sum(axis=0)      # sums "down" the 0 axis (rows)
drinks.sum()            # axis=0 is the default
drinks.sum(axis=1)      # sums "across" the 1 axis (columns)

0        0.0
1      279.9
2       39.7
3      707.4
4      324.9
5      279.9
6      447.3
7      214.8
8      555.4
9      554.7
10      73.3
11     355.3
12     114.0
13       0.0
14     358.3
15     571.4
16     601.5
17     391.8
18      52.1
19      23.4
20     219.8
21     261.6
22     248.4
23     413.2
24      34.6
25     587.3
26      43.3
27      94.3
28      49.0
29     220.0
       ...  
163    318.6
164     98.7
165    405.2
166    575.2
167     57.0
168     17.3
169    364.4
170    222.9
171      6.1
172     58.3
173     63.1
174    366.4
175     75.3
176     81.4
177    124.2
178     57.0
179     62.3
180    496.9
181    158.8
182    550.4
183     48.7
184    499.7
185    376.6
186    136.4
187     50.9
188    443.7
189    116.0
190      6.1
191     57.5
192     90.7
dtype: float64

In [96]:
# find missing values in a DataFrame
drinks.isnull()             # DataFrame of booleans
drinks.isnull().sum()       # count the missing values in each column

country                          0
beer_servings                    0
spirit_servings                  0
wine_servings                    0
total_litres_of_pure_alcohol     0
continent                       23
dtype: int64

In [101]:
# fill in missing values
drinks.continent.fillna(value='NA')                 # fill in missing values with 'NA'
drinks.continent.fillna(value='NA', inplace=True)   # modifies 'drinks' in-place

### Merging Data

In [1]:
# read 'u.item' into 'movies'
movie_cols = ['movie_id', 'title']
movies = pd.read_table('u.item', sep='|', header=None, names=movie_cols, usecols=[0, 1])


NameError: name 'pd' is not defined

In [121]:
# read 'u.data' into 'ratings'
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('u.data', sep='\t', header=None, names=rating_cols)

In [123]:
# merge 'movies' and 'ratings' (inner join on 'movie_id')
movie_ratings = pd.merge(movies, ratings,on='movie_id')
movies.shape
ratings.shape
movie_ratings.shape

(100000, 5)