# Making initial package imports for scraping.

In [2]:
# Importing useful modules from Pandas 
from pandas import Series
import pandas as pd
from pandas import DataFrame



## Scraping data from Movielens website

http://grouplens.org/datasets/movielens/

### Example inspired by Greg Reda

## Reading User Data

In [78]:
# passing column header names for the CSV file

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

#using pandas read_csv function to read data from csv file into a dataframe 
users = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.user', sep = '|', names = u_cols)

users.head() # only displaying the first five values of the imported dataframe


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


## Reading Ratings Data

In [79]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.data', sep = '\t', names = r_cols)

ratings.head(3)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


## Reading Movie Data

In [80]:
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']

movies = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.item', sep = '|',names = m_cols, usecols=range(5))

print movies.head()
print movies.describe() # checking the mathematical functions on the data
movies.dtypes # inquiring the datatype of each feature

   movie_id              title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            imdb_url  
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...  
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...  
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...  
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...  
4  http://us.imdb.com/M/title-exact?Copycat%20(1995)  
          movie_id  video_release_date
count  1682.000000                   0
mean    841.500000                 NaN
std     485.695893                 NaN
min       1.000000                 NaN
25%     421.250000                 NaN
50%     841.500000               

movie_id                int64
title                  object
release_date           object
video_release_date    float64
imdb_url               object
dtype: object

## Trying out some random pandas functions on the data before analysis

In [81]:
users['occupation'].head()
multi_cols = ['occupation', 'sex'] # joining multiple column together
print users[multi_cols].head()

print "\n Filter \n"
# applying filters on the data
oldUsers = users[(users.age > 45) & (users.sex == 'M')]
print oldUsers.head()

# statistical summary of the data in the dataframe
print "\n Summary \n"
print users.age.describe() # summarises certain common mathematical function such as mean, sd, percentiles, etc



   occupation sex
0  technician   M
1       other   F
2      writer   M
3  technician   M
4       other   F

 Filter 

    user_id  age sex     occupation zip_code
6         7   57   M  administrator    91344
9        10   53   M         lawyer    90703
12       13   47   M       educator    29206
25       26   49   M       engineer    21044
46       47   53   M      marketing    07102

 Summary 

count    943.000000
mean      34.051962
std       12.192740
min        7.000000
25%       25.000000
50%       31.000000
75%       43.000000
max       73.000000
dtype: float64


## Grouping data


### Lets see the average ratings per user

In [82]:
ratings.head()
ratings[ratings.user_id == 196]

grouped_data = ratings['movie_id'].groupby(ratings.user_id)
print grouped_data.count().head()


## Average rating per movie

grouped_data_avg = ratings['rating'].groupby(ratings['movie_id'])
average_rating = grouped_data_avg.mean()
print average_rating.head()

user_id
1          272
2           62
3           54
4           24
5          175
Name: movie_id, dtype: int64
movie_id
1           3.878319
2           3.206107
3           3.033333
4           3.550239
5           3.302326
Name: rating, dtype: float64


In [83]:
# Getting the titles for the movies with highest ratings
max_rating = average_rating.max()
good_movie_ids = average_rating[average_rating == max_rating].index
print "\n Best movies are"
movies[movies.movie_id.isin(good_movie_ids)].title 

# Checking the number of ratings for the highest ranked movies
high_ranked_movies = grouped_data_avg.count()
print "\n Number of ratings per movie"
print high_ranked_movies[average_rating == max_rating]



 Best movies are

 Number of ratings per movie
movie_id
814         1
1122        1
1189        3
1201        1
1293        3
1467        2
1500        2
1536        1
1599        1
1653        1
Name: rating, dtype: int64
