# Introduction to Python and Natural Language Technologies

__Laboratory 05, Data science, Handling text data, Pandas__

__March 11, 2021__

__Judit Ács, Ádám Kovács__


In [None]:
import pandas as pd  # by convention we use the pd alias
%matplotlib inline
import matplotlib
import numpy as np

# Make our charts prettier.
# pyplot is imported explicitly: `import matplotlib` on its own does not load
# the `matplotlib.pyplot` submodule, so accessing it as an attribute would only
# work when something else (e.g. the `%matplotlib inline` magic) imported it.
import matplotlib.pyplot as plt

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 3)
plt.rcParams['font.family'] = 'sans-serif'

We are going to work with the MovieLens dataset. First, we need to download it. http://grouplens.org/datasets/movielens/

Download the file only if it has not been downloaded before.

In [None]:
import os

# Directory that holds the dataset; an unset MOVIELENS environment variable
# falls back to "" so that paths resolve relative to the current directory.
data_dir = os.getenv("MOVIELENS", "")

ml_path = os.path.join(data_dir, "ml.zip")

# Download the archive only when no local copy exists yet.
if not os.path.exists(ml_path):
    print("Download data")
    # `import urllib` alone does not load the `urllib.request` submodule, and
    # `URLopener` is deprecated; import the submodule and use `urlretrieve`.
    import urllib.request
    urllib.request.urlretrieve(
        "http://files.grouplens.org/datasets/movielens/ml-100k.zip", ml_path)
    print("Data downloaded")

In [None]:
# Directory that is expected to hold the extracted archive contents.
unzip_path = os.path.join(data_dir, "ml-100k")

# Extract only when that directory does not exist yet.
if not os.path.exists(unzip_path):
    print("Extracting data")
    from zipfile import ZipFile
    with ZipFile(ml_path) as myzip:
        myzip.extractall(data_dir)
    print("Data extraction done")

# From here on data_dir refers to the extracted dataset directory.
data_dir = unzip_path

## Importing and preprocessing the data

In [None]:
# Names assigned to the columns of u.item: movie metadata first, then the
# genre columns.
column_names = [
    "movie_id", "title", "release_date", "video_release_date", "imdb_url", "unknown", "action", "adventure", "animation",
    "children", "comedy", "crime", "documentary", "drama", "fantasy", "film_noir", "horror", "musical", "mystery",
    "romance", "sci_fi", "thriller", "war", "western"]

# u.item is read as a pipe-separated, latin1-encoded table; movie_id becomes
# the index of the DataFrame.
df = pd.read_table(
    os.path.join(data_dir, "u.item"), sep="|",
    names=column_names, encoding="latin1", index_col='movie_id')
df.head()

We have two columns representing dates: release_date and video_release_date. Pandas can parse dates in multiple ways; we only need to list the date columns in the parse_dates parameter. Note that null values changed from NaN (not a number) to NaT (not a time).

In [None]:
# Re-read u.item, this time parsing the columns at positions 2 and 3
# (release_date and video_release_date in column_names) as dates.
df = pd.read_table(os.path.join(data_dir, "u.item"), sep="|",
                   names=column_names, encoding="latin1",
                   parse_dates=[2,3], index_col='movie_id')
df.head()

Our solution is still not perfect, because the release date appears in parentheses after the title of each movie. We need to get rid of it, because this information is redundant and generates noise in the DataFrame.

Regular expressions can be used to delete the date between the parentheses. Afterwards we delete the remaining whitespace as well (the strip function removes whitespace from both the beginning and the end of the string). Finally, we use the new stripped title instead of the old one.

In [None]:
# Remove the parenthesised part of every title and trim the surrounding
# whitespace. `regex=True` must be passed explicitly: since pandas 2.0
# `Series.str.replace` treats the pattern as a literal string by default, which
# would leave every title unchanged.
# Note: `.*` is greedy, so everything from the first "(" to the last ")" is
# removed when a title contains several parenthesised parts.
df.title = df.title.str.replace(r'\(.*\)', '', regex=True).str.strip()

The video_release_date column contains no valid value in the first rows. We need to check whether it contains any valid information before using it. We can list the rows where the video_release_date value is not NaT (i.e. where the value is valid).

In [None]:
# List the rows whose video_release_date is not null.
df[df.video_release_date.notnull()]

In [None]:
# Remove the video_release_date column, then preview the first rows.
df = df.drop('video_release_date', axis=1)
df.head()

In [None]:
# Remove the 'unknown' column.
df = df.drop('unknown', axis=1)

# Store the year component of release_date in a new 'year' column.
df['year'] = df.release_date.dt.year

# 1. Simple queries

## 1.1. How many action movies were released before 1985? How many were released in 1985 or after 1985?

In [None]:
def count_movies_before_1985(df):
    """Return the number of action movies released before 1985 (task 1.1).

    Exercise stub: `df` is the movies DataFrame. The accompanying check
    requires the result to be a plain ``int``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
def count_movies_after_1984(df):
    """Return the number of action movies released in or after 1985 (task 1.1).

    Exercise stub: `df` is the movies DataFrame. The accompanying check
    requires the result to be a plain ``int``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Check for task 1.1: both counters must return a plain int.
before = count_movies_before_1985(df)
print(before)
assert type(before) == int

after = count_movies_after_1984(df)
print(after)
assert type(after) == int

## 1.2. Is there a thriller movie for children? Search for an example and retrieve the title of the movie.

In [None]:
def child_thriller(df):
    """Return the title of one movie that is both a children's movie and a thriller (task 1.2).

    Exercise stub: `df` is the movies DataFrame. The accompanying check
    requires the result to be a ``str``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Check for task 1.2: the returned title must be a str.
title = child_thriller(df)
assert type(title) == str

## 1.3. How many movies have a title longer than 30 characters?

In [None]:
def long_titles(df):
    """Return the number of movies whose title is longer than 30 characters (task 1.3).

    Exercise stub: `df` is the movies DataFrame. The accompanying check
    requires the result to be a plain ``int``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Check for task 1.3: the count must be a plain int.
title_cnt = long_titles(df)
assert type(title_cnt) == int

# Task 2: grouping and visualization

## 2.1. Group the comedies by year. Visualize how many comedies were released annually in a bar chart.

In [None]:
def comedy_by_year(df):
    """Group the comedies by release year (task 2.1).

    Exercise stub: `df` is the movies DataFrame. The accompanying check
    requires the result to be a pandas ``DataFrameGroupBy`` object.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Check for task 2.1: the result must be a DataFrameGroupBy.
c = comedy_by_year(df)
assert type(c) == pd.core.groupby.DataFrameGroupBy

## 2.2. Visualize the distribution of the release days (days of the month) in a pie chart! 

In [None]:
def groupby_release_day(df):
    """Group the movies by the day of the month of their release (task 2.2).

    Exercise stub: `df` is the movies DataFrame. The accompanying check
    requires a pandas ``DataFrameGroupBy`` with more than 7 and fewer than 32
    groups, i.e. grouping by day of month rather than by weekday.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Check for task 2.2: the result must be a DataFrameGroupBy.
by_day = groupby_release_day(df)
assert type(by_day) == pd.core.groupby.DataFrameGroupBy

# a month has at most 31 days
assert len(by_day) < 32

# don't group by the weekdays
assert len(by_day) > 7

## 2.3. We want to make a traditional lexicon from the movies. Count the starting letters in the titles of the movies. Visualize it in a pie chart.

### Group the movies by the starting letters.

In [None]:
def groupby_initial_letter(df):
    """Group the movies by the first letter of their title (task 2.3).

    Exercise stub: `df` is the movies DataFrame. The accompanying check
    requires the result to be a pandas ``DataFrameGroupBy`` object.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Check for task 2.3: the result must be a DataFrameGroupBy.
initial = groupby_initial_letter(df)

assert type(initial) == pd.core.groupby.DataFrameGroupBy

# Handling multiple dataframes, e.g. merge

In [None]:
# Read u.data into `ratings`, assigning these names to its columns.
cols = ['user', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(os.path.join(data_dir, "u.data"), names=cols)

In [None]:
# Convert the timestamp column to datetimes, interpreting the values as seconds.
ratings['timestamp'] = pd.to_datetime(ratings.timestamp, unit='s')
ratings.head()

In [None]:
# Second name for the same movies DataFrame object (no copy is made).
movies = df

In [None]:
# Attach the movie columns to the ratings: ratings.movie_id is matched against
# the index of `movies`.
ratings = pd.merge(ratings, movies, left_on='movie_id', right_index=True)
ratings.head()

# Task 3: merge

## Q3.1. How many movies got a rating greater than 4 at least once?

WARNING! The title of a movie is not necessarily unique!

In [None]:
def count_greater_than_4(ratings):
    """Return how many movies received a rating greater than 4 at least once (Q3.1).

    Exercise stub: `ratings` is the ratings DataFrame merged with the movie
    columns. Movie titles are not unique, so counting distinct titles gives a
    wrong answer. The accompanying check requires a plain ``int``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Check for Q3.1: the count must be a plain int and must not equal 1160.
greater = count_greater_than_4(ratings)

assert type(greater) == int
assert greater != 1160  # titles are NOT UNIQUE

## Q3.2. All of the ratings contain a timestamp. What is the average of the ratings by the weekdays?
What are the days when the people are kinder?

Hint: look at the `dt` namespace (dt.weekday).

In [None]:
def rating_mean_by_weekday(ratings):
    """Return the average rating for each weekday of the rating timestamp (Q3.2).

    Exercise stub: `ratings` is the merged ratings DataFrame. The accompanying
    check requires a ``pd.Series`` (a single column), not a DataFrame.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Check for Q3.2: the result must be a Series.
weekday_mean = rating_mean_by_weekday(ratings)

assert type(weekday_mean) == pd.Series
assert type(weekday_mean) != pd.DataFrame  # only one column is needed

# ===================== PASSING LEVEL ======================

## \*2. Write a function that groups by multiple columns and returns the biggest group.

Hint: the `GroupBy` object's `get_group` function returns a group.

In [None]:
def get_largest_group(df, groupby_columns):
    """Group `df` by `groupby_columns` and return the largest group.

    Exercise stub: `groupby_columns` is a list of column names. The
    accompanying check requires the result to be a ``pd.DataFrame``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Grouping by a single column: the largest group must be a DataFrame with 957 rows.
genres = ["drama"]
drama_largest = get_largest_group(df, genres)

assert type(drama_largest) == pd.DataFrame
assert len(drama_largest) == 957

# Grouping by two columns.
genres = ["drama", "comedy"]
both_largest = get_largest_group(df, genres)

# within the returned group, each of the two columns holds a single distinct value
assert both_largest[["comedy", "drama"]].nunique().loc["comedy"] == 1
assert both_largest[["comedy", "drama"]].nunique().loc["drama"] == 1
print(both_largest.shape)

# Task 4: Users DataFrame

## 4.1 Read in the u.user file into a DataFrame named `users`.

The columns: `user_id, age, gender, occupation, zip`. The column `user_id` should be the index of the `DataFrame`.

In [None]:
# Task 4.1 placeholder: read u.user into `users` with the columns
# user_id, age, gender, occupation, zip, using user_id as the index.
# users = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Check for task 4.1: `users` must be a DataFrame.
assert type(users) == pd.DataFrame

# user_id starts from 1
assert 0 not in users.index

In [None]:
# Placeholder: merge the user columns into `ratings`; the check that follows
# expects the merged frame to have shape (100000, 30).
# ratings = ratings.merge...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Check the merged ratings frame: still a DataFrame, with 100000 rows and 30 columns.
assert type(ratings) == pd.DataFrame
assert ratings.shape == (100000, 30)

## 4.2. What is the count of the ratings given by people with the occupations programmer and marketing? Visualize it by the hours of a day (24h).

Hint:

1. use the `timestamp` field from the rating table,
1. you can use two cells for the solution,
1. think about how many slices the pie chart will have.

Implement a function that returns a given occupation's ratings by hour.

In [None]:
def occupation_cnt_by_hour(ratings, occupation):
    """Return the number of ratings per hour of the day for one occupation (task 4.2).

    Exercise stub: `ratings` is the ratings DataFrame merged with the user
    columns; `occupation` is an occupation name such as "marketing". The
    accompanying check requires a ``pd.Series`` with fewer than 25 entries.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Check for task 4.2: the result must be a Series.
marketing = occupation_cnt_by_hour(ratings, "marketing")
assert type(marketing) == pd.Series

# a day has 24 hours, so there can be at most 24 hourly entries
assert len(marketing) < 25

###  Visualize the marketers and the programmers rating hours.

First the marketers:

In [None]:
# Placeholder: visualize the marketers' rating counts by hour (`marketing`).
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Rating counts by hour for the "programmer" occupation.
programmer = occupation_cnt_by_hour(ratings, "programmer")

In [None]:
# Placeholder: visualize the programmers' rating counts by hour (`programmer`).
# YOUR CODE HERE
raise NotImplementedError()

# ===================== EXTRA LEVEL ======================