# Transform Load

Combine and preprocess data from multiple sources and store it in Parquet format.

In [1]:
# Libraries used
import ast

import pandas as pd

# t_movies

In [2]:
# Read the staged movies CSV; header=None makes pandas treat the first row as data, not column names
movies = pd.read_csv('staging/movies.csv', header=None)

In [3]:
# Set column and index names
movies.columns = ['title', 'rating', 'popularity']
# Number the rows 1..N from the actual row count instead of a hard-coded 250,
# so the cell keeps working when the staged file grows or shrinks
movies.index = range(1, len(movies) + 1)
movies.index.name = 'movie_id'

# Set column types for 'title' and 'rating'
movies['title'] = movies['title'].astype('string')
movies['rating'] = movies['rating'].astype('float')

# Get rid of the thousands separators ',' in the 'popularity' data, and set column type to int.
# astype(str) comes first because pandas already parses the column as integers
# when no value contains a comma, and the .str accessor would fail on that.
movies['popularity'] = (
    movies['popularity']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('int64')
)

In [4]:
print(movies)
print('\nSCHEMA of movies\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line
movies.info()

                             title  rating  popularity
movie_id                                              
1         The Shawshank Redemption     9.3          71
2                    The Godfather     9.2          68
3                  The Dark Knight     9.0         107
4            The Godfather Part II     9.0         245
5                     12 Angry Men     9.0         276
...                            ...     ...         ...
246                       The Help     8.1          97
247                    Dersu Uzala     8.2          97
248                        Aladdin     8.0        1046
249             Dances with Wolves     8.0         924
250                         Gandhi     8.0        1809

[250 rows x 3 columns]

SCHEMA of movies

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 1 to 250
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       250 non-null    string 
 1   rating    

**Store Parquet**

In [5]:
# Write the movies table to Parquet; index=True keeps the 'movie_id' index in the file
movies.to_parquet('master/t_movies.parquet', index=True)

# t_genres

In [6]:
# Read the staged genres CSV; header=None makes pandas treat the first row as data, not column names
genres = pd.read_csv('staging/genres.csv', header=None)

In [7]:
# Create table and set column name for genres.
# Each cell of the first CSV column is expected to hold a stringified Python list
# such as "['Crime', 'Drama']". ast.literal_eval parses it into a real list and
# understands the quoting, so elements that were written with double quotes
# (because they contain an apostrophe) are split correctly as well.
genres = pd.DataFrame({'genre': genres.iloc[:, 0].map(ast.literal_eval)})

# Set index: number the rows 1..N from the actual row count instead of a hard-coded 250
genres.index = range(1, len(genres) + 1)
genres.index.name = 'movie_id'

# Unpack values from the column 'genre' (one row per movie/genre pair)
genres = genres.explode('genre')

# Set 'genre' column type to string
genres['genre'] = genres['genre'].astype('string')

In [8]:
print(genres)
print('\nSCHEMA of genres\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line
genres.info()

              genre
movie_id           
1             Drama
2             Crime
2             Drama
3            Action
3             Crime
...             ...
249           Drama
249         Western
250       Biography
250           Drama
250         History

[626 rows x 1 columns]

SCHEMA of genres

<class 'pandas.core.frame.DataFrame'>
Int64Index: 626 entries, 1 to 250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   genre   626 non-null    string
dtypes: string(1)
memory usage: 9.8 KB
None


**Store Parquet**

In [9]:
# Write the genres table to Parquet; index=True keeps the 'movie_id' index in the file
genres.to_parquet('master/t_genres.parquet', index=True)

# t_actors

In [10]:
# Read the staged actors CSV; header=None makes pandas treat the first row as data, not column names
actors = pd.read_csv('staging/actors.csv', header=None)

In [11]:
# Create table and set column name for actors.
# Each cell of the first CSV column is expected to hold a stringified Python list
# such as "['Tim Robbins', 'Morgan Freeman']". ast.literal_eval parses it into a
# real list and understands the quoting, so names that were written with double
# quotes (because they contain an apostrophe, e.g. "Peter O'Toole") are split
# correctly as well.
actors = pd.DataFrame({'actor': actors.iloc[:, 0].map(ast.literal_eval)})

# Set index: number the rows 1..N from the actual row count instead of a hard-coded 250
actors.index = range(1, len(actors) + 1)
actors.index.name = 'movie_id'

# Unpack values from the column 'actor' (one row per movie/actor pair)
actors = actors.explode('actor')

# Set column type to string
actors = actors.astype('string')

In [12]:
print(actors)
print('\nSCHEMA of actors\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line
actors.info()

                    actor
movie_id                 
1             Tim Robbins
1          Morgan Freeman
1              Bob Gunton
1          William Sadler
1            Clancy Brown
...                   ...
250       Geraldine James
250       Alyque Padamsee
250           Amrish Puri
250            Ian Bannen
250        Michael Bryant

[4360 rows x 1 columns]

SCHEMA of actors

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4360 entries, 1 to 250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   actor   4360 non-null   string
dtypes: string(1)
memory usage: 68.1 KB
None


**Store Parquet**

In [13]:
# Write the actors table to Parquet; index=True keeps the 'movie_id' index in the file
actors.to_parquet('master/t_actors.parquet', index=True)

# t_writers

In [14]:
# Read the staged writers CSV; header=None makes pandas treat the first row as data, not column names
writers = pd.read_csv('staging/writers.csv', header=None)

In [15]:
# Create table and set column name for writers.
# Each cell of the first CSV column is expected to hold a stringified Python list
# such as "['Stephen King', 'Frank Darabont']". ast.literal_eval parses it into a
# real list and understands the quoting, so names that were written with double
# quotes (because they contain an apostrophe) are split correctly as well.
writers = pd.DataFrame({'writer': writers.iloc[:, 0].map(ast.literal_eval)})

# Set index: number the rows 1..N from the actual row count instead of a hard-coded 250
writers.index = range(1, len(writers) + 1)
writers.index.name = 'movie_id'

# Unpack values from the column 'writer' (one row per movie/writer pair)
writers = writers.explode('writer')

# Set column type to string
writers = writers.astype('string')

In [16]:
print(writers)
print('\nSCHEMA of writers\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line
writers.info()

                        writer
movie_id                      
1                 Stephen King
1               Frank Darabont
2                   Mario Puzo
2         Francis Ford Coppola
3                   Mario Puzo
...                        ...
248             Akira Kurosawa
248              Yuriy Nagibin
248           Vladimir Arsenev
249              Michael Blake
250                John Briley

[486 rows x 1 columns]

SCHEMA of writers

<class 'pandas.core.frame.DataFrame'>
Int64Index: 486 entries, 1 to 250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   writer  486 non-null    string
dtypes: string(1)
memory usage: 7.6 KB
None


**Store Parquet**

In [17]:
# Write the writers table to Parquet; index=True keeps the 'movie_id' index in the file
writers.to_parquet('master/t_writers.parquet', index=True)

# t_directors

In [18]:
# Read the staged directors CSV; header=None makes pandas treat the first row as data, not column names
directors = pd.read_csv('staging/directors.csv', header=None)

In [19]:
# Create table and set column name for directors.
# Each cell of the first CSV column is expected to hold a stringified Python list
# such as "['Ron Clements', 'John Musker']". ast.literal_eval parses it into a
# real list and understands the quoting, so names that were written with double
# quotes (because they contain an apostrophe) are split correctly as well.
directors = pd.DataFrame({'director': directors.iloc[:, 0].map(ast.literal_eval)})

# Set index: number the rows 1..N from the actual row count instead of a hard-coded 250
directors.index = range(1, len(directors) + 1)
directors.index.name = 'movie_id'

# Unpack values from the column 'director' (one row per movie/director pair)
directors = directors.explode('director')

# Set column type to string
directors = directors.astype('string')

In [20]:
print(directors)
print('\nSCHEMA of directors\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line
directors.info()

                      director
movie_id                      
1               Frank Darabont
2         Francis Ford Coppola
3            Christopher Nolan
4         Francis Ford Coppola
5                 Sidney Lumet
...                        ...
247             Akira Kurosawa
248               Ron Clements
248                John Musker
249              Kevin Costner
250       Richard Attenborough

[277 rows x 1 columns]

SCHEMA of directors

<class 'pandas.core.frame.DataFrame'>
Int64Index: 277 entries, 1 to 250
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   director  277 non-null    string
dtypes: string(1)
memory usage: 4.3 KB
None


**Store Parquet**

In [21]:
# Write the directors table to Parquet; index=True keeps the 'movie_id' index in the file
directors.to_parquet('master/t_directors.parquet', index=True)

# t_reviews

In [22]:
# Read the staged reviews CSV; header=None makes pandas treat the first row as data, not column names
reviews = pd.read_csv('staging/reviews.csv', header=None)

In [23]:
# Trim the square bracket and quote at the beginning and end of each review
# (the first two and last two characters of every cell in the first CSV column)
reviews = pd.DataFrame({'review': [text[2:-2] for text in reviews.iloc[:, 0]]})

# Set index: number the rows 1..N from the actual row count instead of a hard-coded 250
reviews.index = range(1, len(reviews) + 1)
reviews.index.name = 'movie_id'

# Set column type to string
reviews = reviews.astype('string')

In [24]:
print(reviews)
print('\nSCHEMA of reviews\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line
reviews.info()

                                                     review
movie_id                                                   
1         I've lost count of the number of times I have ...
2         This isn't just a beautifully crafted gangster...
3         Best movie ever. Heath ledger's work is phenom...
4         Francis Coppola and Mario Puzo continue their ...
5         '12 Angry Men' is an outstanding film. It is p...
...                                                     ...
246       I just returned from seeing a special preview ...
247       For a variety of reasons (that are well known ...
248       This movie has three elements that are importa...
249       `Dances With Wolves'When I first saw the movie...
250       As soon as I finished watching Gandhi, I thoug...

[250 rows x 1 columns]

SCHEMA of reviews

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 1 to 250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 


**Store Parquet**

In [25]:
# Write the reviews table to Parquet; index=True keeps the 'movie_id' index in the file
reviews.to_parquet('master/t_reviews.parquet', index=True)