## Imports

In [2]:
import pandas as pd

## Functions

In [2]:
def tsv_to_dataframe(path, filter_column='', filter_value=''):
    """Read a tab-separated file in chunks and return it as one DataFrame.

    The file is parsed 10,000 rows at a time. When a filter is supplied,
    each chunk is reduced to its matching rows before it is kept, so rows
    that do not match are discarded chunk by chunk rather than after the
    whole file has been loaded.

    Args:
        path: File path or buffer, passed straight to ``pandas.read_csv``.
        filter_column: Name of the column to filter on. An empty value
            disables filtering.
        filter_value: Value a row must equal in ``filter_column`` to be
            kept. ``''`` or ``None`` disables filtering. Any scalar is
            accepted (e.g. an int for a numeric column), not only strings.

    Returns:
        A DataFrame holding the (optionally filtered) rows, re-indexed
        from 0.

    Raises:
        KeyError: If filtering is requested on a column the file lacks.
    """
    # Filtering is active only when both a column and a value are supplied.
    # Comparing against '' / None instead of calling len() keeps non-string
    # values (ints, floats) usable as filter values.
    apply_filter = (
        bool(filter_column)
        and filter_value is not None
        and filter_value != ''
    )

    reader = pd.read_csv(filepath_or_buffer=path, sep='\t', chunksize=10000)
    chunks = []
    try:
        for chunk in reader:
            if apply_filter:
                chunk = chunk[chunk[filter_column] == filter_value]
            chunks.append(chunk)
    finally:
        # Release the reader even if filtering fails part-way through.
        reader.close()
    return pd.concat(chunks, ignore_index=True)

## Unpacking the `*.tsv` files into dataframes

### Filepaths

In [3]:
# Locations of the input *.tsv files, relative to the working directory.
# Forward slashes are used because they resolve on Windows as well as on
# Linux/macOS, whereas backslash separators only work on Windows.
title_path = 'data/title.basics.tsv'
crew_path = 'data/title.crew.tsv'
ratings_path = 'data/title.ratings.tsv'
principals_path = 'data/title.principals.tsv'
names_path = 'data/name.basics.tsv'
alternate_path = 'data/title.akas.tsv'

### Merging the `*.tsv` files into a single dataframe

In [4]:
# Load the titles whose titleType equals 'movie', plus the full crew table.
title = tsv_to_dataframe(title_path, 'titleType', 'movie')
crew = tsv_to_dataframe(crew_path)
# Inner join on tconst: only titles present in both tables are kept.
dataset = title.merge(crew, how='inner', on='tconst')
# Drop the references to the intermediate frames once they are merged.
del title
del crew

In [5]:
# Attach the ratings columns; the inner join on tconst keeps only rows
# that have a matching entry in the ratings table.
ratings = tsv_to_dataframe(ratings_path)
dataset = dataset.merge(ratings, how='inner', on='tconst')
del ratings

In [6]:
# Attach the principals columns via an inner join on tconst.
principals = tsv_to_dataframe(principals_path)
dataset = dataset.merge(principals, how='inner', on='tconst')
del principals

In [7]:
# Attach the name columns; this join is keyed on nconst rather than tconst.
names = tsv_to_dataframe(names_path)
dataset = dataset.merge(names, on='nconst', how='inner')
del names

## Unpacking comma-separated strings into columns

In [None]:
def _split_into_columns(frame, source, targets):
    """Split the comma-separated strings in ``frame[source]`` into ``targets``.

    The n-th comma-separated value of each row is written to the n-th
    column named in ``targets``. Rows with fewer values than ``targets``
    get missing values in the remaining columns.

    Args:
        frame: DataFrame to modify in place.
        source: Name of the column holding the comma-separated strings.
        targets: Names of the columns that receive the split values.

    Raises:
        ValueError: If any row holds more values than there are targets.
    """
    parts = frame[source].str.split(',', expand=True)
    if parts.shape[1] > len(targets):
        raise ValueError(
            "'{}' holds up to {} comma-separated values, but only {} target "
            "columns were given".format(source, parts.shape[1], len(targets))
        )
    # The split produces only as many columns as the longest row needs.
    # Pad to the expected width so the assignment cannot fail when the
    # data happens to contain fewer values (or no rows at all).
    frame[targets] = parts.reindex(columns=range(len(targets)))


_split_into_columns(
    dataset,
    'knownForTitles',
    ['known_for_movie_1', 'known_for_movie_2', 'known_for_movie_3', 'known_for_movie_4'],
)
_split_into_columns(dataset, 'genres', ['genre_1', 'genre_2', 'genre_3'])

In [7]:
# Remove, one at a time and in place, the two columns that were unpacked
# into separate columns and the endYear column.
for obsolete in ('knownForTitles', 'endYear', 'genres'):
    dataset.drop(columns=obsolete, inplace=True)

## Writing to a CSV file for later use

In [8]:
# Persist the merged frame as a semicolon-delimited text file.
dataset.to_csv(path_or_buf='interrim_dataset.csv', sep=';')