In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Seed NumPy's global RNG so np.random calls further down (the row
# permutation drawn before the train/test dump) give the same result
# on every Restart & Run All.
np.random.seed(69)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Column names for the header-less, tab-separated movie file:
# id, title, then the genre columns.
movie_columns = [
    "Movie Id", "Movie Title",
    "Unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
movie_df = pd.read_csv('data/movies.txt', sep='\t', header=None,
                       names=movie_columns)

# Column names for the header-less, tab-separated ratings file.
rating_columns = ["User Id", "Movie Id", "Rating"]
data_df = pd.read_csv('data/data.txt', sep='\t', header=None,
                      names=rating_columns)

### Clean Data

In [3]:
# Collapse movies listed more than once under the same title: every rating
# that points at any of the duplicate ids is redirected to a single
# representative id (the first one listed for that title).
for _title, same_title in movie_df.groupby(['Movie Title']):
    if len(same_title) <= 1:
        continue  # title is unique, nothing to merge

    dup_ids = same_title["Movie Id"]
    is_dup_rating = data_df["Movie Id"].isin(set(dup_ids))
    data_df.loc[is_dup_rating, "Movie Id"] = dup_ids.values[0]

# Drop movie rows that match an earlier row in every column except the id;
# keep='first' retains the same row whose id was used as representative above.
cols = [c for c in movie_df.columns if c != "Movie Id"]
movie_df = movie_df.drop_duplicates(subset=cols, keep='first')

# Discard movies that have no ratings at all
reviewed = set(movie_df["Movie Id"]) & set(data_df["Movie Id"])
movie_df = movie_df.loc[movie_df["Movie Id"].isin(reviewed), :]

# Sanity checks: titles are now unique, and the movie table and the ratings
# table reference exactly the same set of ids.
assert len(movie_df) == len(set(movie_df["Movie Title"]))
assert set(data_df["Movie Id"]) == set(movie_df["Movie Id"])

### Reset Ids

In [4]:
# Renumber the surviving movies so that each Movie Id equals its row position,
# and apply the same old-id -> new-id mapping to the ratings.
movie_df = movie_df.reset_index(drop=True)
to_replace = dict(zip(movie_df["Movie Id"], movie_df.index))
for frame in (movie_df, data_df):
    frame["Movie Id"] = frame["Movie Id"].map(to_replace)
movie_df.tail(3)  # the index and the Movie Id column should match

Unnamed: 0,Movie Id,Movie Title,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1661,1661,B. Monkey (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1662,1662,You So Crazy (1994),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1663,1663,Scream of Stone (Schrei aus Stein) (1991),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Dump cleaned CSVs and train/test split

In [5]:
# 90% of the ratings go to the training file, the remaining 10% to the test file.
n_train = 9 * data_df.shape[0] // 10
# Random row order; reproducible because the NumPy seed is fixed in the first cell.
inds = np.random.permutation(data_df.shape[0])

# Select rows by POSITION through the shuffled indices. The previous
# label-based slices (.loc[:n_train] / .loc[n_train:]) are inclusive at both
# ends, so row n_train was written to both files, and the permutation above
# was never applied, so the split was not shuffled.
train_df = data_df.iloc[inds[:n_train]]
train_df.to_csv('data/train_c.txt', sep='\t', header=False, index=False)
test_df = data_df.iloc[inds[n_train:]]
test_df.to_csv('data/test_c.txt', sep='\t', header=False, index=False)

# The two splits must partition the ratings: no row lost, none duplicated.
assert len(train_df) == n_train
assert len(train_df) + len(test_df) == len(data_df)

# Full cleaned ratings and movie tables (tab-separated, no header, no index).
data_df.to_csv('data/data_c.txt', sep='\t', header=False, index=False)
movie_df.to_csv('data/movies_c.txt', sep='\t', header=False, index=False)