In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.random.seed(69)
%matplotlib inline

In [2]:
movie_df = pd.read_csv('data/movies.txt', delimiter='\t', header=None,
                       names=["Movie Id", "Movie Title", "Unknown", "Action", 
                              "Adventure", "Animation", "Childrens", "Comedy", 
                              "Crime", "Documentary", "Drama", "Fantasy", 
                              "Film-Noir", "Horror", "Musical", "Mystery", 
                              "Romance", "Sci-Fi", "Thriller", "War", 
                              "Western"])

data_df = pd.read_csv('data/data.txt', delimiter='\t', header=None,
                      names=["User Id", "Movie Id", "Rating"])

### Clean Data

In [3]:
# Get rid of duplicate movies
for title, sub_df in movie_df.groupby(['Movie Title']):
    if (len(sub_df) > 1):
        # Get of equivalent ids along with representative (lowest id)
        eq_ids, rep = set(sub_df["Movie Id"]), sub_df["Movie Id"].values[0]
        
        data_df.loc[data_df["Movie Id"].isin(eq_ids), "Movie Id"] = rep

cols = list(movie_df.columns); cols.remove("Movie Id")
movie_df = movie_df.drop_duplicates(subset = cols, keep='first')

In [4]:
# Make train and test dfs
n_train = 9 * data_df.shape[0] // 10
inds = np.random.permutation(data_df.shape[0])
train_df = data_df.loc[:n_train]
test_df = data_df.loc[n_train:]

# Dump movies not in train
reviewed = set(movie_df["Movie Id"]).intersection(set(train_df["Movie Id"]))
movie_df = movie_df.loc[movie_df["Movie Id"].isin(reviewed), :]
data_df = data_df.loc[data_df["Movie Id"].isin(reviewed), :]
test_df = test_df.loc[test_df["Movie Id"].isin(reviewed), :]

### Reset Ids

In [5]:
movie_df = movie_df.reset_index(drop=True)
# Use one-indexing
to_replace = {Id: (index + 1) for Id, index in zip(movie_df["Movie Id"], movie_df.index)}
movie_df["Movie Id"] = movie_df["Movie Id"].map(to_replace)

data_df["Movie Id"] = data_df["Movie Id"].map(to_replace)
train_df["Movie Id"] = train_df["Movie Id"].map(to_replace)
test_df["Movie Id"] = test_df["Movie Id"].map(to_replace)

# Make sure it worked?
assert(len(set(movie_df["Movie Title"])) == len(movie_df))
assert(len(set(movie_df["Movie Id"])) == len(set(train_df["Movie Id"])))

movie_df.tail(3) 

Unnamed: 0,Movie Id,Movie Title,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1643,1644,Sweet Nothing (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1644,1645,Mat' i syn (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1645,1646,B. Monkey (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


### Dump cleaned csvs

In [6]:
train_df.to_csv('data/train_c.txt', sep='\t', header=False, index=False)
test_df.to_csv('data/test_c.txt', sep='\t', header=False, index=False)
data_df.to_csv('data/data_c.txt', sep='\t', header=False, index=False)
movie_df.to_csv('data/movies_c.txt', sep='\t', header=False, index=False)